]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #23339 from poettering/sockaddr-size-limit
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "barrier.h"
28 #include "base-filesystem.h"
29 #include "blkid-util.h"
30 #include "btrfs-util.h"
31 #include "bus-error.h"
32 #include "bus-util.h"
33 #include "cap-list.h"
34 #include "capability-util.h"
35 #include "cgroup-util.h"
36 #include "chase-symlinks.h"
37 #include "copy.h"
38 #include "cpu-set-util.h"
39 #include "creds-util.h"
40 #include "dev-setup.h"
41 #include "discover-image.h"
42 #include "dissect-image.h"
43 #include "env-util.h"
44 #include "escape.h"
45 #include "fd-util.h"
46 #include "fdset.h"
47 #include "fileio.h"
48 #include "format-util.h"
49 #include "fs-util.h"
50 #include "gpt.h"
51 #include "hexdecoct.h"
52 #include "hostname-setup.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "io-util.h"
56 #include "log.h"
57 #include "loop-util.h"
58 #include "loopback-setup.h"
59 #include "macro.h"
60 #include "main-func.h"
61 #include "missing_sched.h"
62 #include "mkdir.h"
63 #include "mount-util.h"
64 #include "mountpoint-util.h"
65 #include "namespace-util.h"
66 #include "netlink-util.h"
67 #include "nspawn-bind-user.h"
68 #include "nspawn-cgroup.h"
69 #include "nspawn-creds.h"
70 #include "nspawn-def.h"
71 #include "nspawn-expose-ports.h"
72 #include "nspawn-mount.h"
73 #include "nspawn-network.h"
74 #include "nspawn-oci.h"
75 #include "nspawn-patch-uid.h"
76 #include "nspawn-register.h"
77 #include "nspawn-seccomp.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "nspawn-util.h"
82 #include "nspawn.h"
83 #include "nulstr-util.h"
84 #include "os-util.h"
85 #include "pager.h"
86 #include "parse-argument.h"
87 #include "parse-util.h"
88 #include "pretty-print.h"
89 #include "process-util.h"
90 #include "ptyfwd.h"
91 #include "random-util.h"
92 #include "raw-clone.h"
93 #include "resolve-util.h"
94 #include "rlimit-util.h"
95 #include "rm-rf.h"
96 #if HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "socket-util.h"
102 #include "stat-util.h"
103 #include "stdio-util.h"
104 #include "string-table.h"
105 #include "string-util.h"
106 #include "strv.h"
107 #include "sysctl-util.h"
108 #include "terminal-util.h"
109 #include "tmpfile-util.h"
110 #include "umask-util.h"
111 #include "unit-name.h"
112 #include "user-util.h"
113 #include "util.h"
114
115 /* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
116 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
117
118 #define EXIT_FORCE_RESTART 133
119
/* How a container run ended.
 * NOTE(review): the meaning is taken from the enumerator names only; the code consuming this type is
 * outside this excerpt — confirm there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
124
/* File-scope option state. The initializers below are the defaults; parse_environment() and parse_argv()
 * in this file overwrite them. Whenever one of those parsers configures a setting explicitly it also ORs
 * the matching SETTING_* bit into arg_settings_mask. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;     /* The name used by the host to refer to this */
static char *arg_hostname = NULL;    /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Bit mask with one bit per capability number (1ULL << CAP_xyz).
 * NOTE(review): presumably the "default" set that --capability= and --drop-capability= in help() refer
 * to — confirm against the option handling. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
/* 0x10000 == 65536 UIDs/GIDs in the default range */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
/* May be replaced by the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE, see parse_environment(). The pointer
 * then refers to the environment block, hence it is const and never freed. */
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
/* Individual bits are cleared or set again by parse_share_ns_env(). */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by parse_mount_settings_env(). */
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false;
static char *arg_settings_filename = NULL;
234
/* Pair each resource-owning option variable with the helper that releases it.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * helpers run when the program exits — confirm in its header. Variables that do not own memory (plain
 * integers, booleans, the const char* settings) are intentionally not listed. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* The seccomp context only exists when built with seccomp support, mirroring its declaration. */
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
268
269 static int handle_arg_console(const char *arg) {
270 if (streq(arg, "help")) {
271 puts("autopipe\n"
272 "interactive\n"
273 "passive\n"
274 "pipe\n"
275 "read-only");
276 return 0;
277 }
278
279 if (streq(arg, "interactive"))
280 arg_console_mode = CONSOLE_INTERACTIVE;
281 else if (streq(arg, "read-only"))
282 arg_console_mode = CONSOLE_READ_ONLY;
283 else if (streq(arg, "passive"))
284 arg_console_mode = CONSOLE_PASSIVE;
285 else if (streq(arg, "pipe")) {
286 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
287 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
288 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
289 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
290 "Proceeding anyway.");
291
292 arg_console_mode = CONSOLE_PIPE;
293 } else if (streq(arg, "autopipe")) {
294 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
295 arg_console_mode = CONSOLE_INTERACTIVE;
296 else
297 arg_console_mode = CONSOLE_PIPE;
298 } else
299 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
300
301 arg_settings_mask |= SETTING_CONSOLE_MODE;
302 return 1;
303 }
304
305 static int help(void) {
306 _cleanup_free_ char *link = NULL;
307 int r;
308
309 pager_open(arg_pager_flags);
310
311 r = terminal_urlify_man("systemd-nspawn", "1", &link);
312 if (r < 0)
313 return log_oom();
314
315 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
316 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
317 " -h --help Show this help\n"
318 " --version Print version string\n"
319 " -q --quiet Do not show status information\n"
320 " --no-pager Do not pipe output into a pager\n"
321 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
322 "%3$sImage:%4$s\n"
323 " -D --directory=PATH Root directory for the container\n"
324 " --template=PATH Initialize root directory from template directory,\n"
325 " if missing\n"
326 " -x --ephemeral Run container with snapshot of root directory, and\n"
327 " remove it after exit\n"
328 " -i --image=PATH Root file system disk image (or device node) for\n"
329 " the container\n"
330 " --oci-bundle=PATH OCI bundle directory\n"
331 " --read-only Mount the root directory read-only\n"
332 " --volatile[=MODE] Run the system in volatile mode\n"
333 " --root-hash=HASH Specify verity root hash for root disk image\n"
334 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
335 " as a DER encoded PKCS7, either as a path to a file\n"
336 " or as an ASCII base64 encoded string prefixed by\n"
337 " 'base64:'\n"
338 " --verity-data=PATH Specify hash device for verity\n"
339 " --pivot-root=PATH[:PATH]\n"
340 " Pivot root to given directory in the container\n\n"
341 "%3$sExecution:%4$s\n"
342 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
343 " -b --boot Boot up full system (i.e. invoke init)\n"
344 " --chdir=PATH Set working directory in the container\n"
345 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
346 " -u --user=USER Run the command under specified user or UID\n"
347 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
348 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
349 " --suppress-sync=BOOLEAN\n"
350 " Suppress any form of disk data synchronization\n\n"
351 "%3$sSystem Identity:%4$s\n"
352 " -M --machine=NAME Set the machine name for the container\n"
353 " --hostname=NAME Override the hostname for the container\n"
354 " --uuid=UUID Set a specific machine UUID for the container\n\n"
355 "%3$sProperties:%4$s\n"
356 " -S --slice=SLICE Place the container in the specified slice\n"
357 " --property=NAME=VALUE Set scope unit property\n"
358 " --register=BOOLEAN Register container as machine\n"
359 " --keep-unit Do not register a scope for the machine, reuse\n"
360 " the service unit nspawn is running in\n\n"
361 "%3$sUser Namespacing:%4$s\n"
362 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
363 " --private-users[=UIDBASE[:NUIDS]]\n"
364 " Similar, but with user configured UID/GID range\n"
365 " --private-users-ownership=MODE\n"
366 " Adjust ('chown') or map ('map') OS tree ownership\n"
367 " to private UID/GID range\n\n"
368 "%3$sNetworking:%4$s\n"
369 " --private-network Disable network in container\n"
370 " --network-interface=INTERFACE\n"
371 " Assign an existing network interface to the\n"
372 " container\n"
373 " --network-macvlan=INTERFACE\n"
374 " Create a macvlan network interface based on an\n"
375 " existing network interface to the container\n"
376 " --network-ipvlan=INTERFACE\n"
377 " Create an ipvlan network interface based on an\n"
378 " existing network interface to the container\n"
379 " -n --network-veth Add a virtual Ethernet connection between host\n"
380 " and container\n"
381 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
382 " Add an additional virtual Ethernet link between\n"
383 " host and container\n"
384 " --network-bridge=INTERFACE\n"
385 " Add a virtual Ethernet connection to the container\n"
386 " and attach it to an existing bridge on the host\n"
387 " --network-zone=NAME Similar, but attach the new interface to an\n"
388 " an automatically managed bridge interface\n"
389 " --network-namespace-path=PATH\n"
390 " Set network namespace to the one represented by\n"
391 " the specified kernel namespace file node\n"
392 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
393 " Expose a container IP port on the host\n\n"
394 "%3$sSecurity:%4$s\n"
395 " --capability=CAP In addition to the default, retain specified\n"
396 " capability\n"
397 " --drop-capability=CAP Drop the specified capability from the default set\n"
398 " --ambient-capability=CAP\n"
399 " Sets the specified capability for the started\n"
400 " process. Not useful if booting a machine.\n"
401 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
402 " --system-call-filter=LIST|~LIST\n"
403 " Permit/prohibit specific system calls\n"
404 " -Z --selinux-context=SECLABEL\n"
405 " Set the SELinux security context to be used by\n"
406 " processes in the container\n"
407 " -L --selinux-apifs-context=SECLABEL\n"
408 " Set the SELinux security context to be used by\n"
409 " API/tmpfs file systems in the container\n\n"
410 "%3$sResources:%4$s\n"
411 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
412 " --oom-score-adjust=VALUE\n"
413 " Adjust the OOM score value for the payload\n"
414 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
415 " --personality=ARCH Pick personality for this container\n\n"
416 "%3$sIntegration:%4$s\n"
417 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
418 " --timezone=MODE Select mode of /etc/localtime initialization\n"
419 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
420 " host, try-guest, try-host\n"
421 " -j Equivalent to --link-journal=try-guest\n\n"
422 "%3$sMounts:%4$s\n"
423 " --bind=PATH[:PATH[:OPTIONS]]\n"
424 " Bind mount a file or directory from the host into\n"
425 " the container\n"
426 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
427 " Similar, but creates a read-only bind mount\n"
428 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
429 " it\n"
430 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
431 " --overlay=PATH[:PATH...]:PATH\n"
432 " Create an overlay mount from the host to \n"
433 " the container\n"
434 " --overlay-ro=PATH[:PATH...]:PATH\n"
435 " Similar, but creates a read-only overlay mount\n"
436 " --bind-user=NAME Bind user from host to container\n\n"
437 "%3$sInput/Output:%4$s\n"
438 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
439 " set up for the container.\n"
440 " -P --pipe Equivalent to --console=pipe\n\n"
441 "%3$sCredentials:%4$s\n"
442 " --set-credential=ID:VALUE\n"
443 " Pass a credential with literal value to container.\n"
444 " --load-credential=ID:PATH\n"
445 " Load credential to pass to container from file or\n"
446 " AF_UNIX stream socket.\n"
447 "\nSee the %2$s for details.\n",
448 program_invocation_short_name,
449 link,
450 ansi_underline(),
451 ansi_normal(),
452 ansi_highlight(),
453 ansi_normal());
454
455 return 0;
456 }
457
458 static int custom_mount_check_all(void) {
459 size_t i;
460
461 for (i = 0; i < arg_n_custom_mounts; i++) {
462 CustomMount *m = &arg_custom_mounts[i];
463
464 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
465 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
466 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
467 "--private-users-ownership=own may not be combined with custom root mounts.");
468 if (arg_uid_shift == UID_INVALID)
469 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
470 "--private-users with automatic UID shift may not be combined with custom root mounts.");
471 }
472 }
473
474 return 0;
475 }
476
477 static int detect_unified_cgroup_hierarchy_from_environment(void) {
478 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
479 int r;
480
481 /* Allow the user to control whether the unified hierarchy is used */
482
483 e = getenv(var);
484 if (!e) {
485 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
486 var = "UNIFIED_CGROUP_HIERARCHY";
487 e = getenv(var);
488 }
489
490 if (!isempty(e)) {
491 r = parse_boolean(e);
492 if (r < 0)
493 return log_error_errno(r, "Failed to parse $%s: %m", var);
494 if (r > 0)
495 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
496 else
497 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
498 }
499
500 return 0;
501 }
502
503 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
504 int r;
505
506 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
507 * in the image actually supports. */
508 r = cg_all_unified();
509 if (r < 0)
510 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
511 if (r > 0) {
512 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
513 * routine only detects 231, so we'll have a false negative here for 230. */
514 r = systemd_installation_has_version(directory, "230");
515 if (r < 0)
516 return log_error_errno(r, "Failed to determine systemd version in container: %m");
517 if (r > 0)
518 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
519 else
520 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
521 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
522 /* Mixed cgroup hierarchy support was added in 233 */
523 r = systemd_installation_has_version(directory, "233");
524 if (r < 0)
525 return log_error_errno(r, "Failed to determine systemd version in container: %m");
526 if (r > 0)
527 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
528 else
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
530 } else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
532
533 log_debug("Using %s hierarchy for container.",
534 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
535 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
536
537 return 0;
538 }
539
540 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
541 uint64_t mask = 0;
542 int r;
543
544 for (;;) {
545 _cleanup_free_ char *t = NULL;
546
547 r = extract_first_word(&spec, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
550 if (r == 0)
551 break;
552
553 if (streq(t, "help")) {
554 for (int i = 0; i < capability_list_length(); i++) {
555 const char *name;
556
557 name = capability_to_name(i);
558 if (name)
559 puts(name);
560 }
561
562 return 0; /* quit */
563 }
564
565 if (streq(t, "all"))
566 mask = UINT64_MAX;
567 else {
568 r = capability_from_name(t);
569 if (r < 0)
570 return log_error_errno(r, "Failed to parse capability %s.", t);
571
572 mask |= 1ULL << r;
573 }
574 }
575
576 *ret_mask = mask;
577 return 1; /* continue */
578 }
579
580 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
581 int r;
582
583 r = getenv_bool(name);
584 if (r == -ENXIO)
585 return 0;
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse $%s: %m", name);
588
589 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
590 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
591 return 0;
592 }
593
594 static int parse_mount_settings_env(void) {
595 const char *e;
596 int r;
597
598 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
599 if (r < 0 && r != -ENXIO)
600 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
601 if (r >= 0)
602 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
603
604 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
605 if (streq_ptr(e, "network"))
606 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
607
608 else if (e) {
609 r = parse_boolean(e);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
612
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
614 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
615 }
616
617 return 0;
618 }
619
620 static int parse_environment(void) {
621 const char *e;
622 int r;
623
624 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
625 if (r < 0)
626 return r;
627 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
628 if (r < 0)
629 return r;
630 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
631 if (r < 0)
632 return r;
633 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
634 if (r < 0)
635 return r;
636
637 r = parse_mount_settings_env();
638 if (r < 0)
639 return r;
640
641 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
642 * even if it is supported. If not supported, it has no effect. */
643 if (!cg_ns_supported())
644 arg_use_cgns = false;
645 else {
646 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
647 if (r < 0) {
648 if (r != -ENXIO)
649 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
650
651 arg_use_cgns = true;
652 } else {
653 arg_use_cgns = r > 0;
654 arg_settings_mask |= SETTING_USE_CGNS;
655 }
656 }
657
658 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
659 if (e)
660 arg_container_service_name = e;
661
662 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
663 if (r >= 0)
664 arg_suppress_sync = r;
665 else if (r != -ENXIO)
666 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
667
668 return detect_unified_cgroup_hierarchy_from_environment();
669 }
670
671 static int parse_argv(int argc, char *argv[]) {
672 enum {
673 ARG_VERSION = 0x100,
674 ARG_PRIVATE_NETWORK,
675 ARG_UUID,
676 ARG_READ_ONLY,
677 ARG_CAPABILITY,
678 ARG_AMBIENT_CAPABILITY,
679 ARG_DROP_CAPABILITY,
680 ARG_LINK_JOURNAL,
681 ARG_BIND,
682 ARG_BIND_RO,
683 ARG_TMPFS,
684 ARG_OVERLAY,
685 ARG_OVERLAY_RO,
686 ARG_INACCESSIBLE,
687 ARG_SHARE_SYSTEM,
688 ARG_REGISTER,
689 ARG_KEEP_UNIT,
690 ARG_NETWORK_INTERFACE,
691 ARG_NETWORK_MACVLAN,
692 ARG_NETWORK_IPVLAN,
693 ARG_NETWORK_BRIDGE,
694 ARG_NETWORK_ZONE,
695 ARG_NETWORK_VETH_EXTRA,
696 ARG_NETWORK_NAMESPACE_PATH,
697 ARG_PERSONALITY,
698 ARG_VOLATILE,
699 ARG_TEMPLATE,
700 ARG_PROPERTY,
701 ARG_PRIVATE_USERS,
702 ARG_KILL_SIGNAL,
703 ARG_SETTINGS,
704 ARG_CHDIR,
705 ARG_PIVOT_ROOT,
706 ARG_PRIVATE_USERS_CHOWN,
707 ARG_PRIVATE_USERS_OWNERSHIP,
708 ARG_NOTIFY_READY,
709 ARG_ROOT_HASH,
710 ARG_ROOT_HASH_SIG,
711 ARG_VERITY_DATA,
712 ARG_SYSTEM_CALL_FILTER,
713 ARG_RLIMIT,
714 ARG_HOSTNAME,
715 ARG_NO_NEW_PRIVILEGES,
716 ARG_OOM_SCORE_ADJUST,
717 ARG_CPU_AFFINITY,
718 ARG_RESOLV_CONF,
719 ARG_TIMEZONE,
720 ARG_CONSOLE,
721 ARG_PIPE,
722 ARG_OCI_BUNDLE,
723 ARG_NO_PAGER,
724 ARG_SET_CREDENTIAL,
725 ARG_LOAD_CREDENTIAL,
726 ARG_BIND_USER,
727 ARG_SUPPRESS_SYNC,
728 };
729
730 static const struct option options[] = {
731 { "help", no_argument, NULL, 'h' },
732 { "version", no_argument, NULL, ARG_VERSION },
733 { "directory", required_argument, NULL, 'D' },
734 { "template", required_argument, NULL, ARG_TEMPLATE },
735 { "ephemeral", no_argument, NULL, 'x' },
736 { "user", required_argument, NULL, 'u' },
737 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
738 { "as-pid2", no_argument, NULL, 'a' },
739 { "boot", no_argument, NULL, 'b' },
740 { "uuid", required_argument, NULL, ARG_UUID },
741 { "read-only", no_argument, NULL, ARG_READ_ONLY },
742 { "capability", required_argument, NULL, ARG_CAPABILITY },
743 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
744 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
745 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
746 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
747 { "bind", required_argument, NULL, ARG_BIND },
748 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
749 { "tmpfs", required_argument, NULL, ARG_TMPFS },
750 { "overlay", required_argument, NULL, ARG_OVERLAY },
751 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
752 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
753 { "machine", required_argument, NULL, 'M' },
754 { "hostname", required_argument, NULL, ARG_HOSTNAME },
755 { "slice", required_argument, NULL, 'S' },
756 { "setenv", required_argument, NULL, 'E' },
757 { "selinux-context", required_argument, NULL, 'Z' },
758 { "selinux-apifs-context", required_argument, NULL, 'L' },
759 { "quiet", no_argument, NULL, 'q' },
760 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
761 { "register", required_argument, NULL, ARG_REGISTER },
762 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
763 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
764 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
765 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
766 { "network-veth", no_argument, NULL, 'n' },
767 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
768 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
769 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
770 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
771 { "personality", required_argument, NULL, ARG_PERSONALITY },
772 { "image", required_argument, NULL, 'i' },
773 { "volatile", optional_argument, NULL, ARG_VOLATILE },
774 { "port", required_argument, NULL, 'p' },
775 { "property", required_argument, NULL, ARG_PROPERTY },
776 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
777 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
778 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
779 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
780 { "settings", required_argument, NULL, ARG_SETTINGS },
781 { "chdir", required_argument, NULL, ARG_CHDIR },
782 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
783 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
784 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
785 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
786 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
787 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
788 { "rlimit", required_argument, NULL, ARG_RLIMIT },
789 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
790 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
791 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
792 { "timezone", required_argument, NULL, ARG_TIMEZONE },
793 { "console", required_argument, NULL, ARG_CONSOLE },
794 { "pipe", no_argument, NULL, ARG_PIPE },
795 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
796 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
797 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
798 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
799 { "bind-user", required_argument, NULL, ARG_BIND_USER },
800 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
801 {}
802 };
803
804 int c, r;
805 uint64_t plus = 0, minus = 0;
806 bool mask_all_settings = false, mask_no_settings = false;
807
808 assert(argc >= 0);
809 assert(argv);
810
811 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
812 switch (c) {
813
814 case 'h':
815 return help();
816
817 case ARG_VERSION:
818 return version();
819
820 case 'D':
821 r = parse_path_argument(optarg, false, &arg_directory);
822 if (r < 0)
823 return r;
824
825 arg_settings_mask |= SETTING_DIRECTORY;
826 break;
827
828 case ARG_TEMPLATE:
829 r = parse_path_argument(optarg, false, &arg_template);
830 if (r < 0)
831 return r;
832
833 arg_settings_mask |= SETTING_DIRECTORY;
834 break;
835
836 case 'i':
837 r = parse_path_argument(optarg, false, &arg_image);
838 if (r < 0)
839 return r;
840
841 arg_settings_mask |= SETTING_DIRECTORY;
842 break;
843
844 case ARG_OCI_BUNDLE:
845 r = parse_path_argument(optarg, false, &arg_oci_bundle);
846 if (r < 0)
847 return r;
848
849 break;
850
851 case 'x':
852 arg_ephemeral = true;
853 arg_settings_mask |= SETTING_EPHEMERAL;
854 break;
855
856 case 'u':
857 r = free_and_strdup(&arg_user, optarg);
858 if (r < 0)
859 return log_oom();
860
861 arg_settings_mask |= SETTING_USER;
862 break;
863
864 case ARG_NETWORK_ZONE: {
865 char *j;
866
867 j = strjoin("vz-", optarg);
868 if (!j)
869 return log_oom();
870
871 if (!ifname_valid(j)) {
872 log_error("Network zone name not valid: %s", j);
873 free(j);
874 return -EINVAL;
875 }
876
877 free_and_replace(arg_network_zone, j);
878
879 arg_network_veth = true;
880 arg_private_network = true;
881 arg_settings_mask |= SETTING_NETWORK;
882 break;
883 }
884
885 case ARG_NETWORK_BRIDGE:
886
887 if (!ifname_valid(optarg))
888 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
889 "Bridge interface name not valid: %s", optarg);
890
891 r = free_and_strdup(&arg_network_bridge, optarg);
892 if (r < 0)
893 return log_oom();
894
895 _fallthrough_;
896 case 'n':
897 arg_network_veth = true;
898 arg_private_network = true;
899 arg_settings_mask |= SETTING_NETWORK;
900 break;
901
902 case ARG_NETWORK_VETH_EXTRA:
903 r = veth_extra_parse(&arg_network_veth_extra, optarg);
904 if (r < 0)
905 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
906
907 arg_private_network = true;
908 arg_settings_mask |= SETTING_NETWORK;
909 break;
910
911 case ARG_NETWORK_INTERFACE:
912 if (!ifname_valid(optarg))
913 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
914 "Network interface name not valid: %s", optarg);
915
916 r = test_network_interface_initialized(optarg);
917 if (r < 0)
918 return r;
919
920 if (strv_extend(&arg_network_interfaces, optarg) < 0)
921 return log_oom();
922
923 arg_private_network = true;
924 arg_settings_mask |= SETTING_NETWORK;
925 break;
926
927 case ARG_NETWORK_MACVLAN:
928
929 if (!ifname_valid(optarg))
930 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
931 "MACVLAN network interface name not valid: %s", optarg);
932
933 r = test_network_interface_initialized(optarg);
934 if (r < 0)
935 return r;
936
937 if (strv_extend(&arg_network_macvlan, optarg) < 0)
938 return log_oom();
939
940 arg_private_network = true;
941 arg_settings_mask |= SETTING_NETWORK;
942 break;
943
944 case ARG_NETWORK_IPVLAN:
945
946 if (!ifname_valid(optarg))
947 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
948 "IPVLAN network interface name not valid: %s", optarg);
949
950 r = test_network_interface_initialized(optarg);
951 if (r < 0)
952 return r;
953
954 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
955 return log_oom();
956
957 _fallthrough_;
958 case ARG_PRIVATE_NETWORK:
959 arg_private_network = true;
960 arg_settings_mask |= SETTING_NETWORK;
961 break;
962
963 case ARG_NETWORK_NAMESPACE_PATH:
964 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
965 if (r < 0)
966 return r;
967
968 arg_settings_mask |= SETTING_NETWORK;
969 break;
970
971 case 'b':
972 if (arg_start_mode == START_PID2)
973 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
974 "--boot and --as-pid2 may not be combined.");
975
976 arg_start_mode = START_BOOT;
977 arg_settings_mask |= SETTING_START_MODE;
978 break;
979
980 case 'a':
981 if (arg_start_mode == START_BOOT)
982 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
983 "--boot and --as-pid2 may not be combined.");
984
985 arg_start_mode = START_PID2;
986 arg_settings_mask |= SETTING_START_MODE;
987 break;
988
989 case ARG_UUID:
990 r = sd_id128_from_string(optarg, &arg_uuid);
991 if (r < 0)
992 return log_error_errno(r, "Invalid UUID: %s", optarg);
993
994 if (sd_id128_is_null(arg_uuid))
995 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
996 "Machine UUID may not be all zeroes.");
997
998 arg_settings_mask |= SETTING_MACHINE_ID;
999 break;
1000
1001 case 'S': {
1002 _cleanup_free_ char *mangled = NULL;
1003
1004 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1005 if (r < 0)
1006 return log_oom();
1007
1008 free_and_replace(arg_slice, mangled);
1009 arg_settings_mask |= SETTING_SLICE;
1010 break;
1011 }
1012
1013 case 'M':
1014 if (isempty(optarg))
1015 arg_machine = mfree(arg_machine);
1016 else {
1017 if (!hostname_is_valid(optarg, 0))
1018 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019 "Invalid machine name: %s", optarg);
1020
1021 r = free_and_strdup(&arg_machine, optarg);
1022 if (r < 0)
1023 return log_oom();
1024 }
1025 break;
1026
1027 case ARG_HOSTNAME:
1028 if (isempty(optarg))
1029 arg_hostname = mfree(arg_hostname);
1030 else {
1031 if (!hostname_is_valid(optarg, 0))
1032 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1033 "Invalid hostname: %s", optarg);
1034
1035 r = free_and_strdup(&arg_hostname, optarg);
1036 if (r < 0)
1037 return log_oom();
1038 }
1039
1040 arg_settings_mask |= SETTING_HOSTNAME;
1041 break;
1042
1043 case 'Z':
1044 arg_selinux_context = optarg;
1045 break;
1046
1047 case 'L':
1048 arg_selinux_apifs_context = optarg;
1049 break;
1050
1051 case ARG_READ_ONLY:
1052 arg_read_only = true;
1053 arg_settings_mask |= SETTING_READ_ONLY;
1054 break;
1055
1056 case ARG_AMBIENT_CAPABILITY: {
1057 uint64_t m;
1058 r = parse_capability_spec(optarg, &m);
1059 if (r <= 0)
1060 return r;
1061 arg_caps_ambient |= m;
1062 arg_settings_mask |= SETTING_CAPABILITY;
1063 break;
1064 }
1065 case ARG_CAPABILITY:
1066 case ARG_DROP_CAPABILITY: {
1067 uint64_t m;
1068 r = parse_capability_spec(optarg, &m);
1069 if (r <= 0)
1070 return r;
1071
1072 if (c == ARG_CAPABILITY)
1073 plus |= m;
1074 else
1075 minus |= m;
1076 arg_settings_mask |= SETTING_CAPABILITY;
1077 break;
1078 }
1079 case ARG_NO_NEW_PRIVILEGES:
1080 r = parse_boolean(optarg);
1081 if (r < 0)
1082 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1083
1084 arg_no_new_privileges = r;
1085 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1086 break;
1087
1088 case 'j':
1089 arg_link_journal = LINK_GUEST;
1090 arg_link_journal_try = true;
1091 arg_settings_mask |= SETTING_LINK_JOURNAL;
1092 break;
1093
1094 case ARG_LINK_JOURNAL:
1095 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1098
1099 arg_settings_mask |= SETTING_LINK_JOURNAL;
1100 break;
1101
1102 case ARG_BIND:
1103 case ARG_BIND_RO:
1104 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1105 if (r < 0)
1106 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1107
1108 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1109 break;
1110
1111 case ARG_TMPFS:
1112 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1113 if (r < 0)
1114 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1115
1116 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1117 break;
1118
1119 case ARG_OVERLAY:
1120 case ARG_OVERLAY_RO:
1121 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1122 if (r == -EADDRNOTAVAIL)
1123 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1124 if (r < 0)
1125 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1126
1127 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1128 break;
1129
1130 case ARG_INACCESSIBLE:
1131 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1132 if (r < 0)
1133 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1134
1135 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1136 break;
1137
1138 case 'E':
1139 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1140 if (r < 0)
1141 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1142
1143 arg_settings_mask |= SETTING_ENVIRONMENT;
1144 break;
1145
1146 case 'q':
1147 arg_quiet = true;
1148 break;
1149
1150 case ARG_SHARE_SYSTEM:
1151 /* We don't officially support this anymore, except for compat reasons. People should use the
1152 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1153 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1154 arg_clone_ns_flags = 0;
1155 break;
1156
1157 case ARG_REGISTER:
1158 r = parse_boolean(optarg);
1159 if (r < 0) {
1160 log_error("Failed to parse --register= argument: %s", optarg);
1161 return r;
1162 }
1163
1164 arg_register = r;
1165 break;
1166
1167 case ARG_KEEP_UNIT:
1168 arg_keep_unit = true;
1169 break;
1170
1171 case ARG_PERSONALITY:
1172
1173 arg_personality = personality_from_string(optarg);
1174 if (arg_personality == PERSONALITY_INVALID)
1175 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1176 "Unknown or unsupported personality '%s'.", optarg);
1177
1178 arg_settings_mask |= SETTING_PERSONALITY;
1179 break;
1180
1181 case ARG_VOLATILE:
1182
1183 if (!optarg)
1184 arg_volatile_mode = VOLATILE_YES;
1185 else if (streq(optarg, "help")) {
1186 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1187 return 0;
1188 } else {
1189 VolatileMode m;
1190
1191 m = volatile_mode_from_string(optarg);
1192 if (m < 0)
1193 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1194 "Failed to parse --volatile= argument: %s", optarg);
1195 else
1196 arg_volatile_mode = m;
1197 }
1198
1199 arg_settings_mask |= SETTING_VOLATILE_MODE;
1200 break;
1201
1202 case 'p':
1203 r = expose_port_parse(&arg_expose_ports, optarg);
1204 if (r == -EEXIST)
1205 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1206 if (r < 0)
1207 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1208
1209 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1210 break;
1211
1212 case ARG_PROPERTY:
1213 if (strv_extend(&arg_property, optarg) < 0)
1214 return log_oom();
1215
1216 break;
1217
1218 case ARG_PRIVATE_USERS: {
1219 int boolean;
1220
1221 if (!optarg)
1222 boolean = true;
1223 else if (!in_charset(optarg, DIGITS))
1224 /* do *not* parse numbers as booleans */
1225 boolean = parse_boolean(optarg);
1226 else
1227 boolean = -1;
1228
1229 if (boolean == 0) {
1230 /* no: User namespacing off */
1231 arg_userns_mode = USER_NAMESPACE_NO;
1232 arg_uid_shift = UID_INVALID;
1233 arg_uid_range = UINT32_C(0x10000);
1234 } else if (boolean > 0) {
1235 /* yes: User namespacing on, UID range is read from root dir */
1236 arg_userns_mode = USER_NAMESPACE_FIXED;
1237 arg_uid_shift = UID_INVALID;
1238 arg_uid_range = UINT32_C(0x10000);
1239 } else if (streq(optarg, "pick")) {
1240 /* pick: User namespacing on, UID range is picked randomly */
1241 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1242 * implied by USER_NAMESPACE_PICK
1243 * further down. */
1244 arg_uid_shift = UID_INVALID;
1245 arg_uid_range = UINT32_C(0x10000);
1246
1247 } else if (streq(optarg, "identity")) {
1248 /* identitiy: User namespaces on, UID range is map the 0…0xFFFF range to
1249 * itself, i.e. we don't actually map anything, but do take benefit of
1250 * isolation of capability sets. */
1251 arg_userns_mode = USER_NAMESPACE_FIXED;
1252 arg_uid_shift = 0;
1253 arg_uid_range = UINT32_C(0x10000);
1254 } else {
1255 _cleanup_free_ char *buffer = NULL;
1256 const char *range, *shift;
1257
1258 /* anything else: User namespacing on, UID range is explicitly configured */
1259
1260 range = strchr(optarg, ':');
1261 if (range) {
1262 buffer = strndup(optarg, range - optarg);
1263 if (!buffer)
1264 return log_oom();
1265 shift = buffer;
1266
1267 range++;
1268 r = safe_atou32(range, &arg_uid_range);
1269 if (r < 0)
1270 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1271 } else
1272 shift = optarg;
1273
1274 r = parse_uid(shift, &arg_uid_shift);
1275 if (r < 0)
1276 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1277
1278 arg_userns_mode = USER_NAMESPACE_FIXED;
1279
1280 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1282 }
1283
1284 arg_settings_mask |= SETTING_USERNS;
1285 break;
1286 }
1287
1288 case 'U':
1289 if (userns_supported()) {
1290 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1291 * implied by USER_NAMESPACE_PICK
1292 * further down. */
1293 arg_uid_shift = UID_INVALID;
1294 arg_uid_range = UINT32_C(0x10000);
1295
1296 arg_settings_mask |= SETTING_USERNS;
1297 }
1298
1299 break;
1300
1301 case ARG_PRIVATE_USERS_CHOWN:
1302 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1303
1304 arg_settings_mask |= SETTING_USERNS;
1305 break;
1306
1307 case ARG_PRIVATE_USERS_OWNERSHIP:
1308 if (streq(optarg, "help")) {
1309 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1310 return 0;
1311 }
1312
1313 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1314 if (arg_userns_ownership < 0)
1315 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1316
1317 arg_settings_mask |= SETTING_USERNS;
1318 break;
1319
1320 case ARG_KILL_SIGNAL:
1321 if (streq(optarg, "help")) {
1322 DUMP_STRING_TABLE(signal, int, _NSIG);
1323 return 0;
1324 }
1325
1326 arg_kill_signal = signal_from_string(optarg);
1327 if (arg_kill_signal < 0)
1328 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1329
1330 arg_settings_mask |= SETTING_KILL_SIGNAL;
1331 break;
1332
1333 case ARG_SETTINGS:
1334
1335 /* no → do not read files
1336 * yes → read files, do not override cmdline, trust only subset
1337 * override → read files, override cmdline, trust only subset
1338 * trusted → read files, do not override cmdline, trust all
1339 */
1340
1341 r = parse_boolean(optarg);
1342 if (r < 0) {
1343 if (streq(optarg, "trusted")) {
1344 mask_all_settings = false;
1345 mask_no_settings = false;
1346 arg_settings_trusted = true;
1347
1348 } else if (streq(optarg, "override")) {
1349 mask_all_settings = false;
1350 mask_no_settings = true;
1351 arg_settings_trusted = -1;
1352 } else
1353 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1354 } else if (r > 0) {
1355 /* yes */
1356 mask_all_settings = false;
1357 mask_no_settings = false;
1358 arg_settings_trusted = -1;
1359 } else {
1360 /* no */
1361 mask_all_settings = true;
1362 mask_no_settings = false;
1363 arg_settings_trusted = false;
1364 }
1365
1366 break;
1367
1368 case ARG_CHDIR:
1369 if (!path_is_absolute(optarg))
1370 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1371 "Working directory %s is not an absolute path.", optarg);
1372
1373 r = free_and_strdup(&arg_chdir, optarg);
1374 if (r < 0)
1375 return log_oom();
1376
1377 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1378 break;
1379
1380 case ARG_PIVOT_ROOT:
1381 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1382 if (r < 0)
1383 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1384
1385 arg_settings_mask |= SETTING_PIVOT_ROOT;
1386 break;
1387
1388 case ARG_NOTIFY_READY:
1389 r = parse_boolean(optarg);
1390 if (r < 0)
1391 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1392 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1393 arg_notify_ready = r;
1394 arg_settings_mask |= SETTING_NOTIFY_READY;
1395 break;
1396
1397 case ARG_ROOT_HASH: {
1398 _cleanup_free_ void *k = NULL;
1399 size_t l;
1400
1401 r = unhexmem(optarg, strlen(optarg), &k, &l);
1402 if (r < 0)
1403 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1404 if (l < sizeof(sd_id128_t))
1405 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1406
1407 free_and_replace(arg_verity_settings.root_hash, k);
1408 arg_verity_settings.root_hash_size = l;
1409 break;
1410 }
1411
1412 case ARG_ROOT_HASH_SIG: {
1413 char *value;
1414 size_t l;
1415 void *p;
1416
1417 if ((value = startswith(optarg, "base64:"))) {
1418 r = unbase64mem(value, strlen(value), &p, &l);
1419 if (r < 0)
1420 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1421
1422 } else {
1423 r = read_full_file(optarg, (char**) &p, &l);
1424 if (r < 0)
1425 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1426 }
1427
1428 free_and_replace(arg_verity_settings.root_hash_sig, p);
1429 arg_verity_settings.root_hash_sig_size = l;
1430 break;
1431 }
1432
1433 case ARG_VERITY_DATA:
1434 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1435 if (r < 0)
1436 return r;
1437 break;
1438
1439 case ARG_SYSTEM_CALL_FILTER: {
1440 bool negative;
1441 const char *items;
1442
1443 negative = optarg[0] == '~';
1444 items = negative ? optarg + 1 : optarg;
1445
1446 for (;;) {
1447 _cleanup_free_ char *word = NULL;
1448
1449 r = extract_first_word(&items, &word, NULL, 0);
1450 if (r == 0)
1451 break;
1452 if (r == -ENOMEM)
1453 return log_oom();
1454 if (r < 0)
1455 return log_error_errno(r, "Failed to parse system call filter: %m");
1456
1457 if (negative)
1458 r = strv_extend(&arg_syscall_deny_list, word);
1459 else
1460 r = strv_extend(&arg_syscall_allow_list, word);
1461 if (r < 0)
1462 return log_oom();
1463 }
1464
1465 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1466 break;
1467 }
1468
1469 case ARG_RLIMIT: {
1470 const char *eq;
1471 _cleanup_free_ char *name = NULL;
1472 int rl;
1473
1474 if (streq(optarg, "help")) {
1475 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1476 return 0;
1477 }
1478
1479 eq = strchr(optarg, '=');
1480 if (!eq)
1481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1482 "--rlimit= expects an '=' assignment.");
1483
1484 name = strndup(optarg, eq - optarg);
1485 if (!name)
1486 return log_oom();
1487
1488 rl = rlimit_from_string_harder(name);
1489 if (rl < 0)
1490 return log_error_errno(rl, "Unknown resource limit: %s", name);
1491
1492 if (!arg_rlimit[rl]) {
1493 arg_rlimit[rl] = new0(struct rlimit, 1);
1494 if (!arg_rlimit[rl])
1495 return log_oom();
1496 }
1497
1498 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1501
1502 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1503 break;
1504 }
1505
1506 case ARG_OOM_SCORE_ADJUST:
1507 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1508 if (r < 0)
1509 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1510
1511 arg_oom_score_adjust_set = true;
1512 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1513 break;
1514
1515 case ARG_CPU_AFFINITY: {
1516 CPUSet cpuset;
1517
1518 r = parse_cpu_set(optarg, &cpuset);
1519 if (r < 0)
1520 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1521
1522 cpu_set_reset(&arg_cpu_set);
1523 arg_cpu_set = cpuset;
1524 arg_settings_mask |= SETTING_CPU_AFFINITY;
1525 break;
1526 }
1527
1528 case ARG_RESOLV_CONF:
1529 if (streq(optarg, "help")) {
1530 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1531 return 0;
1532 }
1533
1534 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1535 if (arg_resolv_conf < 0)
1536 return log_error_errno(arg_resolv_conf,
1537 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1538
1539 arg_settings_mask |= SETTING_RESOLV_CONF;
1540 break;
1541
1542 case ARG_TIMEZONE:
1543 if (streq(optarg, "help")) {
1544 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1545 return 0;
1546 }
1547
1548 arg_timezone = timezone_mode_from_string(optarg);
1549 if (arg_timezone < 0)
1550 return log_error_errno(arg_timezone,
1551 "Failed to parse /etc/localtime mode: %s", optarg);
1552
1553 arg_settings_mask |= SETTING_TIMEZONE;
1554 break;
1555
1556 case ARG_CONSOLE:
1557 r = handle_arg_console(optarg);
1558 if (r <= 0)
1559 return r;
1560 break;
1561
1562 case 'P':
1563 case ARG_PIPE:
1564 r = handle_arg_console("pipe");
1565 if (r <= 0)
1566 return r;
1567 break;
1568
1569 case ARG_NO_PAGER:
1570 arg_pager_flags |= PAGER_DISABLE;
1571 break;
1572
1573 case ARG_SET_CREDENTIAL: {
1574 _cleanup_free_ char *word = NULL, *data = NULL;
1575 const char *p = optarg;
1576 Credential *a;
1577 ssize_t l;
1578
1579 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1580 if (r == -ENOMEM)
1581 return log_oom();
1582 if (r < 0)
1583 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1584 if (r == 0 || !p)
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1586
1587 if (!credential_name_valid(word))
1588 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1589
1590 for (size_t i = 0; i < arg_n_credentials; i++)
1591 if (streq(arg_credentials[i].id, word))
1592 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1593
1594 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1595 if (l < 0)
1596 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1597
1598 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1599 if (!a)
1600 return log_oom();
1601
1602 a[arg_n_credentials++] = (Credential) {
1603 .id = TAKE_PTR(word),
1604 .data = TAKE_PTR(data),
1605 .size = l,
1606 };
1607
1608 arg_credentials = a;
1609
1610 arg_settings_mask |= SETTING_CREDENTIALS;
1611 break;
1612 }
1613
1614 case ARG_LOAD_CREDENTIAL: {
1615 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1616 _cleanup_(erase_and_freep) char *data = NULL;
1617 _cleanup_free_ char *word = NULL, *j = NULL;
1618 const char *p = optarg;
1619 Credential *a;
1620 size_t size, i;
1621
1622 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1623 if (r == -ENOMEM)
1624 return log_oom();
1625 if (r < 0)
1626 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1627 if (r == 0 || !p)
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1629
1630 if (!credential_name_valid(word))
1631 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1632
1633 for (i = 0; i < arg_n_credentials; i++)
1634 if (streq(arg_credentials[i].id, word))
1635 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1636
1637 if (path_is_absolute(p))
1638 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1639 else {
1640 const char *e;
1641
1642 r = get_credentials_dir(&e);
1643 if (r < 0)
1644 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1645
1646 j = path_join(e, p);
1647 if (!j)
1648 return log_oom();
1649 }
1650
1651 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1652 flags,
1653 NULL,
1654 &data, &size);
1655 if (r < 0)
1656 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1657
1658 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1659 if (!a)
1660 return log_oom();
1661
1662 a[arg_n_credentials++] = (Credential) {
1663 .id = TAKE_PTR(word),
1664 .data = TAKE_PTR(data),
1665 .size = size,
1666 };
1667
1668 arg_credentials = a;
1669
1670 arg_settings_mask |= SETTING_CREDENTIALS;
1671 break;
1672 }
1673
1674 case ARG_BIND_USER:
1675 if (!valid_user_group_name(optarg, 0))
1676 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1677
1678 if (strv_extend(&arg_bind_user, optarg) < 0)
1679 return log_oom();
1680
1681 arg_settings_mask |= SETTING_BIND_USER;
1682 break;
1683
1684 case ARG_SUPPRESS_SYNC:
1685 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1686 if (r < 0)
1687 return r;
1688
1689 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1690 break;
1691
1692 case '?':
1693 return -EINVAL;
1694
1695 default:
1696 assert_not_reached();
1697 }
1698
1699 if (argc > optind) {
1700 strv_free(arg_parameters);
1701 arg_parameters = strv_copy(argv + optind);
1702 if (!arg_parameters)
1703 return log_oom();
1704
1705 arg_settings_mask |= SETTING_START_MODE;
1706 }
1707
1708 if (arg_ephemeral && arg_template && !arg_directory)
1709 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1710 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1711 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1712 * --directory=". */
1713 arg_directory = TAKE_PTR(arg_template);
1714
1715 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1716
1717 /* Make sure to parse environment before we reset the settings mask below */
1718 r = parse_environment();
1719 if (r < 0)
1720 return r;
1721
1722 /* Load all settings from .nspawn files */
1723 if (mask_no_settings)
1724 arg_settings_mask = 0;
1725
1726 /* Don't load any settings from .nspawn files */
1727 if (mask_all_settings)
1728 arg_settings_mask = _SETTINGS_MASK_ALL;
1729
1730 return 1;
1731 }
1732
1733 static int verify_arguments(void) {
1734 int r;
1735
1736 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1737 /* If we are running the stub init in the container, we don't need to look at what the init
1738 * in the container supports, because we are not using it. Let's immediately pick the right
1739 * setting based on the host system configuration.
1740 *
1741 * We only do this, if the user didn't use an environment variable to override the detection.
1742 */
1743
1744 r = cg_all_unified();
1745 if (r < 0)
1746 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1747 if (r > 0)
1748 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1749 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1750 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1751 else
1752 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1753 }
1754
1755 if (arg_userns_mode != USER_NAMESPACE_NO)
1756 arg_mount_settings |= MOUNT_USE_USERNS;
1757
1758 if (arg_private_network)
1759 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1760
1761 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1762 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1763 arg_register = false;
1764 if (arg_start_mode != START_PID1)
1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1766 }
1767
1768 if (arg_userns_ownership < 0)
1769 arg_userns_ownership =
1770 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1771 USER_NAMESPACE_OWNERSHIP_OFF;
1772
1773 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1774 arg_kill_signal = SIGRTMIN+3;
1775
1776 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1777 arg_read_only = true;
1778
1779 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1780 arg_read_only = true;
1781
1782 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1783 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1784 * The latter is not technically a user session, but we don't need to labour the point. */
1785 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1786
1787 if (arg_directory && arg_image)
1788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1789
1790 if (arg_template && arg_image)
1791 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1792
1793 if (arg_template && !(arg_directory || arg_machine))
1794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1795
1796 if (arg_ephemeral && arg_template)
1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1798
1799 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1801
1802 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1803 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1804
1805 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1807 "--read-only and --private-users-ownership=chown may not be combined.");
1808
1809 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1810 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1811 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1812 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1813 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1814
1815 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1816 * we need to error out, to avoid conflicts between different network options. */
1817 if (arg_network_namespace_path &&
1818 (arg_network_interfaces || arg_network_macvlan ||
1819 arg_network_ipvlan || arg_network_veth_extra ||
1820 arg_network_bridge || arg_network_zone ||
1821 arg_network_veth))
1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1823
1824 if (arg_network_bridge && arg_network_zone)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1826 "--network-bridge= and --network-zone= may not be combined.");
1827
1828 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1829 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1830
1831 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1833
1834 if (arg_expose_ports && !arg_private_network)
1835 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1836
1837 if (arg_caps_ambient) {
1838 if (arg_caps_ambient == UINT64_MAX)
1839 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1840
1841 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1842 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1843
1844 if (arg_start_mode == START_BOOT)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1846 }
1847
1848 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1849 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1850
1851 /* Drop duplicate --bind-user= entries */
1852 strv_uniq(arg_bind_user);
1853
1854 r = custom_mount_check_all();
1855 if (r < 0)
1856 return r;
1857
1858 return 0;
1859 }
1860
1861 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1862 assert(p);
1863
1864 if (arg_userns_mode == USER_NAMESPACE_NO)
1865 return 0;
1866
1867 if (uid == UID_INVALID && gid == GID_INVALID)
1868 return 0;
1869
1870 if (uid != UID_INVALID) {
1871 uid += arg_uid_shift;
1872
1873 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1874 return -EOVERFLOW;
1875 }
1876
1877 if (gid != GID_INVALID) {
1878 gid += (gid_t) arg_uid_shift;
1879
1880 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1881 return -EOVERFLOW;
1882 }
1883
1884 return RET_NERRNO(lchown(p, uid, gid));
1885 }
1886
/* Creates the directory 'path' (prefixed with 'root' via prefix_roota()) with the given mode, and on
 * success adjusts its ownership through userns_lchown().
 *
 * A directory that already exists is not an error: 0 is returned and its ownership is left alone. Any
 * other mkdir() failure is returned as produced by RET_NERRNO(); otherwise the result of
 * userns_lchown() is returned. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = RET_NERRNO(mkdir(full, mode));
        if (k >= 0)
                /* Freshly created, fix up the owner */
                return userns_lchown(full, uid, gid);

        return k == -EEXIST ? 0 : k;
}
1900
/* Matches 'path' against the two zoneinfo prefixes via PATH_STARTSWITH_SET() and returns that macro's
 * result. Callers in this file treat NULL as "does not point into zoneinfo" and otherwise use the result
 * as the zone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1907
1908 static bool etc_writable(void) {
1909 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1910 }
1911
/* Propagates the host's /etc/localtime into the container tree rooted at 'dest', as selected by
 * arg_timezone.
 *
 * TIMEZONE_AUTO is resolved to a concrete mode here, depending on what the host's /etc/localtime is
 * (missing, regular file or symlink) and on etc_writable(). The modes form a fallback chain: a symlink
 * that cannot be created because the zone is missing in the container falls through to a bind mount, and
 * a failed bind mount falls through to a file copy.
 *
 * Almost every failure is only logged and 0 is returned; the one exception is the read-only remount in
 * the bind case, whose result is returned as is. */
static int setup_timezone(const char *dest) {
        _cleanup_free_ char *p = NULL, *etc = NULL;
        const char *where, *check;
        TimezoneMode m;
        int r;

        assert(dest);

        /* Only the auto and symlink modes need to know where the host symlink points ('p'). */
        if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
                r = readlink_malloc("/etc/localtime", &p);
                if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
                        m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
                else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
                        m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
                else if (r < 0) {
                        log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
                        /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
                         * file.
                         *
                         * Example:
                         * ln -s /usr/share/zoneinfo/UTC /etc/localtime
                         */
                        return 0;
                } else if (arg_timezone == TIMEZONE_AUTO)
                        m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
                else
                        m = arg_timezone;
        } else
                m = arg_timezone;

        if (m == TIMEZONE_OFF)
                return 0;

        /* Resolve /etc relative to the container root, so that 'where' is a host-side path. */
        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
        if (r < 0) {
                log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                return 0;
        }

        where = strjoina(etc, "/localtime");

        switch (m) {

        case TIMEZONE_DELETE:
                if (unlink(where) < 0)
                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);

                return 0;

        case TIMEZONE_SYMLINK: {
                _cleanup_free_ char *q = NULL;
                const char *z, *what;

                /* 'z' is the zone name taken from the host symlink target. */
                z = timezone_from_path(p);
                if (!z) {
                        log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
                        return 0;
                }

                r = readlink_malloc(where, &q);
                if (r >= 0 && streq_ptr(timezone_from_path(q), z))
                        return 0; /* Already pointing to the right place? Then do nothing .. */

                /* Only create the symlink if the zone file can be resolved inside the container. */
                check = strjoina(dest, "/usr/share/zoneinfo/", z);
                r = chase_symlinks(check, dest, 0, NULL, NULL);
                if (r < 0)
                        log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                else {
                        if (unlink(where) < 0 && errno != ENOENT) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
                                               errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
                                return 0;
                        }

                        what = strjoina("../usr/share/zoneinfo/", z);
                        if (symlink(what, where) < 0) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
                                               errno, "Failed to correct timezone of container, ignoring: %m");
                                return 0;
                        }

                        /* Symlink created; leave the switch so that its ownership gets fixed below. */
                        break;
                }

                /* Zone not available in the container: try a bind mount instead. */
                _fallthrough_;
        }

        case TIMEZONE_BIND: {
                _cleanup_free_ char *resolved = NULL;
                int found;

                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
                if (found < 0) {
                        log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                        return 0;
                }

                if (found == 0) /* missing? */
                        (void) touch(resolved);

                /* Bind mount first, then remount the bind mount with the read-only and related flags. */
                r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
                if (r >= 0)
                        return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);

                _fallthrough_;
        }

        case TIMEZONE_COPY:
                /* If mounting failed, try to copy */
                r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
                if (r < 0) {
                        log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to copy /etc/localtime to %s, ignoring: %m", where);
                        return 0;
                }

                break;

        default:
                assert_not_reached();
        }

        /* Fix permissions of the symlink or file copy we just created */
        r = userns_lchown(where, 0, 0);
        if (r < 0)
                log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");

        return 0;
}
2041
/* Checks with access(F_OK) whether 'path' exists.
 *
 * Returns 1 if it does, 0 if access() failed with ENOENT, and a negative errno (logged at debug level)
 * for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2054
2055 static int resolved_listening(void) {
2056 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2057 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2058 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2059 int r;
2060
2061 /* Check if resolved is listening */
2062
2063 r = sd_bus_open_system(&bus);
2064 if (r < 0)
2065 return log_debug_errno(r, "Failed to open system bus: %m");
2066
2067 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2068 if (r < 0)
2069 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2070 if (r == 0)
2071 return 0;
2072
2073 r = sd_bus_get_property_string(bus,
2074 "org.freedesktop.resolve1",
2075 "/org/freedesktop/resolve1",
2076 "org.freedesktop.resolve1.Manager",
2077 "DNSStubListener",
2078 &error,
2079 &dns_stub_listener_mode);
2080 if (r < 0)
2081 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2082
2083 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2084 }
2085
2086 static int setup_resolv_conf(const char *dest) {
2087 _cleanup_free_ char *etc = NULL;
2088 const char *where, *what;
2089 ResolvConfMode m;
2090 int r;
2091
2092 assert(dest);
2093
2094 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2095 if (arg_private_network)
2096 m = RESOLV_CONF_OFF;
2097 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2098 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2099 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2100 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2101 else
2102 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2103
2104 } else
2105 m = arg_resolv_conf;
2106
2107 if (m == RESOLV_CONF_OFF)
2108 return 0;
2109
2110 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2111 if (r < 0) {
2112 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2113 return 0;
2114 }
2115
2116 where = strjoina(etc, "/resolv.conf");
2117
2118 if (m == RESOLV_CONF_DELETE) {
2119 if (unlink(where) < 0)
2120 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2121
2122 return 0;
2123 }
2124
2125 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2126 what = PRIVATE_STATIC_RESOLV_CONF;
2127 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2128 what = PRIVATE_UPLINK_RESOLV_CONF;
2129 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2130 what = PRIVATE_STUB_RESOLV_CONF;
2131 else
2132 what = "/etc/resolv.conf";
2133
2134 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2135 _cleanup_free_ char *resolved = NULL;
2136 int found;
2137
2138 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2139 if (found < 0) {
2140 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2141 return 0;
2142 }
2143
2144 if (found == 0) /* missing? */
2145 (void) touch(resolved);
2146
2147 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2148 if (r >= 0)
2149 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2150
2151 /* If that didn't work, let's copy the file */
2152 }
2153
2154 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2155 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2156 else
2157 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
2158 if (r < 0) {
2159 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2160 * resolved or something similar runs inside and the symlink points there.
2161 *
2162 * If the disk image is read-only, there's also no point in complaining.
2163 */
2164 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2165 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2166 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2167 return 0;
2168 }
2169
2170 r = userns_lchown(where, 0, 0);
2171 if (r < 0)
2172 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2173
2174 return 0;
2175 }
2176
2177 static int setup_boot_id(void) {
2178 _cleanup_(unlink_and_freep) char *from = NULL;
2179 _cleanup_free_ char *path = NULL;
2180 sd_id128_t rnd = SD_ID128_NULL;
2181 const char *to;
2182 int r;
2183
2184 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2185
2186 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2187 if (r < 0)
2188 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2189
2190 r = sd_id128_randomize(&rnd);
2191 if (r < 0)
2192 return log_error_errno(r, "Failed to generate random boot id: %m");
2193
2194 r = id128_write(path, ID128_UUID, rnd, false);
2195 if (r < 0)
2196 return log_error_errno(r, "Failed to write boot id: %m");
2197
2198 from = TAKE_PTR(path);
2199 to = "/proc/sys/kernel/random/boot_id";
2200
2201 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2202 if (r < 0)
2203 return r;
2204
2205 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2206 }
2207
2208 static int copy_devnodes(const char *dest) {
2209 static const char devnodes[] =
2210 "null\0"
2211 "zero\0"
2212 "full\0"
2213 "random\0"
2214 "urandom\0"
2215 "tty\0"
2216 "net/tun\0";
2217
2218 const char *d;
2219 int r = 0;
2220
2221 assert(dest);
2222
2223 BLOCK_WITH_UMASK(0000);
2224
2225 /* Create /dev/net, so that we can create /dev/net/tun in it */
2226 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2227 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2228
2229 NULSTR_FOREACH(d, devnodes) {
2230 _cleanup_free_ char *from = NULL, *to = NULL;
2231 struct stat st;
2232
2233 from = path_join("/dev/", d);
2234 if (!from)
2235 return log_oom();
2236
2237 to = path_join(dest, from);
2238 if (!to)
2239 return log_oom();
2240
2241 if (stat(from, &st) < 0) {
2242
2243 if (errno != ENOENT)
2244 return log_error_errno(errno, "Failed to stat %s: %m", from);
2245
2246 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2247 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2248 "%s is not a char or block device, cannot copy.", from);
2249 else {
2250 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2251
2252 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2253 /* Explicitly warn the user when /dev is already populated. */
2254 if (errno == EEXIST)
2255 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2256 if (errno != EPERM)
2257 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2258
2259 /* Some systems abusively restrict mknod but allow bind mounts. */
2260 r = touch(to);
2261 if (r < 0)
2262 return log_error_errno(r, "touch (%s) failed: %m", to);
2263 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2264 if (r < 0)
2265 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2266 }
2267
2268 r = userns_lchown(to, 0, 0);
2269 if (r < 0)
2270 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2271
2272 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2273 if (!dn)
2274 return log_oom();
2275
2276 r = userns_mkdir(dest, dn, 0755, 0, 0);
2277 if (r < 0)
2278 return log_error_errno(r, "Failed to create '%s': %m", dn);
2279
2280 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2281 return log_oom();
2282
2283 prefixed = path_join(dest, sl);
2284 if (!prefixed)
2285 return log_oom();
2286
2287 t = path_join("..", d);
2288 if (!t)
2289 return log_oom();
2290
2291 if (symlink(t, prefixed) < 0)
2292 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2293 }
2294 }
2295
2296 return r;
2297 }
2298
2299 static int make_extra_nodes(const char *dest) {
2300 size_t i;
2301 int r;
2302
2303 BLOCK_WITH_UMASK(0000);
2304
2305 for (i = 0; i < arg_n_extra_nodes; i++) {
2306 _cleanup_free_ char *path = NULL;
2307 DeviceNode *n = arg_extra_nodes + i;
2308
2309 path = path_join(dest, n->path);
2310 if (!path)
2311 return log_oom();
2312
2313 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2314 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2315
2316 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2319 }
2320
2321 return 0;
2322 }
2323
2324 static int setup_pts(const char *dest) {
2325 _cleanup_free_ char *options = NULL;
2326 const char *p;
2327 int r;
2328
2329 #if HAVE_SELINUX
2330 if (arg_selinux_apifs_context)
2331 (void) asprintf(&options,
2332 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2333 arg_uid_shift + TTY_GID,
2334 arg_selinux_apifs_context);
2335 else
2336 #endif
2337 (void) asprintf(&options,
2338 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2339 arg_uid_shift + TTY_GID);
2340
2341 if (!options)
2342 return log_oom();
2343
2344 /* Mount /dev/pts itself */
2345 p = prefix_roota(dest, "/dev/pts");
2346 r = RET_NERRNO(mkdir(p, 0755));
2347 if (r < 0)
2348 return log_error_errno(r, "Failed to create /dev/pts: %m");
2349
2350 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2351 if (r < 0)
2352 return r;
2353 r = userns_lchown(p, 0, 0);
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2356
2357 /* Create /dev/ptmx symlink */
2358 p = prefix_roota(dest, "/dev/ptmx");
2359 if (symlink("pts/ptmx", p) < 0)
2360 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2361 r = userns_lchown(p, 0, 0);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2364
2365 /* And fix /dev/pts/ptmx ownership */
2366 p = prefix_roota(dest, "/dev/pts/ptmx");
2367 r = userns_lchown(p, 0, 0);
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2370
2371 return 0;
2372 }
2373
2374 static int setup_stdio_as_dev_console(void) {
2375 _cleanup_close_ int terminal = -1;
2376 int r;
2377
2378 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2379 * explicitly, if we are configured to. */
2380 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2381 if (terminal < 0)
2382 return log_error_errno(terminal, "Failed to open console: %m");
2383
2384 /* Make sure we can continue logging to the original stderr, even if
2385 * stderr points elsewhere now */
2386 r = log_dup_console();
2387 if (r < 0)
2388 return log_error_errno(r, "Failed to duplicate stderr: %m");
2389
2390 /* invalidates 'terminal' on success and failure */
2391 r = rearrange_stdio(terminal, terminal, terminal);
2392 TAKE_FD(terminal);
2393 if (r < 0)
2394 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2395
2396 return 0;
2397 }
2398
2399 static int setup_dev_console(const char *console) {
2400 _cleanup_free_ char *p = NULL;
2401 int r;
2402
2403 /* Create /dev/console symlink */
2404 r = path_make_relative("/dev", console, &p);
2405 if (r < 0)
2406 return log_error_errno(r, "Failed to create relative path: %m");
2407
2408 if (symlink(p, "/dev/console") < 0)
2409 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2410
2411 return 0;
2412 }
2413
2414 static int setup_keyring(void) {
2415 key_serial_t keyring;
2416
2417 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2418 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2419 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2420 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2421 * into the container. */
2422
2423 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2424 if (keyring == -1) {
2425 if (errno == ENOSYS)
2426 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2427 else if (ERRNO_IS_PRIVILEGE(errno))
2428 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2429 else
2430 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2431 }
2432
2433 return 0;
2434 }
2435
2436 static int setup_credentials(const char *root) {
2437 const char *q;
2438 int r;
2439
2440 if (arg_n_credentials <= 0)
2441 return 0;
2442
2443 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2444 if (r < 0)
2445 return log_error_errno(r, "Failed to create /run/host: %m");
2446
2447 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2448 if (r < 0)
2449 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2450
2451 q = prefix_roota(root, "/run/host/credentials");
2452 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2453 if (r < 0)
2454 return r;
2455
2456 for (size_t i = 0; i < arg_n_credentials; i++) {
2457 _cleanup_free_ char *j = NULL;
2458 _cleanup_close_ int fd = -1;
2459
2460 j = path_join(q, arg_credentials[i].id);
2461 if (!j)
2462 return log_oom();
2463
2464 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2465 if (fd < 0)
2466 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2467
2468 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2471
2472 if (fchmod(fd, 0400) < 0)
2473 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2474
2475 if (arg_userns_mode != USER_NAMESPACE_NO) {
2476 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2477 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2478 }
2479 }
2480
2481 if (chmod(q, 0500) < 0)
2482 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2483
2484 r = userns_lchown(q, 0, 0);
2485 if (r < 0)
2486 return r;
2487
2488 /* Make both mount and superblock read-only now */
2489 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2490 if (r < 0)
2491 return r;
2492
2493 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2494 }
2495
2496 static int setup_kmsg(int kmsg_socket) {
2497 _cleanup_(unlink_and_freep) char *from = NULL;
2498 _cleanup_free_ char *fifo = NULL;
2499 _cleanup_close_ int fd = -1;
2500 int r;
2501
2502 assert(kmsg_socket >= 0);
2503
2504 BLOCK_WITH_UMASK(0000);
2505
2506 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2507 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2508 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2509 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2510
2511 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2512 if (r < 0)
2513 return log_error_errno(r, "Failed to generate kmsg path: %m");
2514
2515 if (mkfifo(fifo, 0600) < 0)
2516 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2517
2518 from = TAKE_PTR(fifo);
2519
2520 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2521 if (r < 0)
2522 return r;
2523
2524 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2525 if (fd < 0)
2526 return log_error_errno(errno, "Failed to open fifo: %m");
2527
2528 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2529 r = send_one_fd(kmsg_socket, fd, 0);
2530 if (r < 0)
2531 return log_error_errno(r, "Failed to send FIFO fd: %m");
2532
2533 return 0;
2534 }
2535
/* State that on_address_change() receives as its userdata pointer. */
struct ExposeArgs {
        union in_addr_union address4; /* handed to expose_port_execute() for AF_INET */
        union in_addr_union address6; /* handed to expose_port_execute() for AF_INET6 */
        struct FirewallContext *fw_ctx; /* its address is handed to expose_port_execute() for both families */
};
2541
2542 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2543 struct ExposeArgs *args = userdata;
2544
2545 assert(rtnl);
2546 assert(m);
2547 assert(args);
2548
2549 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2550 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2551 return 0;
2552 }
2553
2554 static int setup_hostname(void) {
2555 int r;
2556
2557 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2558 return 0;
2559
2560 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2561 if (r < 0)
2562 return log_error_errno(r, "Failed to set hostname: %m");
2563
2564 return 0;
2565 }
2566
2567 static int setup_journal(const char *directory) {
2568 _cleanup_free_ char *d = NULL;
2569 const char *p, *q;
2570 sd_id128_t this_id;
2571 bool try;
2572 int r;
2573
2574 /* Don't link journals in ephemeral mode */
2575 if (arg_ephemeral)
2576 return 0;
2577
2578 if (arg_link_journal == LINK_NO)
2579 return 0;
2580
2581 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2582
2583 r = sd_id128_get_machine(&this_id);
2584 if (r < 0)
2585 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2586
2587 if (sd_id128_equal(arg_uuid, this_id)) {
2588 log_full(try ? LOG_WARNING : LOG_ERR,
2589 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2590 if (try)
2591 return 0;
2592 return -EEXIST;
2593 }
2594
2595 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2596 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2597 if (r < 0) {
2598 bool ignore = r == -EROFS && try;
2599 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2600 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2601 return ignore ? 0 : r;
2602 }
2603 }
2604
2605 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2606 q = prefix_roota(directory, p);
2607
2608 if (path_is_mount_point(p, NULL, 0) > 0) {
2609 if (try)
2610 return 0;
2611
2612 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2613 "%s: already a mount point, refusing to use for journal", p);
2614 }
2615
2616 if (path_is_mount_point(q, NULL, 0) > 0) {
2617 if (try)
2618 return 0;
2619
2620 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2621 "%s: already a mount point, refusing to use for journal", q);
2622 }
2623
2624 r = readlink_and_make_absolute(p, &d);
2625 if (r >= 0) {
2626 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2627 path_equal(d, q)) {
2628
2629 r = userns_mkdir(directory, p, 0755, 0, 0);
2630 if (r < 0)
2631 log_warning_errno(r, "Failed to create directory %s: %m", q);
2632 return 0;
2633 }
2634
2635 if (unlink(p) < 0)
2636 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2637 } else if (r == -EINVAL) {
2638
2639 if (arg_link_journal == LINK_GUEST &&
2640 rmdir(p) < 0) {
2641
2642 if (errno == ENOTDIR) {
2643 log_error("%s already exists and is neither a symlink nor a directory", p);
2644 return r;
2645 } else
2646 return log_error_errno(errno, "Failed to remove %s: %m", p);
2647 }
2648 } else if (r != -ENOENT)
2649 return log_error_errno(r, "readlink(%s) failed: %m", p);
2650
2651 if (arg_link_journal == LINK_GUEST) {
2652
2653 if (symlink(q, p) < 0) {
2654 if (try) {
2655 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2656 return 0;
2657 } else
2658 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2659 }
2660
2661 r = userns_mkdir(directory, p, 0755, 0, 0);
2662 if (r < 0)
2663 log_warning_errno(r, "Failed to create directory %s: %m", q);
2664 return 0;
2665 }
2666
2667 if (arg_link_journal == LINK_HOST) {
2668 /* don't create parents here — if the host doesn't have
2669 * permanent journal set up, don't force it here */
2670
2671 r = RET_NERRNO(mkdir(p, 0755));
2672 if (r < 0 && r != -EEXIST) {
2673 if (try) {
2674 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2675 return 0;
2676 } else
2677 return log_error_errno(r, "Failed to create %s: %m", p);
2678 }
2679
2680 } else if (access(p, F_OK) < 0)
2681 return 0;
2682
2683 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2684 log_warning("%s is not empty, proceeding anyway.", q);
2685
2686 r = userns_mkdir(directory, p, 0755, 0, 0);
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to create %s: %m", q);
2689
2690 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2691 if (r < 0)
2692 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2693
2694 return 0;
2695 }
2696
2697 static int drop_capabilities(uid_t uid) {
2698 CapabilityQuintet q;
2699
2700 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2701 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2702 * arg_caps_retain. */
2703
2704 if (capability_quintet_is_set(&arg_full_capabilities)) {
2705 q = arg_full_capabilities;
2706
2707 if (q.bounding == UINT64_MAX)
2708 q.bounding = uid == 0 ? arg_caps_retain : 0;
2709
2710 if (q.effective == UINT64_MAX)
2711 q.effective = uid == 0 ? q.bounding : 0;
2712
2713 if (q.inheritable == UINT64_MAX)
2714 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2715
2716 if (q.permitted == UINT64_MAX)
2717 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2718
2719 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2720 q.ambient = arg_caps_ambient;
2721
2722 if (capability_quintet_mangle(&q))
2723 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2724
2725 } else {
2726 q = (CapabilityQuintet) {
2727 .bounding = arg_caps_retain,
2728 .effective = uid == 0 ? arg_caps_retain : 0,
2729 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2730 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2731 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2732 };
2733
2734 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2735 * in order to maintain the same behavior as systemd < 242. */
2736 if (capability_quintet_mangle(&q))
2737 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2738 "Some capabilities will not be set because they are not in the current bounding set.");
2739
2740 }
2741
2742 return capability_quintet_enforce(&q);
2743 }
2744
2745 static int reset_audit_loginuid(void) {
2746 _cleanup_free_ char *p = NULL;
2747 int r;
2748
2749 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2750 return 0;
2751
2752 r = read_one_line_file("/proc/self/loginuid", &p);
2753 if (r == -ENOENT)
2754 return 0;
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2757
2758 /* Already reset? */
2759 if (streq(p, "4294967295"))
2760 return 0;
2761
2762 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2763 if (r < 0) {
2764 log_error_errno(r,
2765 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2766 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2767 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2768 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2769 "using systemd-nspawn. Sleeping for 5s... (%m)");
2770
2771 sleep(5);
2772 }
2773
2774 return 0;
2775 }
2776
2777 static int setup_propagate(const char *root) {
2778 const char *p, *q;
2779 int r;
2780
2781 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2782 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2783 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2784 (void) mkdir_p(p, 0600);
2785
2786 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2787 if (r < 0)
2788 return log_error_errno(r, "Failed to create /run/host: %m");
2789
2790 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
2791 if (r < 0)
2792 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
2793
2794 q = prefix_roota(root, "/run/host/incoming");
2795 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2796 if (r < 0)
2797 return r;
2798
2799 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2800 if (r < 0)
2801 return r;
2802
2803 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
2804 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2805 }
2806
2807 static int setup_machine_id(const char *directory) {
2808 const char *etc_machine_id;
2809 sd_id128_t id;
2810 int r;
2811
2812 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2813 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2814 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2815 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2816 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2817 * container behaves nicely). */
2818
2819 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2820
2821 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
2822 if (r < 0) {
2823 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2824 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2825
2826 if (sd_id128_is_null(arg_uuid)) {
2827 r = sd_id128_randomize(&arg_uuid);
2828 if (r < 0)
2829 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2830 }
2831 } else {
2832 if (sd_id128_is_null(id))
2833 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2834 "Machine ID in container image is zero, refusing.");
2835
2836 arg_uuid = id;
2837 }
2838
2839 return 0;
2840 }
2841
2842 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2843 int r;
2844
2845 assert(directory);
2846
2847 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2848 return 0;
2849
2850 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2851 if (r == -EOPNOTSUPP)
2852 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2853 if (r == -EBADE)
2854 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2855 if (r < 0)
2856 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2857 if (r == 0)
2858 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2859 else
2860 log_debug("Patched directory tree to match UID/GID range.");
2861
2862 return r;
2863 }
2864
2865 /*
2866 * Return values:
2867 * < 0 : wait_for_terminate() failed to get the state of the
2868 * container, the container was terminated by a signal, or
2869 * failed for an unknown reason. No change is made to the
2870 * container argument.
2871 * > 0 : The program executed in the container terminated with an
2872 * error. The exit code of the program executed in the
2873 * container is returned. The container argument has been set
2874 * to CONTAINER_TERMINATED.
2875 * 0 : The container is being rebooted, has been shut down or exited
2876 * successfully. The container argument has been set to either
2877 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2878 *
2879 * That is, success is indicated by a return value of zero, and an
2880 * error is indicated by a non-zero value.
2881 */
2882 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2883 siginfo_t status;
2884 int r;
2885
2886 r = wait_for_terminate(pid, &status);
2887 if (r < 0)
2888 return log_warning_errno(r, "Failed to wait for container: %m");
2889
2890 switch (status.si_code) {
2891
2892 case CLD_EXITED:
2893 if (status.si_status == 0)
2894 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2895 else
2896 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2897
2898 *container = CONTAINER_TERMINATED;
2899 return status.si_status;
2900
2901 case CLD_KILLED:
2902 if (status.si_status == SIGINT) {
2903 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2904 *container = CONTAINER_TERMINATED;
2905 return 0;
2906
2907 } else if (status.si_status == SIGHUP) {
2908 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2909 *container = CONTAINER_REBOOTED;
2910 return 0;
2911 }
2912
2913 _fallthrough_;
2914 case CLD_DUMPED:
2915 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2916 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2917
2918 default:
2919 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2920 "Container %s failed due to unknown reason.", arg_machine);
2921 }
2922 }
2923
2924 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2925 pid_t pid;
2926
2927 pid = PTR_TO_PID(userdata);
2928 if (pid > 0) {
2929 if (kill(pid, arg_kill_signal) >= 0) {
2930 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2931 sd_event_source_set_userdata(s, NULL);
2932 return 0;
2933 }
2934 }
2935
2936 sd_event_exit(sd_event_source_get_event(s), 0);
2937 return 0;
2938 }
2939
2940 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2941 pid_t pid;
2942
2943 assert(s);
2944 assert(ssi);
2945
2946 pid = PTR_TO_PID(userdata);
2947
2948 for (;;) {
2949 siginfo_t si = {};
2950
2951 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2952 return log_error_errno(errno, "Failed to waitid(): %m");
2953 if (si.si_pid == 0) /* No pending children. */
2954 break;
2955 if (si.si_pid == pid) {
2956 /* The main process we care for has exited. Return from
2957 * signal handler but leave the zombie. */
2958 sd_event_exit(sd_event_source_get_event(s), 0);
2959 break;
2960 }
2961
2962 /* Reap all other children. */
2963 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2964 }
2965
2966 return 0;
2967 }
2968
2969 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2970 pid_t pid;
2971
2972 assert(m);
2973
2974 pid = PTR_TO_PID(userdata);
2975
2976 if (arg_kill_signal > 0) {
2977 log_info("Container termination requested. Attempting to halt container.");
2978 (void) kill(pid, arg_kill_signal);
2979 } else {
2980 log_info("Container termination requested. Exiting.");
2981 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2982 }
2983
2984 return 0;
2985 }
2986
2987 static int determine_names(void) {
2988 int r;
2989
2990 if (arg_template && !arg_directory && arg_machine) {
2991
2992 /* If --template= was specified then we should not
2993 * search for a machine, but instead create a new one
2994 * in /var/lib/machine. */
2995
2996 arg_directory = path_join("/var/lib/machines", arg_machine);
2997 if (!arg_directory)
2998 return log_oom();
2999 }
3000
3001 if (!arg_image && !arg_directory) {
3002 if (arg_machine) {
3003 _cleanup_(image_unrefp) Image *i = NULL;
3004
3005 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3006 if (r == -ENOENT)
3007 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3008 if (r < 0)
3009 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3010
3011 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3012 r = free_and_strdup(&arg_image, i->path);
3013 else
3014 r = free_and_strdup(&arg_directory, i->path);
3015 if (r < 0)
3016 return log_oom();
3017
3018 if (!arg_ephemeral)
3019 arg_read_only = arg_read_only || i->read_only;
3020 } else {
3021 r = safe_getcwd(&arg_directory);
3022 if (r < 0)
3023 return log_error_errno(r, "Failed to determine current directory: %m");
3024 }
3025
3026 if (!arg_directory && !arg_image)
3027 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3028 }
3029
3030 if (!arg_machine) {
3031 if (arg_directory && path_equal(arg_directory, "/"))
3032 arg_machine = gethostname_malloc();
3033 else if (arg_image) {
3034 char *e;
3035
3036 arg_machine = strdup(basename(arg_image));
3037
3038 /* Truncate suffix if there is one */
3039 e = endswith(arg_machine, ".raw");
3040 if (e)
3041 *e = 0;
3042 } else
3043 arg_machine = strdup(basename(arg_directory));
3044 if (!arg_machine)
3045 return log_oom();
3046
3047 hostname_cleanup(arg_machine);
3048 if (!hostname_is_valid(arg_machine, 0))
3049 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3050
3051 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3052 * to match fixed config file names. */
3053 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3054 if (!arg_settings_filename)
3055 return log_oom();
3056
3057 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3058 * instances at once without manually having to specify -M each time. */
3059 if (arg_ephemeral)
3060 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3061 return log_oom();
3062 } else {
3063 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3064 if (!arg_settings_filename)
3065 return log_oom();
3066 }
3067
3068 return 0;
3069 }
3070
/* Resolves the path stored in '*p' via chase_symlinks() (with the specified flags and no root
 * directory), and replaces '*p' with the resolved path, freeing the old string.
 *
 * An unset path ('*p' == NULL) is left alone and reported as success. Returns a negative errno-style
 * value if the path cannot be resolved. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
3086
3087 static int determine_uid_shift(const char *directory) {
3088
3089 if (arg_userns_mode == USER_NAMESPACE_NO) {
3090 arg_uid_shift = 0;
3091 return 0;
3092 }
3093
3094 if (arg_uid_shift == UID_INVALID) {
3095 struct stat st;
3096
3097 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3098
3099 if (stat(directory, &st) < 0)
3100 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3101
3102 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3103
3104 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3105 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3106 "UID and GID base of %s don't match.", directory);
3107
3108 arg_uid_range = UINT32_C(0x10000);
3109
3110 if (arg_uid_shift != 0) {
3111 /* If the image is shifted already, then we'll fall back to classic chowning, for
3112 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3113
3114 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3115 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3116 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3117 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3118 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3119 "UID base of %s is not zero, UID mapping not supported.", directory);
3120 }
3121 }
3122
3123 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3124 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3125
3126 return 0;
3127 }
3128
3129 static unsigned long effective_clone_ns_flags(void) {
3130 unsigned long flags = arg_clone_ns_flags;
3131
3132 if (arg_private_network)
3133 flags |= CLONE_NEWNET;
3134 if (arg_use_cgns)
3135 flags |= CLONE_NEWCGROUP;
3136 if (arg_userns_mode != USER_NAMESPACE_NO)
3137 flags |= CLONE_NEWUSER;
3138
3139 return flags;
3140 }
3141
3142 static int patch_sysctl(void) {
3143
3144 /* This table is inspired by runc's sysctl() function */
3145 static const struct {
3146 const char *key;
3147 bool prefix;
3148 unsigned long clone_flags;
3149 } safe_sysctl[] = {
3150 { "kernel.hostname", false, CLONE_NEWUTS },
3151 { "kernel.domainname", false, CLONE_NEWUTS },
3152 { "kernel.msgmax", false, CLONE_NEWIPC },
3153 { "kernel.msgmnb", false, CLONE_NEWIPC },
3154 { "kernel.msgmni", false, CLONE_NEWIPC },
3155 { "kernel.sem", false, CLONE_NEWIPC },
3156 { "kernel.shmall", false, CLONE_NEWIPC },
3157 { "kernel.shmmax", false, CLONE_NEWIPC },
3158 { "kernel.shmmni", false, CLONE_NEWIPC },
3159 { "fs.mqueue.", true, CLONE_NEWIPC },
3160 { "net.", true, CLONE_NEWNET },
3161 };
3162
3163 unsigned long flags;
3164 int r;
3165
3166 flags = effective_clone_ns_flags();
3167
3168 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3169 bool good = false;
3170 size_t i;
3171
3172 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3173
3174 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3175 continue;
3176
3177 if (safe_sysctl[i].prefix)
3178 good = startswith(*k, safe_sysctl[i].key);
3179 else
3180 good = streq(*k, safe_sysctl[i].key);
3181
3182 if (good)
3183 break;
3184 }
3185
3186 if (!good)
3187 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3188
3189 r = sysctl_write(*k, *v);
3190 if (r < 0)
3191 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3192 }
3193
3194 return 0;
3195 }
3196
/* Body of the "inner" child process: finishes setting up the container environment from the inside
 * and finally executes the payload (init system, user-specified command, or a login shell).
 *
 * Parameters, as used below:
 *   barrier            - synchronization with the parent; the sync points are numbered #1 to #5
 *   directory          - only checked to be non-NULL here
 *   secondary          - if true (and no explicit personality is configured) PER_LINUX32 is selected
 *   kmsg_socket        - handed to setup_kmsg(), closed afterwards
 *   rtnl_socket        - handed to expose_port_send_rtnl() if ports are exposed, closed afterwards
 *   master_pty_socket  - used to send the master side of the freshly allocated console pty
 *   fds                - file descriptors to keep open for the payload (announced via $LISTEN_FDS)
 *   os_release_pairs   - extra environment assignments merged into the payload's environment
 *
 * The exec*() calls at the end do not return on success; hence whenever this function returns, it
 * returns a negative errno-style value describing what went wrong. */
static int inner_child(
                Barrier *barrier,
                const char *directory,
                bool secondary,
                int kmsg_socket,
                int rtnl_socket,
                int master_pty_socket,
                FDSet *fds,
                char **os_release_pairs) {

        _cleanup_free_ char *home = NULL;
        /* Slot 0 of envp[] is pre-filled with $PATH, n_env is the index of the next free slot. */
        size_t n_env = 1;
        char *envp[] = {
                (char*) "PATH=" DEFAULT_PATH_COMPAT,
                NULL, /* container */
                NULL, /* TERM */
                NULL, /* HOME */
                NULL, /* USER */
                NULL, /* LOGNAME */
                NULL, /* container_uuid */
                NULL, /* LISTEN_FDS */
                NULL, /* LISTEN_PID */
                NULL, /* NOTIFY_SOCKET */
                NULL, /* CREDENTIALS_DIRECTORY */
                NULL, /* LANG */
                NULL
        };
        const char *exec_target;
        _cleanup_strv_free_ char **env_use = NULL;
        int r, which_failed;

        /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
         * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
         * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
         * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
         * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
         * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
         * namespace.
         *
         * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
         * unshare(). See below. */

        assert(barrier);
        assert(directory);
        assert(kmsg_socket >= 0);

        log_debug("Inner child is initializing.");

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Tell the parent, that it now can write the UID map. */
                (void) barrier_place(barrier); /* #1 */

                /* Wait until the parent wrote the UID map */
                if (!barrier_place_and_sync(barrier)) /* #2 */
                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");

                /* Become the new root user inside our namespace */
                r = reset_uid_gid();
                if (r < 0)
                        return log_error_errno(r, "Couldn't become new root: %m");

                /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
                 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
                 * propagation, but simply create new peer groups for all our mounts). */
                r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
                if (r < 0)
                        return r;
        }

        r = mount_all(NULL,
                      arg_mount_settings | MOUNT_IN_USERNS,
                      arg_uid_shift,
                      arg_selinux_apifs_context);
        if (r < 0)
                return r;

        /* Only create a network namespace of our own if no existing one was specified by path. */
        if (!arg_network_namespace_path && arg_private_network) {
                r = unshare(CLONE_NEWNET);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare network namespace: %m");

                /* Tell the parent that it can setup network interfaces. */
                (void) barrier_place(barrier); /* #3 */
        }

        r = mount_sysfs(NULL, arg_mount_settings);
        if (r < 0)
                return r;

        /* Wait until we are cgroup-ified, so that we
         * can mount the right cgroup path writable */
        if (!barrier_place_and_sync(barrier)) /* #4 */
                return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
                                       "Parent died too early");

        if (arg_use_cgns) {
                r = unshare(CLONE_NEWCGROUP);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
                r = mount_cgroups(
                                "",
                                arg_unified_cgroup_hierarchy,
                                arg_userns_mode != USER_NAMESPACE_NO,
                                arg_uid_shift,
                                arg_uid_range,
                                arg_selinux_apifs_context,
                                true);
        } else
                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
        /* Covers the result of either branch above. */
        if (r < 0)
                return r;

        r = setup_boot_id();
        if (r < 0)
                return r;

        r = setup_kmsg(kmsg_socket);
        if (r < 0)
                return r;
        kmsg_socket = safe_close(kmsg_socket);

        r = mount_custom(
                        "/",
                        arg_custom_mounts,
                        arg_n_custom_mounts,
                        0,
                        0,
                        arg_selinux_apifs_context,
                        MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
        if (r < 0)
                return r;

        if (setsid() < 0)
                return log_error_errno(errno, "setsid() failed: %m");

        /* Best effort: a failure to bring up the loopback device is ignored. */
        if (arg_private_network)
                (void) loopback_setup();

        if (arg_expose_ports) {
                r = expose_port_send_rtnl(rtnl_socket);
                if (r < 0)
                        return r;
                rtnl_socket = safe_close(rtnl_socket);
        }

        if (arg_console_mode != CONSOLE_PIPE) {
                _cleanup_close_ int master = -1;
                _cleanup_free_ char *console = NULL;

                /* Allocate a pty and make it available as /dev/console. */
                master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
                if (master < 0)
                        return log_error_errno(master, "Failed to allocate a pty: %m");

                r = setup_dev_console(console);
                if (r < 0)
                        return log_error_errno(r, "Failed to set up /dev/console: %m");

                /* Pass the master side of the pty over the socket; our own copy of the fd is closed
                 * when leaving this scope. */
                r = send_one_fd(master_pty_socket, master, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to send master fd: %m");
                master_pty_socket = safe_close(master_pty_socket);

                r = setup_stdio_as_dev_console();
                if (r < 0)
                        return r;
        }

        r = patch_sysctl();
        if (r < 0)
                return r;

        if (arg_oom_score_adjust_set) {
                r = set_oom_score_adjust(arg_oom_score_adjust);
                if (r < 0)
                        return log_error_errno(r, "Failed to adjust OOM score: %m");
        }

        if (arg_cpu_set.set)
                if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
                        return log_error_errno(errno, "Failed to set CPU affinity: %m");

        /* Best effort: a failure to set the hostname is ignored. */
        (void) setup_hostname();

        /* An explicitly configured personality takes precedence over the one implied by 'secondary'. */
        if (arg_personality != PERSONALITY_INVALID) {
                r = safe_personality(arg_personality);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        } else if (secondary) {
                r = safe_personality(PER_LINUX32);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        }

        r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
        if (r < 0)
                return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));

        /* Note the preprocessor structure here: with seccomp support compiled in, a filter supplied in
         * arg_seccomp is loaded as is, and only otherwise the braced block below runs. Without seccomp
         * support compiled in, the braced block is all that remains and always runs. */
#if HAVE_SECCOMP
        if (arg_seccomp) {

                if (is_seccomp_available()) {

                        r = seccomp_load(arg_seccomp);
                        if (ERRNO_IS_SECCOMP_FATAL(r))
                                return log_error_errno(r, "Failed to install seccomp filter: %m");
                        if (r < 0)
                                log_debug_errno(r, "Failed to install seccomp filter: %m");
                }
        } else
#endif
        {
                r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
                if (r < 0)
                        return r;
        }

        if (arg_suppress_sync) {
#if HAVE_SECCOMP
                r = seccomp_suppress_sync();
                if (r < 0)
                        log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
#else
                log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
#endif
        }

#if HAVE_SELINUX
        if (arg_selinux_context)
                if (setexeccon(arg_selinux_context) < 0)
                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif

        /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
         * if we need to later on. */
        if (prctl(PR_SET_KEEPCAPS, 1) < 0)
                return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");

        /* Numeric UID/GID settings take precedence over a user name; only the latter path fills in
         * 'home'. */
        if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
                r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
        else
                r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
        if (r < 0)
                return r;

        r = drop_capabilities(getuid());
        if (r < 0)
                return log_error_errno(r, "Dropping capabilities failed: %m");

        if (arg_no_new_privileges)
                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                        return log_error_errno(errno, "Failed to disable new privileges: %m");

        /* From here on the payload's environment block is assembled in envp[], each entry taking the
         * next free slot. */

        /* LXC sets container=lxc, so follow the scheme here */
        envp[n_env++] = strjoina("container=", arg_container_service_name);

        /* Pass on our own $TERM, if we have one. */
        envp[n_env] = strv_find_prefix(environ, "TERM=");
        if (envp[n_env])
                n_env++;

        if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
                if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
                        return log_oom();

        if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
                if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
                    asprintf(envp + n_env++, "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
                        return log_oom();

        assert(!sd_id128_is_null(arg_uuid));

        if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
                return log_oom();

        if (fdset_size(fds) > 0) {
                /* The passed file descriptors shall survive the exec*() below. */
                r = fdset_cloexec(fds, false);
                if (r < 0)
                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");

                if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
                    (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
                        return log_oom();
        }
        if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                return log_oom();

        if (arg_n_credentials > 0) {
                envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
                if (!envp[n_env])
                        return log_oom();
                n_env++;
        }

        if (arg_start_mode != START_BOOT) {
                /* If we're running a command in the container, let's default to the C.UTF-8 locale as it's
                 * part of glibc these days and was backported to most distros a long time before it got
                 * added to upstream glibc. */
                envp[n_env] = strdup("LANG=C.UTF-8");
                if (!envp[n_env])
                        return log_oom();
                n_env++;
        }

        /* Combine our own assignments with the passed os-release pairs and the configured extra
         * environment (arg_setenv). */
        env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
        if (!env_use)
                return log_oom();

        /* Let the parent know that we are ready and
         * wait until the parent is ready with the
         * setup, too... */
        if (!barrier_place_and_sync(barrier)) /* #5 */
                return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");

        if (arg_chdir)
                if (chdir(arg_chdir) < 0)
                        return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);

        if (arg_start_mode == START_PID2) {
                r = stub_pid1(arg_uuid);
                if (r < 0)
                        return r;
        }

        if (arg_console_mode != CONSOLE_PIPE) {
                /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
                 * are configured for that. Acquire it as controlling tty. */
                if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
                        return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
        }

        log_debug("Inner child completed, invoking payload.");

        /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
         * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
         * it again. Note that the other fds closed here are at least the locking and barrier fds. */
        log_close();
        log_set_open_when_needed(true);

        (void) fdset_close_others(fds);

        /* Each exec*() call below only returns if it failed; in that case the next candidate (if there
         * is one) is tried, and ultimately we fall through to the error path at the very end. */
        if (arg_start_mode == START_BOOT) {
                char **a;
                size_t m;

                /* Automatically search for the init system */

                /* Build the argument vector: a[0] is filled in per candidate below, followed by the
                 * configured parameters and the terminating NULL. */
                m = strv_length(arg_parameters);
                a = newa(char*, m + 2);
                memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
                a[1 + m] = NULL;

                FOREACH_STRING(init,
                               "/usr/lib/systemd/systemd",
                               "/lib/systemd/systemd",
                               "/sbin/init") {
                        a[0] = (char*) init;
                        execve(a[0], a, env_use);
                }

                exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
        } else if (!strv_isempty(arg_parameters)) {
                const char *dollar_path;

                exec_target = arg_parameters[0];

                /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
                 * binary. */
                dollar_path = strv_env_get(env_use, "PATH");
                if (dollar_path) {
                        if (setenv("PATH", dollar_path, 1) < 0)
                                return log_error_errno(errno, "Failed to update $PATH: %m");
                }

                execvpe(arg_parameters[0], arg_parameters, env_use);
        } else {
                if (!arg_chdir)
                        /* If we cannot change the directory, we'll end up in /, that is expected. */
                        (void) chdir(home ?: "/root");

                /* No command specified: start a login shell (note the leading dash in argv[0]), trying
                 * the default shell first, then /bin/bash and /bin/sh unless already tried. */
                execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
                if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
                        execle("/bin/bash", "-bash", NULL, env_use);
                if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
                        execle("/bin/sh", "-sh", NULL, env_use);

                exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
        }

        /* Only reached if none of the exec*() attempts above succeeded. */
        return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
}
3587
3588 static int setup_notify_child(void) {
3589 _cleanup_close_ int fd = -1;
3590 static const union sockaddr_union sa = {
3591 .un.sun_family = AF_UNIX,
3592 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3593 };
3594 int r;
3595
3596 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3597 if (fd < 0)
3598 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3599
3600 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3601 (void) sockaddr_un_unlink(&sa.un);
3602
3603 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3604 if (r < 0)
3605 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3606
3607 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3608 if (r < 0)
3609 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3610
3611 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3612 if (r < 0)
3613 return log_error_errno(r, "SO_PASSCRED failed: %m");
3614
3615 return TAKE_FD(fd);
3616 }
3617
3618 static int outer_child(
3619 Barrier *barrier,
3620 const char *directory,
3621 DissectedImage *dissected_image,
3622 bool secondary,
3623 int pid_socket,
3624 int uuid_socket,
3625 int notify_socket,
3626 int kmsg_socket,
3627 int rtnl_socket,
3628 int uid_shift_socket,
3629 int master_pty_socket,
3630 int unified_cgroup_hierarchy_socket,
3631 FDSet *fds,
3632 int netns_fd) {
3633
3634 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3635 _cleanup_strv_free_ char **os_release_pairs = NULL;
3636 _cleanup_close_ int fd = -1;
3637 bool idmap = false;
3638 const char *p;
3639 pid_t pid;
3640 ssize_t l;
3641 int r;
3642
3643 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3644 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3645 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3646 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3647 * forked off it, and it exits. */
3648
3649 assert(barrier);
3650 assert(directory);
3651 assert(pid_socket >= 0);
3652 assert(uuid_socket >= 0);
3653 assert(notify_socket >= 0);
3654 assert(master_pty_socket >= 0);
3655 assert(kmsg_socket >= 0);
3656
3657 log_debug("Outer child is initializing.");
3658
3659 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3660 if (r < 0)
3661 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3662
3663 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3664 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3665
3666 r = reset_audit_loginuid();
3667 if (r < 0)
3668 return r;
3669
3670 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3671 * mounts to the real root. */
3672 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3673 if (r < 0)
3674 return r;
3675
3676 if (dissected_image) {
3677 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3678 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3679 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3680 * right place right away. This makes sure ESP partitions and userns are compatible. */
3681
3682 r = dissected_image_mount_and_warn(
3683 dissected_image,
3684 directory,
3685 arg_uid_shift,
3686 arg_uid_range,
3687 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3688 DISSECT_IMAGE_DISCARD_ON_LOOP|
3689 DISSECT_IMAGE_USR_NO_ROOT|
3690 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3691 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3692 if (r < 0)
3693 return r;
3694 }
3695
3696 r = determine_uid_shift(directory);
3697 if (r < 0)
3698 return r;
3699
3700 if (arg_userns_mode != USER_NAMESPACE_NO) {
3701 /* Let the parent know which UID shift we read from the image */
3702 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3703 if (l < 0)
3704 return log_error_errno(errno, "Failed to send UID shift: %m");
3705 if (l != sizeof(arg_uid_shift))
3706 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3707 "Short write while sending UID shift.");
3708
3709 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3710 /* When we are supposed to pick the UID shift, the parent will check now whether the
3711 * UID shift we just read from the image is available. If yes, it will send the UID
3712 * shift back to us, if not it will pick a different one, and send it back to us. */
3713
3714 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3715 if (l < 0)
3716 return log_error_errno(errno, "Failed to recv UID shift: %m");
3717 if (l != sizeof(arg_uid_shift))
3718 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3719 "Short read while receiving UID shift.");
3720 }
3721
3722 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3723 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3724 }
3725
3726 if (path_equal(directory, "/")) {
3727 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3728 * place, so that we can make changes to its mount structure (for example, to implement
3729 * --volatile=) without this interfering with our ability to access files such as
3730 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3731 * (instead of a temporary directory, since we are living in our own mount namspace here
3732 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3733 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3734
3735 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3736 if (r < 0)
3737 return r;
3738
3739 directory = "/run/systemd/nspawn-root";
3740 }
3741
3742 r = setup_pivot_root(
3743 directory,
3744 arg_pivot_root_new,
3745 arg_pivot_root_old);
3746 if (r < 0)
3747 return r;
3748
3749 r = setup_volatile_mode(
3750 directory,
3751 arg_volatile_mode,
3752 arg_uid_shift,
3753 arg_selinux_apifs_context);
3754 if (r < 0)
3755 return r;
3756
3757 r = bind_user_prepare(
3758 directory,
3759 arg_bind_user,
3760 arg_uid_shift,
3761 arg_uid_range,
3762 &arg_custom_mounts, &arg_n_custom_mounts,
3763 &bind_user_context);
3764 if (r < 0)
3765 return r;
3766
3767 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3768 /* Send the user maps we determined to the parent, so that it installs it in our user
3769 * namespace UID map table */
3770
3771 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3772 uid_t map[] = {
3773 bind_user_context->data[i].payload_user->uid,
3774 bind_user_context->data[i].host_user->uid,
3775 (uid_t) bind_user_context->data[i].payload_group->gid,
3776 (uid_t) bind_user_context->data[i].host_group->gid,
3777 };
3778
3779 l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
3780 if (l < 0)
3781 return log_error_errno(errno, "Failed to send user UID map: %m");
3782 if (l != sizeof(map))
3783 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3784 "Short write while sending user UID map.");
3785 }
3786 }
3787
3788 r = mount_custom(
3789 directory,
3790 arg_custom_mounts,
3791 arg_n_custom_mounts,
3792 arg_uid_shift,
3793 arg_uid_range,
3794 arg_selinux_apifs_context,
3795 MOUNT_ROOT_ONLY);
3796 if (r < 0)
3797 return r;
3798
3799 /* Make sure we always have a mount that we can move to root later on. */
3800 r = make_mount_point(directory);
3801 if (r < 0)
3802 return r;
3803
3804 if (arg_userns_mode != USER_NAMESPACE_NO &&
3805 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3806 arg_uid_shift != 0) {
3807
3808 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
3809 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3810 /* This might fail because the kernel or file system doesn't support idmapping. We
3811 * can't really distinguish this nicely, nor do we have any guarantees about the
3812 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3813 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3814 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3815 "ID mapped mounts are apparently not available, sorry.");
3816
3817 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3818 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3819 } else if (r < 0)
3820 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3821 else {
3822 log_debug("ID mapped mounts available, making use of them.");
3823 idmap = true;
3824 }
3825 }
3826
3827 if (dissected_image) {
3828 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3829 r = dissected_image_mount(
3830 dissected_image,
3831 directory,
3832 arg_uid_shift,
3833 arg_uid_range,
3834 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3835 DISSECT_IMAGE_DISCARD_ON_LOOP|
3836 DISSECT_IMAGE_USR_NO_ROOT|
3837 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3838 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3839 if (r == -EUCLEAN)
3840 return log_error_errno(r, "File system check for image failed: %m");
3841 if (r < 0)
3842 return log_error_errno(r, "Failed to mount image file system: %m");
3843 }
3844
3845 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3846 /* OK, we don't know which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3847
3848 r = detect_unified_cgroup_hierarchy_from_image(directory);
3849 if (r < 0)
3850 return r;
3851
3852 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3853 if (l < 0)
3854 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3855 if (l != sizeof(arg_unified_cgroup_hierarchy))
3856 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3857 "Short write while sending cgroup mode.");
3858
3859 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3860 }
3861
3862 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3863 * mounts available in systemd services inside the container that create a new mount namespace. See
3864 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3865 * will inherit the shared propagation mode.
3866 *
3867 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3868 * directory mount to root later on.
3869 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3870 */
3871 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3872 if (r < 0)
3873 return r;
3874
3875 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3876 if (r < 0)
3877 return r;
3878
3879 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3880 if (r < 0)
3881 return r;
3882
3883 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3884 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3885 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3886 if (r < 0)
3887 return log_error_errno(r, "Failed to make tree read-only: %m");
3888 }
3889
3890 r = mount_all(directory,
3891 arg_mount_settings,
3892 arg_uid_shift,
3893 arg_selinux_apifs_context);
3894 if (r < 0)
3895 return r;
3896
3897 r = copy_devnodes(directory);
3898 if (r < 0)
3899 return r;
3900
3901 r = make_extra_nodes(directory);
3902 if (r < 0)
3903 return r;
3904
3905 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3906
3907 p = prefix_roota(directory, "/run/host");
3908 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3909
3910 r = setup_pts(directory);
3911 if (r < 0)
3912 return r;
3913
3914 r = setup_propagate(directory);
3915 if (r < 0)
3916 return r;
3917
3918 r = setup_keyring();
3919 if (r < 0)
3920 return r;
3921
3922 r = setup_credentials(directory);
3923 if (r < 0)
3924 return r;
3925
3926 r = bind_user_setup(bind_user_context, directory);
3927 if (r < 0)
3928 return r;
3929
3930 r = mount_custom(
3931 directory,
3932 arg_custom_mounts,
3933 arg_n_custom_mounts,
3934 arg_uid_shift,
3935 arg_uid_range,
3936 arg_selinux_apifs_context,
3937 MOUNT_NON_ROOT_ONLY);
3938 if (r < 0)
3939 return r;
3940
3941 r = setup_timezone(directory);
3942 if (r < 0)
3943 return r;
3944
3945 r = setup_resolv_conf(directory);
3946 if (r < 0)
3947 return r;
3948
3949 r = setup_machine_id(directory);
3950 if (r < 0)
3951 return r;
3952
3953 r = setup_journal(directory);
3954 if (r < 0)
3955 return r;
3956
3957 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3958 p = prefix_roota(directory, "/run/host/container-manager");
3959 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3960
3961 /* The same stuff as the $container_uuid env var */
3962 p = prefix_roota(directory, "/run/host/container-uuid");
3963 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3964
3965 if (!arg_use_cgns) {
3966 r = mount_cgroups(
3967 directory,
3968 arg_unified_cgroup_hierarchy,
3969 arg_userns_mode != USER_NAMESPACE_NO,
3970 arg_uid_shift,
3971 arg_uid_range,
3972 arg_selinux_apifs_context,
3973 false);
3974 if (r < 0)
3975 return r;
3976 }
3977
3978 r = mount_move_root(directory);
3979 if (r < 0)
3980 return log_error_errno(r, "Failed to move root directory: %m");
3981
3982 fd = setup_notify_child();
3983 if (fd < 0)
3984 return fd;
3985
3986 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3987 arg_clone_ns_flags |
3988 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3989 if (pid < 0)
3990 return log_error_errno(errno, "Failed to fork inner child: %m");
3991 if (pid == 0) {
3992 pid_socket = safe_close(pid_socket);
3993 uuid_socket = safe_close(uuid_socket);
3994 notify_socket = safe_close(notify_socket);
3995 uid_shift_socket = safe_close(uid_shift_socket);
3996
3997 /* The inner child has all namespaces that are requested, so that we all are owned by the
3998 * user if user namespaces are turned on. */
3999
4000 if (arg_network_namespace_path) {
4001 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4002 if (r < 0)
4003 return log_error_errno(r, "Failed to join network namespace: %m");
4004 }
4005
4006 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
4007 if (r < 0)
4008 _exit(EXIT_FAILURE);
4009
4010 _exit(EXIT_SUCCESS);
4011 }
4012
4013 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4014 if (l < 0)
4015 return log_error_errno(errno, "Failed to send PID: %m");
4016 if (l != sizeof(pid))
4017 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4018 "Short write while sending PID.");
4019
4020 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4021 if (l < 0)
4022 return log_error_errno(errno, "Failed to send machine ID: %m");
4023 if (l != sizeof(arg_uuid))
4024 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4025 "Short write while sending machine ID.");
4026
4027 l = send_one_fd(notify_socket, fd, 0);
4028 if (l < 0)
4029 return log_error_errno(l, "Failed to send notify fd: %m");
4030
4031 pid_socket = safe_close(pid_socket);
4032 uuid_socket = safe_close(uuid_socket);
4033 notify_socket = safe_close(notify_socket);
4034 master_pty_socket = safe_close(master_pty_socket);
4035 kmsg_socket = safe_close(kmsg_socket);
4036 rtnl_socket = safe_close(rtnl_socket);
4037 netns_fd = safe_close(netns_fd);
4038
4039 return 0;
4040 }
4041
4042 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4043 bool tried_hashed = false;
4044 unsigned n_tries = 100;
4045 uid_t candidate;
4046 int r;
4047
4048 assert(shift);
4049 assert(ret_lock_file);
4050 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4051 assert(arg_uid_range == 0x10000U);
4052
4053 candidate = *shift;
4054
4055 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4056
4057 for (;;) {
4058 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4059 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4060
4061 if (--n_tries <= 0)
4062 return -EBUSY;
4063
4064 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4065 goto next;
4066 if ((candidate & UINT32_C(0xFFFF)) != 0)
4067 goto next;
4068
4069 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4070 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4071 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4072 goto next;
4073 if (r < 0)
4074 return r;
4075
4076 /* Make some superficial checks whether the range is currently known in the user database */
4077 if (getpwuid(candidate))
4078 goto next;
4079 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4080 goto next;
4081 if (getgrgid(candidate))
4082 goto next;
4083 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4084 goto next;
4085
4086 *ret_lock_file = lf;
4087 lf = (struct LockFile) LOCK_FILE_INIT;
4088 *shift = candidate;
4089 return 0;
4090
4091 next:
4092 if (arg_machine && !tried_hashed) {
4093 /* Try to hash the base from the container name */
4094
4095 static const uint8_t hash_key[] = {
4096 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4097 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4098 };
4099
4100 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4101
4102 tried_hashed = true;
4103 } else
4104 random_bytes(&candidate, sizeof(candidate));
4105
4106 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4107 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4108 }
4109 }
4110
4111 static int add_one_uid_map(
4112 char **p,
4113 uid_t container_uid,
4114 uid_t host_uid,
4115 uid_t range) {
4116
4117 return strextendf(p,
4118 UID_FMT " " UID_FMT " " UID_FMT "\n",
4119 container_uid, host_uid, range);
4120 }
4121
4122 static int make_uid_map_string(
4123 const uid_t bind_user_uid[],
4124 size_t n_bind_user_uid,
4125 size_t offset,
4126 char **ret) {
4127
4128 _cleanup_free_ char *s = NULL;
4129 uid_t previous_uid = 0;
4130 int r;
4131
4132 assert(n_bind_user_uid == 0 || bind_user_uid);
4133 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4134 assert(ret);
4135
4136 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4137 * quadruplet, consisting of host and container UID + GID. */
4138
4139 for (size_t i = 0; i < n_bind_user_uid; i++) {
4140 uid_t payload_uid = bind_user_uid[i*2+offset],
4141 host_uid = bind_user_uid[i*2+offset+1];
4142
4143 assert(previous_uid <= payload_uid);
4144 assert(payload_uid < arg_uid_range);
4145
4146 /* Add a range to close the gap to previous entry */
4147 if (payload_uid > previous_uid) {
4148 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4149 if (r < 0)
4150 return r;
4151 }
4152
4153 /* Map this specific user */
4154 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4155 if (r < 0)
4156 return r;
4157
4158 previous_uid = payload_uid + 1;
4159 }
4160
4161 /* And add a range to close the gap to finish the range */
4162 if (arg_uid_range > previous_uid) {
4163 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4164 if (r < 0)
4165 return r;
4166 }
4167
4168 assert(s);
4169
4170 *ret = TAKE_PTR(s);
4171 return 0;
4172 }
4173
4174 static int setup_uid_map(
4175 pid_t pid,
4176 const uid_t bind_user_uid[],
4177 size_t n_bind_user_uid) {
4178
4179 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4180 _cleanup_free_ char *s = NULL;
4181 int r;
4182
4183 assert(pid > 1);
4184
4185 /* Build the UID map string */
4186 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4187 return log_oom();
4188
4189 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4190 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4191 if (r < 0)
4192 return log_error_errno(r, "Failed to write UID map: %m");
4193
4194 /* And now build the GID map string */
4195 s = mfree(s);
4196 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4197 return log_oom();
4198
4199 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4200 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4201 if (r < 0)
4202 return log_error_errno(r, "Failed to write GID map: %m");
4203
4204 return 0;
4205 }
4206
4207 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4208 char buf[NOTIFY_BUFFER_MAX+1];
4209 char *p = NULL;
4210 struct iovec iovec = {
4211 .iov_base = buf,
4212 .iov_len = sizeof(buf)-1,
4213 };
4214 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4215 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4216 struct msghdr msghdr = {
4217 .msg_iov = &iovec,
4218 .msg_iovlen = 1,
4219 .msg_control = &control,
4220 .msg_controllen = sizeof(control),
4221 };
4222 struct ucred *ucred;
4223 ssize_t n;
4224 pid_t inner_child_pid;
4225 _cleanup_strv_free_ char **tags = NULL;
4226 int r;
4227
4228 assert(userdata);
4229
4230 inner_child_pid = PTR_TO_PID(userdata);
4231
4232 if (revents != EPOLLIN) {
4233 log_warning("Got unexpected poll event for notify fd.");
4234 return 0;
4235 }
4236
4237 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4238 if (n < 0) {
4239 if (ERRNO_IS_TRANSIENT(n))
4240 return 0;
4241 if (n == -EXFULL) {
4242 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4243 return 0;
4244 }
4245 return log_warning_errno(n, "Couldn't read notification socket: %m");
4246 }
4247
4248 cmsg_close_all(&msghdr);
4249
4250 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4251 if (!ucred || ucred->pid != inner_child_pid) {
4252 log_debug("Received notify message without valid credentials. Ignoring.");
4253 return 0;
4254 }
4255
4256 if ((size_t) n >= sizeof(buf)) {
4257 log_warning("Received notify message exceeded maximum size. Ignoring.");
4258 return 0;
4259 }
4260
4261 buf[n] = 0;
4262 tags = strv_split(buf, "\n\r");
4263 if (!tags)
4264 return log_oom();
4265
4266 if (strv_contains(tags, "READY=1")) {
4267 r = sd_notify(false, "READY=1\n");
4268 if (r < 0)
4269 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4270 }
4271
4272 p = strv_find_startswith(tags, "STATUS=");
4273 if (p)
4274 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4275
4276 return 0;
4277 }
4278
4279 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4280 int r;
4281
4282 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4283 if (r < 0)
4284 return log_error_errno(r, "Failed to allocate notify event source: %m");
4285
4286 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4287
4288 return 0;
4289 }
4290
4291 static int merge_settings(Settings *settings, const char *path) {
4292 int rl;
4293
4294 assert(settings);
4295 assert(path);
4296
4297 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4298 * that this steals the fields of the Settings* structure, and hence modifies it. */
4299
4300 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4301 settings->start_mode >= 0) {
4302 arg_start_mode = settings->start_mode;
4303 strv_free_and_replace(arg_parameters, settings->parameters);
4304 }
4305
4306 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4307 settings->ephemeral >= 0)
4308 arg_ephemeral = settings->ephemeral;
4309
4310 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4311 settings->root) {
4312
4313 if (!arg_settings_trusted)
4314 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4315 else
4316 free_and_replace(arg_directory, settings->root);
4317 }
4318
4319 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4320 settings->pivot_root_new) {
4321 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4322 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4323 }
4324
4325 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4326 settings->working_directory)
4327 free_and_replace(arg_chdir, settings->working_directory);
4328
4329 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4330 settings->environment)
4331 strv_free_and_replace(arg_setenv, settings->environment);
4332
4333 if ((arg_settings_mask & SETTING_USER) == 0) {
4334
4335 if (settings->user)
4336 free_and_replace(arg_user, settings->user);
4337
4338 if (uid_is_valid(settings->uid))
4339 arg_uid = settings->uid;
4340 if (gid_is_valid(settings->gid))
4341 arg_gid = settings->gid;
4342 if (settings->n_supplementary_gids > 0) {
4343 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4344 arg_n_supplementary_gids = settings->n_supplementary_gids;
4345 }
4346 }
4347
4348 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4349 uint64_t plus, minus;
4350 uint64_t network_minus = 0;
4351 uint64_t ambient;
4352
4353 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4354 * Settings structure */
4355
4356 plus = settings->capability;
4357 minus = settings->drop_capability;
4358
4359 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4360 settings_network_configured(settings)) {
4361 if (settings_private_network(settings))
4362 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4363 else
4364 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4365 }
4366
4367 if (!arg_settings_trusted && plus != 0) {
4368 if (settings->capability != 0)
4369 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4370 } else {
4371 arg_caps_retain &= ~network_minus;
4372 arg_caps_retain |= plus;
4373 }
4374
4375 arg_caps_retain &= ~minus;
4376
4377 /* Copy the full capabilities over too */
4378 if (capability_quintet_is_set(&settings->full_capabilities)) {
4379 if (!arg_settings_trusted)
4380 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4381 else
4382 arg_full_capabilities = settings->full_capabilities;
4383 }
4384
4385 ambient = settings->ambient_capability;
4386 if (!arg_settings_trusted && ambient != 0)
4387 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4388 else
4389 arg_caps_ambient |= ambient;
4390 }
4391
4392 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4393 settings->kill_signal > 0)
4394 arg_kill_signal = settings->kill_signal;
4395
4396 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4397 settings->personality != PERSONALITY_INVALID)
4398 arg_personality = settings->personality;
4399
4400 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4401 !sd_id128_is_null(settings->machine_id)) {
4402
4403 if (!arg_settings_trusted)
4404 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4405 else
4406 arg_uuid = settings->machine_id;
4407 }
4408
4409 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4410 settings->read_only >= 0)
4411 arg_read_only = settings->read_only;
4412
4413 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4414 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4415 arg_volatile_mode = settings->volatile_mode;
4416
4417 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4418 settings->n_custom_mounts > 0) {
4419
4420 if (!arg_settings_trusted)
4421 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4422 else {
4423 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4424 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4425 arg_n_custom_mounts = settings->n_custom_mounts;
4426 settings->n_custom_mounts = 0;
4427 }
4428 }
4429
4430 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4431 settings_network_configured(settings)) {
4432
4433 if (!arg_settings_trusted)
4434 log_warning("Ignoring network settings, file %s is not trusted.", path);
4435 else {
4436 arg_network_veth = settings_network_veth(settings);
4437 arg_private_network = settings_private_network(settings);
4438
4439 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4440 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4441 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4442 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4443
4444 free_and_replace(arg_network_bridge, settings->network_bridge);
4445 free_and_replace(arg_network_zone, settings->network_zone);
4446
4447 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4448 }
4449 }
4450
4451 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4452 settings->expose_ports) {
4453
4454 if (!arg_settings_trusted)
4455 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4456 else {
4457 expose_port_free_all(arg_expose_ports);
4458 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4459 }
4460 }
4461
4462 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4463 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4464
4465 if (!arg_settings_trusted)
4466 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4467 else {
4468 arg_userns_mode = settings->userns_mode;
4469 arg_uid_shift = settings->uid_shift;
4470 arg_uid_range = settings->uid_range;
4471 arg_userns_ownership = settings->userns_ownership;
4472 }
4473 }
4474
4475 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4476 !strv_isempty(settings->bind_user))
4477 strv_free_and_replace(arg_bind_user, settings->bind_user);
4478
4479 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4480 settings->notify_ready >= 0)
4481 arg_notify_ready = settings->notify_ready;
4482
4483 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4484
4485 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4486 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4487 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4488 else {
4489 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4490 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4491 }
4492 }
4493
4494 #if HAVE_SECCOMP
4495 if (settings->seccomp) {
4496 if (!arg_settings_trusted)
4497 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4498 else {
4499 seccomp_release(arg_seccomp);
4500 arg_seccomp = TAKE_PTR(settings->seccomp);
4501 }
4502 }
4503 #endif
4504 }
4505
4506 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4507 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4508 continue;
4509
4510 if (!settings->rlimit[rl])
4511 continue;
4512
4513 if (!arg_settings_trusted) {
4514 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4515 continue;
4516 }
4517
4518 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4519 }
4520
4521 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4522 settings->hostname)
4523 free_and_replace(arg_hostname, settings->hostname);
4524
4525 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4526 settings->no_new_privileges >= 0)
4527 arg_no_new_privileges = settings->no_new_privileges;
4528
4529 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4530 settings->oom_score_adjust_set) {
4531
4532 if (!arg_settings_trusted)
4533 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4534 else {
4535 arg_oom_score_adjust = settings->oom_score_adjust;
4536 arg_oom_score_adjust_set = true;
4537 }
4538 }
4539
4540 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4541 settings->cpu_set.set) {
4542
4543 if (!arg_settings_trusted)
4544 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4545 else {
4546 cpu_set_reset(&arg_cpu_set);
4547 arg_cpu_set = settings->cpu_set;
4548 settings->cpu_set = (CPUSet) {};
4549 }
4550 }
4551
4552 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4553 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4554 arg_resolv_conf = settings->resolv_conf;
4555
4556 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4557 settings->link_journal != _LINK_JOURNAL_INVALID) {
4558
4559 if (!arg_settings_trusted)
4560 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4561 else {
4562 arg_link_journal = settings->link_journal;
4563 arg_link_journal_try = settings->link_journal_try;
4564 }
4565 }
4566
4567 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4568 settings->timezone != _TIMEZONE_MODE_INVALID)
4569 arg_timezone = settings->timezone;
4570
4571 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4572 settings->slice) {
4573
4574 if (!arg_settings_trusted)
4575 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4576 else
4577 free_and_replace(arg_slice, settings->slice);
4578 }
4579
4580 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4581 settings->use_cgns >= 0) {
4582
4583 if (!arg_settings_trusted)
4584 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4585 else
4586 arg_use_cgns = settings->use_cgns;
4587 }
4588
4589 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4590 settings->clone_ns_flags != ULONG_MAX) {
4591
4592 if (!arg_settings_trusted)
4593 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4594 else
4595 arg_clone_ns_flags = settings->clone_ns_flags;
4596 }
4597
4598 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4599 settings->console_mode >= 0) {
4600
4601 if (!arg_settings_trusted)
4602 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4603 else
4604 arg_console_mode = settings->console_mode;
4605 }
4606
4607 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4608 settings->suppress_sync >= 0)
4609 arg_suppress_sync = settings->suppress_sync;
4610
4611 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4612 * don't consult arg_settings_mask for them. */
4613
4614 sd_bus_message_unref(arg_property_message);
4615 arg_property_message = TAKE_PTR(settings->properties);
4616
4617 arg_console_width = settings->console_width;
4618 arg_console_height = settings->console_height;
4619
4620 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4621 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4622 arg_n_extra_nodes = settings->n_extra_nodes;
4623
4624 return 0;
4625 }
4626
4627 static int load_settings(void) {
4628 _cleanup_(settings_freep) Settings *settings = NULL;
4629 _cleanup_fclose_ FILE *f = NULL;
4630 _cleanup_free_ char *p = NULL;
4631 int r;
4632
4633 if (arg_oci_bundle)
4634 return 0;
4635
4636 /* If all settings are masked, there's no point in looking for
4637 * the settings file */
4638 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4639 return 0;
4640
4641 /* We first look in the admin's directories in /etc and /run */
4642 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4643 _cleanup_free_ char *j = NULL;
4644
4645 j = path_join(i, arg_settings_filename);
4646 if (!j)
4647 return log_oom();
4648
4649 f = fopen(j, "re");
4650 if (f) {
4651 p = TAKE_PTR(j);
4652
4653 /* By default, we trust configuration from /etc and /run */
4654 if (arg_settings_trusted < 0)
4655 arg_settings_trusted = true;
4656
4657 break;
4658 }
4659
4660 if (errno != ENOENT)
4661 return log_error_errno(errno, "Failed to open %s: %m", j);
4662 }
4663
4664 if (!f) {
4665 /* After that, let's look for a file next to the
4666 * actual image we shall boot. */
4667
4668 if (arg_image) {
4669 p = file_in_same_dir(arg_image, arg_settings_filename);
4670 if (!p)
4671 return log_oom();
4672 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4673 p = file_in_same_dir(arg_directory, arg_settings_filename);
4674 if (!p)
4675 return log_oom();
4676 }
4677
4678 if (p) {
4679 f = fopen(p, "re");
4680 if (!f && errno != ENOENT)
4681 return log_error_errno(errno, "Failed to open %s: %m", p);
4682
4683 /* By default, we do not trust configuration from /var/lib/machines */
4684 if (arg_settings_trusted < 0)
4685 arg_settings_trusted = false;
4686 }
4687 }
4688
4689 if (!f)
4690 return 0;
4691
4692 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4693
4694 r = settings_load(f, p, &settings);
4695 if (r < 0)
4696 return r;
4697
4698 return merge_settings(settings, p);
4699 }
4700
4701 static int load_oci_bundle(void) {
4702 _cleanup_(settings_freep) Settings *settings = NULL;
4703 int r;
4704
4705 if (!arg_oci_bundle)
4706 return 0;
4707
4708 /* By default let's trust OCI bundles */
4709 if (arg_settings_trusted < 0)
4710 arg_settings_trusted = true;
4711
4712 r = oci_load(NULL, arg_oci_bundle, &settings);
4713 if (r < 0)
4714 return r;
4715
4716 return merge_settings(settings, arg_oci_bundle);
4717 }
4718
4719 static int run_container(
4720 DissectedImage *dissected_image,
4721 bool secondary,
4722 FDSet *fds,
4723 char veth_name[IFNAMSIZ], bool *veth_created,
4724 struct ExposeArgs *expose_args,
4725 int *master, pid_t *pid, int *ret) {
4726
4727 static const struct sigaction sa = {
4728 .sa_handler = nop_signal_handler,
4729 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4730 };
4731
4732 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4733 _cleanup_close_ int etc_passwd_lock = -1;
4734 _cleanup_close_pair_ int
4735 kmsg_socket_pair[2] = { -1, -1 },
4736 rtnl_socket_pair[2] = { -1, -1 },
4737 pid_socket_pair[2] = { -1, -1 },
4738 uuid_socket_pair[2] = { -1, -1 },
4739 notify_socket_pair[2] = { -1, -1 },
4740 uid_shift_socket_pair[2] = { -1, -1 },
4741 master_pty_socket_pair[2] = { -1, -1 },
4742 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4743
4744 _cleanup_close_ int notify_socket = -1;
4745 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4746 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4747 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4748 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4749 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4750 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4751 _cleanup_free_ uid_t *bind_user_uid = NULL;
4752 size_t n_bind_user_uid = 0;
4753 ContainerStatus container_status = 0;
4754 int ifi = 0, r;
4755 ssize_t l;
4756 sigset_t mask_chld;
4757 _cleanup_close_ int child_netns_fd = -1;
4758
4759 assert_se(sigemptyset(&mask_chld) == 0);
4760 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4761
4762 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4763 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4764 * check with getpwuid() if the specific user already exists. Note that /etc might be
4765 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4766 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4767 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4768 * really ours. */
4769
4770 etc_passwd_lock = take_etc_passwd_lock(NULL);
4771 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4772 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4773 }
4774
4775 r = barrier_create(&barrier);
4776 if (r < 0)
4777 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4778
4779 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4780 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4781
4782 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4783 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4784
4785 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4786 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4787
4788 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4789 return log_error_errno(errno, "Failed to create id socket pair: %m");
4790
4791 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4792 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4793
4794 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4795 return log_error_errno(errno, "Failed to create console socket pair: %m");
4796
4797 if (arg_userns_mode != USER_NAMESPACE_NO)
4798 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4799 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4800
4801 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4802 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4803 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4804
4805 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4806 * parent's blocking calls and give it a chance to call wait() and terminate. */
4807 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4808 if (r < 0)
4809 return log_error_errno(errno, "Failed to change the signal mask: %m");
4810
4811 r = sigaction(SIGCHLD, &sa, NULL);
4812 if (r < 0)
4813 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4814
4815 if (arg_network_namespace_path) {
4816 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4817 if (child_netns_fd < 0)
4818 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4819
4820 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4821 if (r == -EUCLEAN)
4822 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4823 else if (r < 0)
4824 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4825 else if (r == 0)
4826 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4827 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4828 }
4829
4830 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4831 if (*pid < 0)
4832 return log_error_errno(errno, "clone() failed%s: %m",
4833 errno == EINVAL ?
4834 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4835
4836 if (*pid == 0) {
4837 /* The outer child only has a file system namespace. */
4838 barrier_set_role(&barrier, BARRIER_CHILD);
4839
4840 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4841 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4842 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4843 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4844 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4845 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4846 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4847 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4848
4849 (void) reset_all_signal_handlers();
4850 (void) reset_signal_mask();
4851
4852 r = outer_child(&barrier,
4853 arg_directory,
4854 dissected_image,
4855 secondary,
4856 pid_socket_pair[1],
4857 uuid_socket_pair[1],
4858 notify_socket_pair[1],
4859 kmsg_socket_pair[1],
4860 rtnl_socket_pair[1],
4861 uid_shift_socket_pair[1],
4862 master_pty_socket_pair[1],
4863 unified_cgroup_hierarchy_socket_pair[1],
4864 fds,
4865 child_netns_fd);
4866 if (r < 0)
4867 _exit(EXIT_FAILURE);
4868
4869 _exit(EXIT_SUCCESS);
4870 }
4871
4872 barrier_set_role(&barrier, BARRIER_PARENT);
4873
4874 fdset_close(fds);
4875
4876 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4877 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4878 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4879 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4880 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4881 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4882 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4883 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4884
4885 if (arg_userns_mode != USER_NAMESPACE_NO) {
4886 /* The child just let us know the UID shift it might have read from the image. */
4887 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4888 if (l < 0)
4889 return log_error_errno(errno, "Failed to read UID shift: %m");
4890 if (l != sizeof arg_uid_shift)
4891 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4892
4893 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4894 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4895 * image, but if that's already in use, pick a new one, and report back to the child,
4896 * which one we now picked. */
4897
4898 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4899 if (r < 0)
4900 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4901
4902 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4903 if (l < 0)
4904 return log_error_errno(errno, "Failed to send UID shift: %m");
4905 if (l != sizeof arg_uid_shift)
4906 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4907 }
4908
4909 n_bind_user_uid = strv_length(arg_bind_user);
4910 if (n_bind_user_uid > 0) {
4911 /* Right after the UID shift, we'll receive the list of UID mappings for the
4912 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4913
4914 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4915 if (!bind_user_uid)
4916 return log_oom();
4917
4918 for (size_t i = 0; i < n_bind_user_uid; i++) {
4919 l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4920 if (l < 0)
4921 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4922 if (l != sizeof(uid_t)*4)
4923 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4924 SYNTHETIC_ERRNO(EIO),
4925 "Short read while reading bind user UID pairs.");
4926 }
4927 }
4928 }
4929
4930 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4931 /* The child let us know the support cgroup mode it might have read from the image. */
4932 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4933 if (l < 0)
4934 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4935 if (l != sizeof(arg_unified_cgroup_hierarchy))
4936 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4937 l, l == 0 ? " The child is most likely dead." : "");
4938 }
4939
4940 /* Wait for the outer child. */
4941 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4942 if (r < 0)
4943 return r;
4944 if (r != EXIT_SUCCESS)
4945 return -EIO;
4946
4947 /* And now retrieve the PID of the inner child. */
4948 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4949 if (l < 0)
4950 return log_error_errno(errno, "Failed to read inner child PID: %m");
4951 if (l != sizeof *pid)
4952 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4953
4954 /* We also retrieve container UUID in case it was generated by outer child */
4955 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4956 if (l < 0)
4957 return log_error_errno(errno, "Failed to read container machine ID: %m");
4958 if (l != sizeof(arg_uuid))
4959 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4960
4961 /* We also retrieve the socket used for notifications generated by outer child */
4962 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4963 if (notify_socket < 0)
4964 return log_error_errno(notify_socket,
4965 "Failed to receive notification socket from the outer child: %m");
4966
4967 log_debug("Init process invoked as PID "PID_FMT, *pid);
4968
4969 if (arg_userns_mode != USER_NAMESPACE_NO) {
4970 if (!barrier_place_and_sync(&barrier)) /* #1 */
4971 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4972
4973 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4974 if (r < 0)
4975 return r;
4976
4977 (void) barrier_place(&barrier); /* #2 */
4978 }
4979
4980 if (arg_private_network) {
4981 if (!arg_network_namespace_path) {
4982 /* Wait until the child has unshared its network namespace. */
4983 if (!barrier_place_and_sync(&barrier)) /* #3 */
4984 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4985 }
4986
4987 if (child_netns_fd < 0) {
4988 /* Make sure we have an open file descriptor to the child's network
4989 * namespace so it stays alive even if the child exits. */
4990 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4991 if (r < 0)
4992 return log_error_errno(r, "Failed to open child network namespace: %m");
4993 }
4994
4995 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4996 if (r < 0)
4997 return r;
4998
4999 if (arg_network_veth) {
5000 r = setup_veth(arg_machine, *pid, veth_name,
5001 arg_network_bridge || arg_network_zone);
5002 if (r < 0)
5003 return r;
5004 else if (r > 0)
5005 ifi = r;
5006
5007 if (arg_network_bridge) {
5008 /* Add the interface to a bridge */
5009 r = setup_bridge(veth_name, arg_network_bridge, false);
5010 if (r < 0)
5011 return r;
5012 if (r > 0)
5013 ifi = r;
5014 } else if (arg_network_zone) {
5015 /* Add the interface to a bridge, possibly creating it */
5016 r = setup_bridge(veth_name, arg_network_zone, true);
5017 if (r < 0)
5018 return r;
5019 if (r > 0)
5020 ifi = r;
5021 }
5022 }
5023
5024 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5025 if (r < 0)
5026 return r;
5027
5028 /* We created the primary and extra veth links now; let's remember this, so that we know to
5029 remove them later on. Note that we don't bother with removing veth links that were created
5030 here when their setup failed half-way, because in that case the kernel should be able to
5031 remove them on its own, since they cannot be referenced by anything yet. */
5032 *veth_created = true;
5033
5034 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5035 if (r < 0)
5036 return r;
5037
5038 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5039 if (r < 0)
5040 return r;
5041 }
5042
5043 if (arg_register || !arg_keep_unit) {
5044 r = sd_bus_default_system(&bus);
5045 if (r < 0)
5046 return log_error_errno(r, "Failed to open system bus: %m");
5047
5048 r = sd_bus_set_close_on_exit(bus, false);
5049 if (r < 0)
5050 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5051 }
5052
5053 if (!arg_keep_unit) {
5054 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5055 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5056 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5057
5058 r = sd_bus_match_signal_async(
5059 bus,
5060 NULL,
5061 "org.freedesktop.systemd1",
5062 NULL,
5063 "org.freedesktop.systemd1.Scope",
5064 "RequestStop",
5065 on_request_stop, NULL, PID_TO_PTR(*pid));
5066 if (r < 0)
5067 return log_error_errno(r, "Failed to request RequestStop match: %m");
5068 }
5069
5070 if (arg_register) {
5071 r = register_machine(
5072 bus,
5073 arg_machine,
5074 *pid,
5075 arg_directory,
5076 arg_uuid,
5077 ifi,
5078 arg_slice,
5079 arg_custom_mounts, arg_n_custom_mounts,
5080 arg_kill_signal,
5081 arg_property,
5082 arg_property_message,
5083 arg_keep_unit,
5084 arg_container_service_name);
5085 if (r < 0)
5086 return r;
5087
5088 } else if (!arg_keep_unit) {
5089 r = allocate_scope(
5090 bus,
5091 arg_machine,
5092 *pid,
5093 arg_slice,
5094 arg_custom_mounts, arg_n_custom_mounts,
5095 arg_kill_signal,
5096 arg_property,
5097 arg_property_message);
5098 if (r < 0)
5099 return r;
5100
5101 } else if (arg_slice || arg_property)
5102 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5103
5104 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5105 if (r < 0)
5106 return r;
5107
5108 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5109 if (r < 0)
5110 return r;
5111
5112 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5113 if (r < 0)
5114 return r;
5115
5116 /* Notify the child that the parent is ready with all
5117 * its setup (including cgroup-ification), and that
5118 * the child can now hand over control to the code to
5119 * run inside the container. */
5120 (void) barrier_place(&barrier); /* #4 */
5121
5122 /* Block SIGCHLD here, before notifying child.
5123 * process_pty() will handle it with the other signals. */
5124 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5125
5126 /* Reset signal to default */
5127 r = default_signals(SIGCHLD);
5128 if (r < 0)
5129 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5130
5131 r = sd_event_new(&event);
5132 if (r < 0)
5133 return log_error_errno(r, "Failed to get default event source: %m");
5134
5135 (void) sd_event_set_watchdog(event, true);
5136
5137 if (bus) {
5138 r = sd_bus_attach_event(bus, event, 0);
5139 if (r < 0)
5140 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5141 }
5142
5143 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5144 if (r < 0)
5145 return r;
5146
5147 /* Let the child know that we are ready and wait that the child is completely ready now. */
5148 if (!barrier_place_and_sync(&barrier)) /* #5 */
5149 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5150
5151 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5152 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5153 etc_passwd_lock = safe_close(etc_passwd_lock);
5154
5155 (void) sd_notifyf(false,
5156 "STATUS=Container running.\n"
5157 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5158 if (!arg_notify_ready) {
5159 r = sd_notify(false, "READY=1\n");
5160 if (r < 0)
5161 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5162 }
5163
5164 if (arg_kill_signal > 0) {
5165 /* Try to kill the init system on SIGINT or SIGTERM */
5166 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5167 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5168 } else {
5169 /* Immediately exit */
5170 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5171 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5172 }
5173
5174 /* Exit when the child exits */
5175 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5176
5177 if (arg_expose_ports) {
5178 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
5179 if (r < 0)
5180 return r;
5181
5182 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5183 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5184 }
5185
5186 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5187
5188 if (arg_console_mode != CONSOLE_PIPE) {
5189 _cleanup_close_ int fd = -1;
5190 PTYForwardFlags flags = 0;
5191
5192 /* Retrieve the master pty allocated by inner child */
5193 fd = receive_one_fd(master_pty_socket_pair[0], 0);
5194 if (fd < 0)
5195 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5196
5197 switch (arg_console_mode) {
5198
5199 case CONSOLE_READ_ONLY:
5200 flags |= PTY_FORWARD_READ_ONLY;
5201
5202 _fallthrough_;
5203
5204 case CONSOLE_INTERACTIVE:
5205 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5206
5207 r = pty_forward_new(event, fd, flags, &forward);
5208 if (r < 0)
5209 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5210
5211 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5212 (void) pty_forward_set_width_height(forward,
5213 arg_console_width,
5214 arg_console_height);
5215 break;
5216
5217 default:
5218 assert(arg_console_mode == CONSOLE_PASSIVE);
5219 }
5220
5221 *master = TAKE_FD(fd);
5222 }
5223
5224 r = sd_event_loop(event);
5225 if (r < 0)
5226 return log_error_errno(r, "Failed to run event loop: %m");
5227
5228 if (forward) {
5229 char last_char = 0;
5230
5231 (void) pty_forward_get_last_char(forward, &last_char);
5232 forward = pty_forward_free(forward);
5233
5234 if (!arg_quiet && last_char != '\n')
5235 putc('\n', stdout);
5236 }
5237
5238 /* Kill if it is not dead yet anyway */
5239 if (!arg_register && !arg_keep_unit && bus)
5240 terminate_scope(bus, arg_machine);
5241
5242 /* Normally redundant, but better safe than sorry */
5243 (void) kill(*pid, SIGKILL);
5244
5245 if (arg_private_network) {
5246 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5247 * to avoid having to move the parent to the child network namespace. */
5248 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5249 if (r < 0)
5250 return r;
5251
5252 if (r == 0) {
5253 _cleanup_close_ int parent_netns_fd = -1;
5254
5255 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5256 if (r < 0) {
5257 log_error_errno(r, "Failed to open parent network namespace: %m");
5258 _exit(EXIT_FAILURE);
5259 }
5260
5261 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5262 if (r < 0) {
5263 log_error_errno(r, "Failed to enter child network namespace: %m");
5264 _exit(EXIT_FAILURE);
5265 }
5266
5267 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5268 if (r < 0)
5269 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5270
5271 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5272 }
5273 }
5274
5275 r = wait_for_container(TAKE_PID(*pid), &container_status);
5276
5277 /* Tell machined that we are gone. */
5278 if (bus)
5279 (void) unregister_machine(bus, arg_machine);
5280
5281 if (r < 0)
5282 /* We failed to wait for the container, or the container exited abnormally. */
5283 return r;
5284 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5285 /* r > 0 → The container exited with a non-zero status.
5286 * As a special case, we need to replace 133 with a different value,
5287 * because 133 is special-cased in the service file to reboot the container.
5288 * otherwise → The container exited with zero status and a reboot was not requested.
5289 */
5290 if (r == EXIT_FORCE_RESTART)
5291 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5292 *ret = r;
5293 return 0; /* finito */
5294 }
5295
5296 /* CONTAINER_REBOOTED, loop again */
5297
5298 if (arg_keep_unit) {
5299 /* Special handling if we are running as a service: instead of simply
5300 * restarting the machine we want to restart the entire service, so let's
5301 * inform systemd about this with the special exit code 133. The service
5302 * file uses RestartForceExitStatus=133 so that this results in a full
5303 * nspawn restart. This is necessary since we might have cgroup parameters
5304 * set we want to have flushed out. */
5305 *ret = EXIT_FORCE_RESTART;
5306 return 0; /* finito */
5307 }
5308
5309 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5310 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5311
5312 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5313 *veth_created = false;
5314 return 1; /* loop again */
5315 }
5316
5317 static int initialize_rlimits(void) {
5318 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5319 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5320 * container execution environments. */
5321
5322 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5323 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5324 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5325 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5326 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5327 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5328 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5329 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5330 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5331 [RLIMIT_NICE] = { 0, 0 },
5332 [RLIMIT_NOFILE] = { 1024, 4096 },
5333 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5334 [RLIMIT_RTPRIO] = { 0, 0 },
5335 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5336 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5337
5338 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5339 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5340 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5341 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5342 * that PID 1 changes a number of other resource limits during early initialization which is why we
5343 * don't read the other limits from PID 1 but prefer the static table above. */
5344 };
5345
5346 int rl;
5347
5348 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5349 /* Let's only fill in what the user hasn't explicitly configured anyway */
5350 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5351 const struct rlimit *v;
5352 struct rlimit buffer;
5353
5354 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5355 /* For these two let's read the limits off PID 1. See above for an explanation. */
5356
5357 if (prlimit(1, rl, NULL, &buffer) < 0)
5358 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5359
5360 v = &buffer;
5361 } else if (rl == RLIMIT_NOFILE) {
5362 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5363 * userspace. Given that nspawn containers are often run without our PID 1,
5364 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5365 * so that container userspace gets similar resources as host userspace
5366 * gets. */
5367 buffer = kernel_defaults[rl];
5368 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5369 v = &buffer;
5370 } else
5371 v = kernel_defaults + rl;
5372
5373 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5374 if (!arg_rlimit[rl])
5375 return log_oom();
5376 }
5377
5378 if (DEBUG_LOGGING) {
5379 _cleanup_free_ char *k = NULL;
5380
5381 (void) rlimit_format(arg_rlimit[rl], &k);
5382 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5383 }
5384 }
5385
5386 return 0;
5387 }
5388
5389 static int cant_be_in_netns(void) {
5390 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5391 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5392 _cleanup_close_ int fd = -1;
5393 struct ucred ucred;
5394 int r;
5395
5396 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5397 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5398 * nice message. */
5399
5400 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5401 return 0;
5402
5403 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5404 if (fd < 0)
5405 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5406
5407 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5408 if (r < 0) {
5409 if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
5410 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5411 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5412
5413 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5414 }
5415
5416 r = getpeercred(fd, &ucred);
5417 if (r < 0)
5418 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5419
5420 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5421 r = readlink_malloc(udev_path, &udev_ns);
5422 if (r < 0)
5423 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5424
5425 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5426 if (r < 0)
5427 return log_error_errno(r, "Failed to read our own network namespace: %m");
5428
5429 if (!streq(our_ns, udev_ns))
5430 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5431 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5432 return 0;
5433 }
5434
5435 static int run(int argc, char *argv[]) {
5436 bool secondary = false, remove_directory = false, remove_image = false,
5437 veth_created = false, remove_tmprootdir = false;
5438 _cleanup_close_ int master = -1;
5439 _cleanup_fdset_free_ FDSet *fds = NULL;
5440 int r, n_fd_passed, ret = EXIT_SUCCESS;
5441 char veth_name[IFNAMSIZ] = "";
5442 struct ExposeArgs expose_args = {};
5443 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5444 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5445 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5446 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5447 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5448 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5449 pid_t pid = 0;
5450
5451 log_parse_environment();
5452 log_open();
5453
5454 r = parse_argv(argc, argv);
5455 if (r <= 0)
5456 goto finish;
5457
5458 if (geteuid() != 0) {
5459 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5460 argc >= 2 ? "Need to be root." :
5461 "Need to be root (and some arguments are usually required).\nHint: try --help");
5462 goto finish;
5463 }
5464
5465 r = cant_be_in_netns();
5466 if (r < 0)
5467 goto finish;
5468
5469 r = initialize_rlimits();
5470 if (r < 0)
5471 goto finish;
5472
5473 r = load_oci_bundle();
5474 if (r < 0)
5475 goto finish;
5476
5477 r = determine_names();
5478 if (r < 0)
5479 goto finish;
5480
5481 r = load_settings();
5482 if (r < 0)
5483 goto finish;
5484
5485 r = cg_unified();
5486 if (r < 0) {
5487 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5488 goto finish;
5489 }
5490
5491 r = verify_arguments();
5492 if (r < 0)
5493 goto finish;
5494
5495 /* Reapply environment settings. */
5496 (void) detect_unified_cgroup_hierarchy_from_environment();
5497
5498 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5499 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5500 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5501 (void) ignore_signals(SIGPIPE);
5502
5503 n_fd_passed = sd_listen_fds(false);
5504 if (n_fd_passed > 0) {
5505 r = fdset_new_listen_fds(&fds, false);
5506 if (r < 0) {
5507 log_error_errno(r, "Failed to collect file descriptors: %m");
5508 goto finish;
5509 }
5510 }
5511
5512 /* The "default" umask. This is appropriate for most file and directory
5513 * operations performed by nspawn, and is the umask that will be used for
5514 * the child. Functions like copy_devnodes() change the umask temporarily. */
5515 umask(0022);
5516
5517 if (arg_directory) {
5518 assert(!arg_image);
5519
5520 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5521 * /var from the host will propagate into container dynamically (because bad things happen if
5522 * two systems write to the same /var). Let's allow it for the special cases where /var is
5523 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5524 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5525 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5526 r = -EINVAL;
5527 goto finish;
5528 }
5529
5530 if (arg_ephemeral) {
5531 _cleanup_free_ char *np = NULL;
5532
5533 r = chase_symlinks_and_update(&arg_directory, 0);
5534 if (r < 0)
5535 goto finish;
5536
5537 /* If the specified path is a mount point we generate the new snapshot immediately
5538 * inside it under a random name. However if the specified is not a mount point we
5539 * create the new snapshot in the parent directory, just next to it. */
5540 r = path_is_mount_point(arg_directory, NULL, 0);
5541 if (r < 0) {
5542 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5543 goto finish;
5544 }
5545 if (r > 0)
5546 r = tempfn_random_child(arg_directory, "machine.", &np);
5547 else
5548 r = tempfn_random(arg_directory, "machine.", &np);
5549 if (r < 0) {
5550 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5551 goto finish;
5552 }
5553
5554 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5555 * only owned by us and no one else. */
5556 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5557 if (r < 0) {
5558 log_error_errno(r, "Failed to lock %s: %m", np);
5559 goto finish;
5560 }
5561
5562 {
5563 BLOCK_SIGNALS(SIGINT);
5564 r = btrfs_subvol_snapshot(arg_directory, np,
5565 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5566 BTRFS_SNAPSHOT_FALLBACK_COPY |
5567 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5568 BTRFS_SNAPSHOT_RECURSIVE |
5569 BTRFS_SNAPSHOT_QUOTA |
5570 BTRFS_SNAPSHOT_SIGINT);
5571 }
5572 if (r == -EINTR) {
5573 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5574 goto finish;
5575 }
5576 if (r < 0) {
5577 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5578 goto finish;
5579 }
5580
5581 free_and_replace(arg_directory, np);
5582 remove_directory = true;
5583 } else {
5584 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5585 if (r < 0)
5586 goto finish;
5587
5588 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5589 if (r == -EBUSY) {
5590 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5591 goto finish;
5592 }
5593 if (r < 0) {
5594 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5595 goto finish;
5596 }
5597
5598 if (arg_template) {
5599 r = chase_symlinks_and_update(&arg_template, 0);
5600 if (r < 0)
5601 goto finish;
5602
5603 {
5604 BLOCK_SIGNALS(SIGINT);
5605 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5606 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5607 BTRFS_SNAPSHOT_FALLBACK_COPY |
5608 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5609 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5610 BTRFS_SNAPSHOT_RECURSIVE |
5611 BTRFS_SNAPSHOT_QUOTA |
5612 BTRFS_SNAPSHOT_SIGINT);
5613 }
5614 if (r == -EEXIST)
5615 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5616 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5617 else if (r == -EINTR) {
5618 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5619 goto finish;
5620 } else if (r < 0) {
5621 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5622 goto finish;
5623 } else
5624 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5625 "Populated %s from template %s.", arg_directory, arg_template);
5626 }
5627 }
5628
5629 if (arg_start_mode == START_BOOT) {
5630 _cleanup_free_ char *b = NULL;
5631 const char *p;
5632
5633 if (arg_pivot_root_new) {
5634 b = path_join(arg_directory, arg_pivot_root_new);
5635 if (!b)
5636 return log_oom();
5637
5638 p = b;
5639 } else
5640 p = arg_directory;
5641
5642 if (path_is_os_tree(p) <= 0) {
5643 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5644 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5645 goto finish;
5646 }
5647 } else {
5648 _cleanup_free_ char *p = NULL;
5649
5650 if (arg_pivot_root_new)
5651 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5652 else
5653 p = path_join(arg_directory, "/usr/");
5654 if (!p)
5655 return log_oom();
5656
5657 if (laccess(p, F_OK) < 0) {
5658 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5659 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5660 goto finish;
5661 }
5662 }
5663
5664 } else {
5665 DissectImageFlags dissect_image_flags =
5666 DISSECT_IMAGE_GENERIC_ROOT |
5667 DISSECT_IMAGE_REQUIRE_ROOT |
5668 DISSECT_IMAGE_RELAX_VAR_CHECK |
5669 DISSECT_IMAGE_USR_NO_ROOT;
5670 assert(arg_image);
5671 assert(!arg_template);
5672
5673 r = chase_symlinks_and_update(&arg_image, 0);
5674 if (r < 0)
5675 goto finish;
5676
5677 if (arg_ephemeral) {
5678 _cleanup_free_ char *np = NULL;
5679
5680 r = tempfn_random(arg_image, "machine.", &np);
5681 if (r < 0) {
5682 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5683 goto finish;
5684 }
5685
5686 /* Always take an exclusive lock on our own ephemeral copy. */
5687 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5688 if (r < 0) {
5689 r = log_error_errno(r, "Failed to create image lock: %m");
5690 goto finish;
5691 }
5692
5693 {
5694 BLOCK_SIGNALS(SIGINT);
5695 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5696 }
5697 if (r == -EINTR) {
5698 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5699 goto finish;
5700 }
5701 if (r < 0) {
5702 r = log_error_errno(r, "Failed to copy image file: %m");
5703 goto finish;
5704 }
5705
5706 free_and_replace(arg_image, np);
5707 remove_image = true;
5708 } else {
5709 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5710 if (r == -EBUSY) {
5711 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5712 goto finish;
5713 }
5714 if (r < 0) {
5715 r = log_error_errno(r, "Failed to create image lock: %m");
5716 goto finish;
5717 }
5718
5719 r = verity_settings_load(
5720 &arg_verity_settings,
5721 arg_image, NULL, NULL);
5722 if (r < 0) {
5723 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5724 goto finish;
5725 }
5726
5727 if (arg_verity_settings.data_path)
5728 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5729 }
5730
5731 if (!mkdtemp(tmprootdir)) {
5732 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5733 goto finish;
5734 }
5735
5736 remove_tmprootdir = true;
5737
5738 arg_directory = strdup(tmprootdir);
5739 if (!arg_directory) {
5740 r = log_oom();
5741 goto finish;
5742 }
5743
5744 r = loop_device_make_by_path(
5745 arg_image,
5746 arg_read_only ? O_RDONLY : O_RDWR,
5747 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5748 &loop);
5749 if (r < 0) {
5750 log_error_errno(r, "Failed to set up loopback block device: %m");
5751 goto finish;
5752 }
5753
5754 /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */
5755 r = loop_device_flock(loop, LOCK_SH);
5756 if (r < 0) {
5757 log_error_errno(r, "Failed to take lock on loopback block device: %m");
5758 goto finish;
5759 }
5760
5761 r = dissect_image_and_warn(
5762 loop->fd,
5763 arg_image,
5764 &arg_verity_settings,
5765 NULL,
5766 loop->diskseq,
5767 loop->uevent_seqnum_not_before,
5768 loop->timestamp_not_before,
5769 dissect_image_flags,
5770 &dissected_image);
5771 if (r == -ENOPKG) {
5772 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
5773 log_notice("Note that the disk image needs to\n"
5774 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5775 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5776 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
5777 " d) or contain a file system without a partition table\n"
5778 "in order to be bootable with systemd-nspawn.");
5779 goto finish;
5780 }
5781 if (r < 0)
5782 goto finish;
5783
5784 r = dissected_image_load_verity_sig_partition(
5785 dissected_image,
5786 loop->fd,
5787 &arg_verity_settings);
5788 if (r < 0)
5789 goto finish;
5790
5791 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5792 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5793 "root hash signature found! Proceeding without integrity checking.", arg_image);
5794
5795 r = dissected_image_decrypt_interactively(
5796 dissected_image,
5797 NULL,
5798 &arg_verity_settings,
5799 0,
5800 &decrypted_image);
5801 if (r < 0)
5802 goto finish;
5803
5804 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5805 if (remove_image && unlink(arg_image) >= 0)
5806 remove_image = false;
5807 }
5808
5809 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5810 if (r < 0)
5811 goto finish;
5812
5813 if (arg_console_mode < 0)
5814 arg_console_mode =
5815 isatty(STDIN_FILENO) > 0 &&
5816 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5817
5818 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5819 arg_quiet = true;
5820
5821 if (!arg_quiet)
5822 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5823 arg_machine, arg_image ?: arg_directory);
5824
5825 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5826
5827 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5828 r = log_error_errno(errno, "Failed to become subreaper: %m");
5829 goto finish;
5830 }
5831
5832 if (arg_expose_ports) {
5833 r = fw_ctx_new(&fw_ctx);
5834 if (r < 0) {
5835 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5836 goto finish;
5837 }
5838 expose_args.fw_ctx = fw_ctx;
5839 }
5840 for (;;) {
5841 r = run_container(dissected_image,
5842 secondary,
5843 fds,
5844 veth_name, &veth_created,
5845 &expose_args, &master,
5846 &pid, &ret);
5847 if (r <= 0)
5848 break;
5849 }
5850
5851 finish:
5852 (void) sd_notify(false,
5853 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5854 "STOPPING=1\nSTATUS=Terminating...");
5855
5856 if (pid > 0)
5857 (void) kill(pid, SIGKILL);
5858
5859 /* Try to flush whatever is still queued in the pty */
5860 if (master >= 0) {
5861 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5862 master = safe_close(master);
5863 }
5864
5865 if (pid > 0)
5866 (void) wait_for_terminate(pid, NULL);
5867
5868 pager_close();
5869
5870 if (remove_directory && arg_directory) {
5871 int k;
5872
5873 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5874 if (k < 0)
5875 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5876 }
5877
5878 if (remove_image && arg_image) {
5879 if (unlink(arg_image) < 0)
5880 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5881 }
5882
5883 if (remove_tmprootdir) {
5884 if (rmdir(tmprootdir) < 0)
5885 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5886 }
5887
5888 if (arg_machine) {
5889 const char *p;
5890
5891 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5892 (void) rm_rf(p, REMOVE_ROOT);
5893 }
5894
5895 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5896 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5897
5898 if (veth_created)
5899 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5900 (void) remove_bridge(arg_network_zone);
5901
5902 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5903 expose_port_free_all(arg_expose_ports);
5904 rlimit_free_all(arg_rlimit);
5905 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5906 credential_free_all(arg_credentials, arg_n_credentials);
5907
5908 if (r < 0)
5909 return r;
5910
5911 return ret;
5912 }
5913
/* Passes run() to the DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() macro. run() ends by returning r
 * when r < 0, and ret otherwise.
 * NOTE(review): the macro is presumably defined in main-func.h and, judging by its name, generates
 * main() and treats a positive result from run() as a failure exit status — confirm there. */
5914 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);