]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
8abb01679101625e92e7aacbf7989bc9a7294fc0
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "barrier.h"
28 #include "base-filesystem.h"
29 #include "blkid-util.h"
30 #include "btrfs-util.h"
31 #include "build.h"
32 #include "bus-error.h"
33 #include "bus-util.h"
34 #include "cap-list.h"
35 #include "capability-util.h"
36 #include "cgroup-util.h"
37 #include "chase.h"
38 #include "common-signal.h"
39 #include "copy.h"
40 #include "cpu-set-util.h"
41 #include "creds-util.h"
42 #include "dev-setup.h"
43 #include "discover-image.h"
44 #include "dissect-image.h"
45 #include "env-util.h"
46 #include "escape.h"
47 #include "fd-util.h"
48 #include "fdset.h"
49 #include "fileio.h"
50 #include "format-util.h"
51 #include "fs-util.h"
52 #include "gpt.h"
53 #include "hexdecoct.h"
54 #include "hostname-setup.h"
55 #include "hostname-util.h"
56 #include "id128-util.h"
57 #include "io-util.h"
58 #include "log.h"
59 #include "loop-util.h"
60 #include "loopback-setup.h"
61 #include "macro.h"
62 #include "main-func.h"
63 #include "missing_sched.h"
64 #include "mkdir.h"
65 #include "mount-util.h"
66 #include "mountpoint-util.h"
67 #include "namespace-util.h"
68 #include "netlink-util.h"
69 #include "nspawn-bind-user.h"
70 #include "nspawn-cgroup.h"
71 #include "nspawn-creds.h"
72 #include "nspawn-def.h"
73 #include "nspawn-expose-ports.h"
74 #include "nspawn-mount.h"
75 #include "nspawn-network.h"
76 #include "nspawn-oci.h"
77 #include "nspawn-patch-uid.h"
78 #include "nspawn-register.h"
79 #include "nspawn-seccomp.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "nspawn-stub-pid1.h"
83 #include "nspawn-util.h"
84 #include "nspawn.h"
85 #include "nulstr-util.h"
86 #include "os-util.h"
87 #include "pager.h"
88 #include "parse-argument.h"
89 #include "parse-util.h"
90 #include "pretty-print.h"
91 #include "process-util.h"
92 #include "ptyfwd.h"
93 #include "random-util.h"
94 #include "raw-clone.h"
95 #include "resolve-util.h"
96 #include "rlimit-util.h"
97 #include "rm-rf.h"
98 #if HAVE_SECCOMP
99 #include "seccomp-util.h"
100 #endif
101 #include "selinux-util.h"
102 #include "signal-util.h"
103 #include "socket-util.h"
104 #include "stat-util.h"
105 #include "stdio-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "sysctl-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-name.h"
114 #include "user-util.h"
115
116 /* Path of the notify socket inside the container, which the payload can use to talk to nspawn via the sd_notify(3) protocol */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
118 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
119
120 #define EXIT_FORCE_RESTART 133
121
/* Two-valued status describing how a container run ended.
 * NOTE(review): the code that sets and consumes this enum is outside this excerpt; the per-value notes
 * below are read off the identifiers only — confirm against the users of ContainerStatus. */
122 typedef enum ContainerStatus {
/* presumably: the container went down for good */
123 CONTAINER_TERMINATED,
/* presumably: the container asked to be restarted */
124 CONTAINER_REBOOTED,
125 } ContainerStatus;
126
/* File-scope configuration state. The visible code fills these in from the command line (parse_argv(),
 * handle_arg_console()) and from the environment (parse_environment() and its helpers). Whenever a
 * value is set explicitly, the matching SETTING_* bit is OR'ed into arg_settings_mask. */
127 static char *arg_directory = NULL;
128 static char *arg_template = NULL;
129 static char *arg_chdir = NULL;
130 static char *arg_pivot_root_new = NULL;
131 static char *arg_pivot_root_old = NULL;
132 static char *arg_user = NULL;
133 static uid_t arg_uid = UID_INVALID;
134 static gid_t arg_gid = GID_INVALID;
135 static gid_t* arg_supplementary_gids = NULL;
136 static size_t arg_n_supplementary_gids = 0;
137 static sd_id128_t arg_uuid = {};
138 static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 static char *arg_hostname = NULL; /* The name the payload sees by default */
/* The two SELinux contexts are 'const char *' and have no STATIC_DESTRUCTOR_REGISTER() entry below.
 * NOTE(review): presumably they point into argv rather than owning memory — confirm. */
140 static const char *arg_selinux_context = NULL;
141 static const char *arg_selinux_apifs_context = NULL;
142 static char *arg_slice = NULL;
143 static bool arg_private_network = false;
144 static bool arg_read_only = false;
145 static StartMode arg_start_mode = START_PID1;
146 static bool arg_ephemeral = false;
147 static LinkJournal arg_link_journal = LINK_AUTO;
148 static bool arg_link_journal_try = false;
/* Default capability bit mask: one bit per CAP_* constant, at the bit position given by its number. */
149 static uint64_t arg_caps_retain =
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
161 (1ULL << CAP_MKNOD) |
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
165 (1ULL << CAP_SETFCAP) |
166 (1ULL << CAP_SETGID) |
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
170 (1ULL << CAP_SYS_BOOT) |
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
174 (1ULL << CAP_SYS_RESOURCE) |
175 (1ULL << CAP_SYS_TTY_CONFIG);
176 static uint64_t arg_caps_ambient = 0;
177 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
/* Array of custom mounts and its element count; walked by custom_mount_check_all(). */
178 static CustomMount *arg_custom_mounts = NULL;
179 static size_t arg_n_custom_mounts = 0;
180 static char **arg_setenv = NULL;
181 static bool arg_quiet = false;
182 static bool arg_register = true;
183 static bool arg_keep_unit = false;
184 static char **arg_network_interfaces = NULL;
185 static char **arg_network_macvlan = NULL;
186 static char **arg_network_ipvlan = NULL;
187 static bool arg_network_veth = false;
188 static char **arg_network_veth_extra = NULL;
189 static char *arg_network_bridge = NULL;
/* Stored with a "vz-" prefix prepended to the user-supplied zone name (see parse_argv()). */
190 static char *arg_network_zone = NULL;
191 static char *arg_network_namespace_path = NULL;
192 static PagerFlags arg_pager_flags = 0;
193 static unsigned long arg_personality = PERSONALITY_INVALID;
194 static char *arg_image = NULL;
195 static char *arg_oci_bundle = NULL;
196 static VolatileMode arg_volatile_mode = VOLATILE_NO;
197 static ExposePort *arg_expose_ports = NULL;
198 static char **arg_property = NULL;
199 static sd_bus_message *arg_property_message = NULL;
200 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
/* The default range of 0x10000 is 65536 IDs. */
201 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
202 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
203 static int arg_kill_signal = 0;
/* Chosen by detect_unified_cgroup_hierarchy_from_environment() or ..._from_image(). */
204 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of SETTING_* flags recording which settings were configured explicitly. */
205 static SettingsMask arg_settings_mask = 0;
206 static int arg_settings_trusted = -1;
207 static char **arg_parameters = NULL;
/* May be replaced by the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment(). */
208 static const char *arg_container_service_name = "systemd-nspawn";
209 static bool arg_notify_ready = false;
/* Recomputed in parse_environment() from cg_ns_supported() and $SYSTEMD_NSPAWN_USE_CGNS. */
210 static bool arg_use_cgns = true;
/* Individual CLONE_NEW* bits are switched by the $SYSTEMD_NSPAWN_SHARE_* variables, see parse_share_ns_env(). */
211 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by parse_mount_settings_env(). */
212 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
213 static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
214 static char **arg_syscall_allow_list = NULL;
215 static char **arg_syscall_deny_list = NULL;
216 #if HAVE_SECCOMP
217 static scmp_filter_ctx arg_seccomp = NULL;
218 #endif
219 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
220 static bool arg_no_new_privileges = false;
221 static int arg_oom_score_adjust = 0;
222 static bool arg_oom_score_adjust_set = false;
223 static CPUSet arg_cpu_set = {};
224 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
225 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
226 static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
227 static DeviceNode* arg_extra_nodes = NULL;
228 static size_t arg_n_extra_nodes = 0;
229 static char **arg_sysctl = NULL;
/* Selected by handle_arg_console(). */
230 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
231 static Credential *arg_credentials = NULL;
232 static size_t arg_n_credentials = 0;
233 static char **arg_bind_user = NULL;
/* May be set from $SYSTEMD_SUPPRESS_SYNC in parse_environment(). */
234 static bool arg_suppress_sync = false;
235 static char *arg_settings_filename = NULL;
236 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
237
/* Associates each dynamically allocated global above with the function that releases it: freep for
 * single allocations, strv_freep for string vectors, and type-specific helpers otherwise.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * functions are invoked when the program exits — confirm against its definition. */
238 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
239 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
240 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
248 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
253 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
256 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
257 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
258 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
259 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
260 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
261 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
262 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
263 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* arg_seccomp only exists when built with seccomp support, so its destructor is guarded the same way. */
264 #if HAVE_SECCOMP
265 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
266 #endif
267 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
268 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
269 STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
270 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
271
272 static int handle_arg_console(const char *arg) {
273 if (streq(arg, "help")) {
274 puts("autopipe\n"
275 "interactive\n"
276 "passive\n"
277 "pipe\n"
278 "read-only");
279 return 0;
280 }
281
282 if (streq(arg, "interactive"))
283 arg_console_mode = CONSOLE_INTERACTIVE;
284 else if (streq(arg, "read-only"))
285 arg_console_mode = CONSOLE_READ_ONLY;
286 else if (streq(arg, "passive"))
287 arg_console_mode = CONSOLE_PASSIVE;
288 else if (streq(arg, "pipe")) {
289 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
290 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
291 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
292 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
293 "Proceeding anyway.");
294
295 arg_console_mode = CONSOLE_PIPE;
296 } else if (streq(arg, "autopipe")) {
297 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
298 arg_console_mode = CONSOLE_INTERACTIVE;
299 else
300 arg_console_mode = CONSOLE_PIPE;
301 } else
302 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
303
304 arg_settings_mask |= SETTING_CONSOLE_MODE;
305 return 1;
306 }
307
308 static int help(void) {
309 _cleanup_free_ char *link = NULL;
310 int r;
311
312 pager_open(arg_pager_flags);
313
314 r = terminal_urlify_man("systemd-nspawn", "1", &link);
315 if (r < 0)
316 return log_oom();
317
318 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
319 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
320 " -h --help Show this help\n"
321 " --version Print version string\n"
322 " -q --quiet Do not show status information\n"
323 " --no-pager Do not pipe output into a pager\n"
324 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
325 "%3$sImage:%4$s\n"
326 " -D --directory=PATH Root directory for the container\n"
327 " --template=PATH Initialize root directory from template directory,\n"
328 " if missing\n"
329 " -x --ephemeral Run container with snapshot of root directory, and\n"
330 " remove it after exit\n"
331 " -i --image=PATH Root file system disk image (or device node) for\n"
332 " the container\n"
333 " --oci-bundle=PATH OCI bundle directory\n"
334 " --read-only Mount the root directory read-only\n"
335 " --volatile[=MODE] Run the system in volatile mode\n"
336 " --root-hash=HASH Specify verity root hash for root disk image\n"
337 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
338 " as a DER encoded PKCS7, either as a path to a file\n"
339 " or as an ASCII base64 encoded string prefixed by\n"
340 " 'base64:'\n"
341 " --verity-data=PATH Specify hash device for verity\n"
342 " --pivot-root=PATH[:PATH]\n"
343 " Pivot root to given directory in the container\n\n"
344 "%3$sExecution:%4$s\n"
345 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
346 " -b --boot Boot up full system (i.e. invoke init)\n"
347 " --chdir=PATH Set working directory in the container\n"
348 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
349 " -u --user=USER Run the command under specified user or UID\n"
350 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
351 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
352 " --suppress-sync=BOOLEAN\n"
353 " Suppress any form of disk data synchronization\n\n"
354 "%3$sSystem Identity:%4$s\n"
355 " -M --machine=NAME Set the machine name for the container\n"
356 " --hostname=NAME Override the hostname for the container\n"
357 " --uuid=UUID Set a specific machine UUID for the container\n\n"
358 "%3$sProperties:%4$s\n"
359 " -S --slice=SLICE Place the container in the specified slice\n"
360 " --property=NAME=VALUE Set scope unit property\n"
361 " --register=BOOLEAN Register container as machine\n"
362 " --keep-unit Do not register a scope for the machine, reuse\n"
363 " the service unit nspawn is running in\n\n"
364 "%3$sUser Namespacing:%4$s\n"
365 " --private-users=no Run without user namespacing\n"
366 " --private-users=yes|pick|identity\n"
367 " Run within user namespace, autoselect UID/GID range\n"
368 " --private-users=UIDBASE[:NUIDS]\n"
369 " Similar, but with user configured UID/GID range\n"
370 " --private-users-ownership=MODE\n"
371 " Adjust ('chown') or map ('map') OS tree ownership\n"
372 " to private UID/GID range\n"
373 " -U Equivalent to --private-users=pick and\n"
374 " --private-users-ownership=auto\n\n"
375 "%3$sNetworking:%4$s\n"
376 " --private-network Disable network in container\n"
377 " --network-interface=INTERFACE\n"
378 " Assign an existing network interface to the\n"
379 " container\n"
380 " --network-macvlan=INTERFACE\n"
381 " Create a macvlan network interface based on an\n"
382 " existing network interface to the container\n"
383 " --network-ipvlan=INTERFACE\n"
384 " Create an ipvlan network interface based on an\n"
385 " existing network interface to the container\n"
386 " -n --network-veth Add a virtual Ethernet connection between host\n"
387 " and container\n"
388 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
389 " Add an additional virtual Ethernet link between\n"
390 " host and container\n"
391 " --network-bridge=INTERFACE\n"
392 " Add a virtual Ethernet connection to the container\n"
393 " and attach it to an existing bridge on the host\n"
394 " --network-zone=NAME Similar, but attach the new interface to an\n"
395 " an automatically managed bridge interface\n"
396 " --network-namespace-path=PATH\n"
397 " Set network namespace to the one represented by\n"
398 " the specified kernel namespace file node\n"
399 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
400 " Expose a container IP port on the host\n\n"
401 "%3$sSecurity:%4$s\n"
402 " --capability=CAP In addition to the default, retain specified\n"
403 " capability\n"
404 " --drop-capability=CAP Drop the specified capability from the default set\n"
405 " --ambient-capability=CAP\n"
406 " Sets the specified capability for the started\n"
407 " process. Not useful if booting a machine.\n"
408 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
409 " --system-call-filter=LIST|~LIST\n"
410 " Permit/prohibit specific system calls\n"
411 " -Z --selinux-context=SECLABEL\n"
412 " Set the SELinux security context to be used by\n"
413 " processes in the container\n"
414 " -L --selinux-apifs-context=SECLABEL\n"
415 " Set the SELinux security context to be used by\n"
416 " API/tmpfs file systems in the container\n\n"
417 "%3$sResources:%4$s\n"
418 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
419 " --oom-score-adjust=VALUE\n"
420 " Adjust the OOM score value for the payload\n"
421 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
422 " --personality=ARCH Pick personality for this container\n\n"
423 "%3$sIntegration:%4$s\n"
424 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
425 " --timezone=MODE Select mode of /etc/localtime initialization\n"
426 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
427 " host, try-guest, try-host\n"
428 " -j Equivalent to --link-journal=try-guest\n\n"
429 "%3$sMounts:%4$s\n"
430 " --bind=PATH[:PATH[:OPTIONS]]\n"
431 " Bind mount a file or directory from the host into\n"
432 " the container\n"
433 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
434 " Similar, but creates a read-only bind mount\n"
435 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
436 " it\n"
437 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
438 " --overlay=PATH[:PATH...]:PATH\n"
439 " Create an overlay mount from the host to \n"
440 " the container\n"
441 " --overlay-ro=PATH[:PATH...]:PATH\n"
442 " Similar, but creates a read-only overlay mount\n"
443 " --bind-user=NAME Bind user from host to container\n\n"
444 "%3$sInput/Output:%4$s\n"
445 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
446 " set up for the container.\n"
447 " -P --pipe Equivalent to --console=pipe\n\n"
448 "%3$sCredentials:%4$s\n"
449 " --set-credential=ID:VALUE\n"
450 " Pass a credential with literal value to container.\n"
451 " --load-credential=ID:PATH\n"
452 " Load credential to pass to container from file or\n"
453 " AF_UNIX stream socket.\n"
454 "\nSee the %2$s for details.\n",
455 program_invocation_short_name,
456 link,
457 ansi_underline(),
458 ansi_normal(),
459 ansi_highlight(),
460 ansi_normal());
461
462 return 0;
463 }
464
465 static int custom_mount_check_all(void) {
466 size_t i;
467
468 for (i = 0; i < arg_n_custom_mounts; i++) {
469 CustomMount *m = &arg_custom_mounts[i];
470
471 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
472 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
473 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
474 "--private-users-ownership=own may not be combined with custom root mounts.");
475 if (arg_uid_shift == UID_INVALID)
476 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
477 "--private-users with automatic UID shift may not be combined with custom root mounts.");
478 }
479 }
480
481 return 0;
482 }
483
484 static int detect_unified_cgroup_hierarchy_from_environment(void) {
485 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
486 int r;
487
488 /* Allow the user to control whether the unified hierarchy is used */
489
490 e = getenv(var);
491 if (!e) {
492 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
493 var = "UNIFIED_CGROUP_HIERARCHY";
494 e = getenv(var);
495 }
496
497 if (!isempty(e)) {
498 r = parse_boolean(e);
499 if (r < 0)
500 return log_error_errno(r, "Failed to parse $%s: %m", var);
501 if (r > 0)
502 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
503 else
504 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
505 }
506
507 return 0;
508 }
509
510 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
511 int r;
512
513 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
514 * in the image actually supports. */
515 r = cg_all_unified();
516 if (r < 0)
517 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
518 if (r > 0) {
519 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
520 * routine only detects 231, so we'll have a false negative here for 230. */
521 r = systemd_installation_has_version(directory, "230");
522 if (r < 0)
523 return log_error_errno(r, "Failed to determine systemd version in container: %m");
524 if (r > 0)
525 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
526 else
527 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
528 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
529 /* Mixed cgroup hierarchy support was added in 233 */
530 r = systemd_installation_has_version(directory, "233");
531 if (r < 0)
532 return log_error_errno(r, "Failed to determine systemd version in container: %m");
533 if (r > 0)
534 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
535 else
536 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
537 } else
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
539
540 log_debug("Using %s hierarchy for container.",
541 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
542 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
543
544 return 0;
545 }
546
547 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
548 uint64_t mask = 0;
549 int r;
550
551 for (;;) {
552 _cleanup_free_ char *t = NULL;
553
554 r = extract_first_word(&spec, &t, ",", 0);
555 if (r < 0)
556 return log_error_errno(r, "Failed to parse capability %s.", t);
557 if (r == 0)
558 break;
559
560 if (streq(t, "help")) {
561 for (int i = 0; i < capability_list_length(); i++) {
562 const char *name;
563
564 name = capability_to_name(i);
565 if (name)
566 puts(name);
567 }
568
569 return 0; /* quit */
570 }
571
572 if (streq(t, "all"))
573 mask = UINT64_MAX;
574 else {
575 r = capability_from_name(t);
576 if (r < 0)
577 return log_error_errno(r, "Failed to parse capability %s.", t);
578
579 mask |= 1ULL << r;
580 }
581 }
582
583 *ret_mask = mask;
584 return 1; /* continue */
585 }
586
587 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
588 int r;
589
590 r = getenv_bool(name);
591 if (r == -ENXIO)
592 return 0;
593 if (r < 0)
594 return log_error_errno(r, "Failed to parse $%s: %m", name);
595
596 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
597 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
598 return 0;
599 }
600
601 static int parse_mount_settings_env(void) {
602 const char *e;
603 int r;
604
605 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
606 if (r < 0 && r != -ENXIO)
607 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
608 if (r >= 0)
609 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
610
611 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
612 if (streq_ptr(e, "network"))
613 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
614
615 else if (e) {
616 r = parse_boolean(e);
617 if (r < 0)
618 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
619
620 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
621 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
622 }
623
624 return 0;
625 }
626
627 static int parse_environment(void) {
628 const char *e;
629 int r;
630
631 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
632 if (r < 0)
633 return r;
634 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
635 if (r < 0)
636 return r;
637 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
638 if (r < 0)
639 return r;
640 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
641 if (r < 0)
642 return r;
643
644 r = parse_mount_settings_env();
645 if (r < 0)
646 return r;
647
648 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
649 * even if it is supported. If not supported, it has no effect. */
650 if (!cg_ns_supported())
651 arg_use_cgns = false;
652 else {
653 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
654 if (r < 0) {
655 if (r != -ENXIO)
656 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
657
658 arg_use_cgns = true;
659 } else {
660 arg_use_cgns = r > 0;
661 arg_settings_mask |= SETTING_USE_CGNS;
662 }
663 }
664
665 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
666 if (e)
667 arg_container_service_name = e;
668
669 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
670 if (r >= 0)
671 arg_suppress_sync = r;
672 else if (r != -ENXIO)
673 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
674
675 return detect_unified_cgroup_hierarchy_from_environment();
676 }
677
678 static int parse_argv(int argc, char *argv[]) {
679 enum {
680 ARG_VERSION = 0x100,
681 ARG_PRIVATE_NETWORK,
682 ARG_UUID,
683 ARG_READ_ONLY,
684 ARG_CAPABILITY,
685 ARG_AMBIENT_CAPABILITY,
686 ARG_DROP_CAPABILITY,
687 ARG_LINK_JOURNAL,
688 ARG_BIND,
689 ARG_BIND_RO,
690 ARG_TMPFS,
691 ARG_OVERLAY,
692 ARG_OVERLAY_RO,
693 ARG_INACCESSIBLE,
694 ARG_SHARE_SYSTEM,
695 ARG_REGISTER,
696 ARG_KEEP_UNIT,
697 ARG_NETWORK_INTERFACE,
698 ARG_NETWORK_MACVLAN,
699 ARG_NETWORK_IPVLAN,
700 ARG_NETWORK_BRIDGE,
701 ARG_NETWORK_ZONE,
702 ARG_NETWORK_VETH_EXTRA,
703 ARG_NETWORK_NAMESPACE_PATH,
704 ARG_PERSONALITY,
705 ARG_VOLATILE,
706 ARG_TEMPLATE,
707 ARG_PROPERTY,
708 ARG_PRIVATE_USERS,
709 ARG_KILL_SIGNAL,
710 ARG_SETTINGS,
711 ARG_CHDIR,
712 ARG_PIVOT_ROOT,
713 ARG_PRIVATE_USERS_CHOWN,
714 ARG_PRIVATE_USERS_OWNERSHIP,
715 ARG_NOTIFY_READY,
716 ARG_ROOT_HASH,
717 ARG_ROOT_HASH_SIG,
718 ARG_VERITY_DATA,
719 ARG_SYSTEM_CALL_FILTER,
720 ARG_RLIMIT,
721 ARG_HOSTNAME,
722 ARG_NO_NEW_PRIVILEGES,
723 ARG_OOM_SCORE_ADJUST,
724 ARG_CPU_AFFINITY,
725 ARG_RESOLV_CONF,
726 ARG_TIMEZONE,
727 ARG_CONSOLE,
728 ARG_PIPE,
729 ARG_OCI_BUNDLE,
730 ARG_NO_PAGER,
731 ARG_SET_CREDENTIAL,
732 ARG_LOAD_CREDENTIAL,
733 ARG_BIND_USER,
734 ARG_SUPPRESS_SYNC,
735 };
736
737 static const struct option options[] = {
738 { "help", no_argument, NULL, 'h' },
739 { "version", no_argument, NULL, ARG_VERSION },
740 { "directory", required_argument, NULL, 'D' },
741 { "template", required_argument, NULL, ARG_TEMPLATE },
742 { "ephemeral", no_argument, NULL, 'x' },
743 { "user", required_argument, NULL, 'u' },
744 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
745 { "as-pid2", no_argument, NULL, 'a' },
746 { "boot", no_argument, NULL, 'b' },
747 { "uuid", required_argument, NULL, ARG_UUID },
748 { "read-only", no_argument, NULL, ARG_READ_ONLY },
749 { "capability", required_argument, NULL, ARG_CAPABILITY },
750 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
751 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
752 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
753 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
754 { "bind", required_argument, NULL, ARG_BIND },
755 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
756 { "tmpfs", required_argument, NULL, ARG_TMPFS },
757 { "overlay", required_argument, NULL, ARG_OVERLAY },
758 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
759 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
760 { "machine", required_argument, NULL, 'M' },
761 { "hostname", required_argument, NULL, ARG_HOSTNAME },
762 { "slice", required_argument, NULL, 'S' },
763 { "setenv", required_argument, NULL, 'E' },
764 { "selinux-context", required_argument, NULL, 'Z' },
765 { "selinux-apifs-context", required_argument, NULL, 'L' },
766 { "quiet", no_argument, NULL, 'q' },
767 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
768 { "register", required_argument, NULL, ARG_REGISTER },
769 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
770 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
771 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
772 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
773 { "network-veth", no_argument, NULL, 'n' },
774 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
775 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
776 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
777 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
778 { "personality", required_argument, NULL, ARG_PERSONALITY },
779 { "image", required_argument, NULL, 'i' },
780 { "volatile", optional_argument, NULL, ARG_VOLATILE },
781 { "port", required_argument, NULL, 'p' },
782 { "property", required_argument, NULL, ARG_PROPERTY },
783 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
784 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
785 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
786 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
787 { "settings", required_argument, NULL, ARG_SETTINGS },
788 { "chdir", required_argument, NULL, ARG_CHDIR },
789 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
790 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
791 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
792 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
793 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
794 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
795 { "rlimit", required_argument, NULL, ARG_RLIMIT },
796 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
797 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
798 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
799 { "timezone", required_argument, NULL, ARG_TIMEZONE },
800 { "console", required_argument, NULL, ARG_CONSOLE },
801 { "pipe", no_argument, NULL, ARG_PIPE },
802 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
803 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
804 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
805 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
806 { "bind-user", required_argument, NULL, ARG_BIND_USER },
807 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
808 {}
809 };
810
811 int c, r;
812 uint64_t plus = 0, minus = 0;
813 bool mask_all_settings = false, mask_no_settings = false;
814
815 assert(argc >= 0);
816 assert(argv);
817
818 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
819 switch (c) {
820
821 case 'h':
822 return help();
823
824 case ARG_VERSION:
825 return version();
826
827 case 'D':
828 r = parse_path_argument(optarg, false, &arg_directory);
829 if (r < 0)
830 return r;
831
832 arg_settings_mask |= SETTING_DIRECTORY;
833 break;
834
835 case ARG_TEMPLATE:
836 r = parse_path_argument(optarg, false, &arg_template);
837 if (r < 0)
838 return r;
839
840 arg_settings_mask |= SETTING_DIRECTORY;
841 break;
842
843 case 'i':
844 r = parse_path_argument(optarg, false, &arg_image);
845 if (r < 0)
846 return r;
847
848 arg_settings_mask |= SETTING_DIRECTORY;
849 break;
850
851 case ARG_OCI_BUNDLE:
852 r = parse_path_argument(optarg, false, &arg_oci_bundle);
853 if (r < 0)
854 return r;
855
856 break;
857
858 case 'x':
859 arg_ephemeral = true;
860 arg_settings_mask |= SETTING_EPHEMERAL;
861 break;
862
863 case 'u':
864 r = free_and_strdup(&arg_user, optarg);
865 if (r < 0)
866 return log_oom();
867
868 arg_settings_mask |= SETTING_USER;
869 break;
870
871 case ARG_NETWORK_ZONE: {
872 char *j;
873
874 j = strjoin("vz-", optarg);
875 if (!j)
876 return log_oom();
877
878 if (!ifname_valid(j)) {
879 log_error("Network zone name not valid: %s", j);
880 free(j);
881 return -EINVAL;
882 }
883
884 free_and_replace(arg_network_zone, j);
885
886 arg_network_veth = true;
887 arg_private_network = true;
888 arg_settings_mask |= SETTING_NETWORK;
889 break;
890 }
891
892 case ARG_NETWORK_BRIDGE:
893
894 if (!ifname_valid(optarg))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
896 "Bridge interface name not valid: %s", optarg);
897
898 r = free_and_strdup(&arg_network_bridge, optarg);
899 if (r < 0)
900 return log_oom();
901
902 _fallthrough_;
903 case 'n':
904 arg_network_veth = true;
905 arg_private_network = true;
906 arg_settings_mask |= SETTING_NETWORK;
907 break;
908
909 case ARG_NETWORK_VETH_EXTRA:
910 r = veth_extra_parse(&arg_network_veth_extra, optarg);
911 if (r < 0)
912 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
913
914 arg_private_network = true;
915 arg_settings_mask |= SETTING_NETWORK;
916 break;
917
918 case ARG_NETWORK_INTERFACE:
919 if (!ifname_valid(optarg))
920 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
921 "Network interface name not valid: %s", optarg);
922
923 r = test_network_interface_initialized(optarg);
924 if (r < 0)
925 return r;
926
927 if (strv_extend(&arg_network_interfaces, optarg) < 0)
928 return log_oom();
929
930 arg_private_network = true;
931 arg_settings_mask |= SETTING_NETWORK;
932 break;
933
934 case ARG_NETWORK_MACVLAN:
935
936 if (!ifname_valid(optarg))
937 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
938 "MACVLAN network interface name not valid: %s", optarg);
939
940 r = test_network_interface_initialized(optarg);
941 if (r < 0)
942 return r;
943
944 if (strv_extend(&arg_network_macvlan, optarg) < 0)
945 return log_oom();
946
947 arg_private_network = true;
948 arg_settings_mask |= SETTING_NETWORK;
949 break;
950
951 case ARG_NETWORK_IPVLAN:
952
953 if (!ifname_valid(optarg))
954 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
955 "IPVLAN network interface name not valid: %s", optarg);
956
957 r = test_network_interface_initialized(optarg);
958 if (r < 0)
959 return r;
960
961 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
962 return log_oom();
963
964 _fallthrough_;
965 case ARG_PRIVATE_NETWORK:
966 arg_private_network = true;
967 arg_settings_mask |= SETTING_NETWORK;
968 break;
969
970 case ARG_NETWORK_NAMESPACE_PATH:
971 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
972 if (r < 0)
973 return r;
974
975 arg_settings_mask |= SETTING_NETWORK;
976 break;
977
978 case 'b':
979 if (arg_start_mode == START_PID2)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
981 "--boot and --as-pid2 may not be combined.");
982
983 arg_start_mode = START_BOOT;
984 arg_settings_mask |= SETTING_START_MODE;
985 break;
986
987 case 'a':
988 if (arg_start_mode == START_BOOT)
989 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
990 "--boot and --as-pid2 may not be combined.");
991
992 arg_start_mode = START_PID2;
993 arg_settings_mask |= SETTING_START_MODE;
994 break;
995
996 case ARG_UUID:
997 r = sd_id128_from_string(optarg, &arg_uuid);
998 if (r < 0)
999 return log_error_errno(r, "Invalid UUID: %s", optarg);
1000
1001 if (sd_id128_is_null(arg_uuid))
1002 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1003 "Machine UUID may not be all zeroes.");
1004
1005 arg_settings_mask |= SETTING_MACHINE_ID;
1006 break;
1007
1008 case 'S': {
1009 _cleanup_free_ char *mangled = NULL;
1010
1011 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1012 if (r < 0)
1013 return log_oom();
1014
1015 free_and_replace(arg_slice, mangled);
1016 arg_settings_mask |= SETTING_SLICE;
1017 break;
1018 }
1019
1020 case 'M':
1021 if (isempty(optarg))
1022 arg_machine = mfree(arg_machine);
1023 else {
1024 if (!hostname_is_valid(optarg, 0))
1025 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1026 "Invalid machine name: %s", optarg);
1027
1028 r = free_and_strdup(&arg_machine, optarg);
1029 if (r < 0)
1030 return log_oom();
1031 }
1032 break;
1033
1034 case ARG_HOSTNAME:
1035 if (isempty(optarg))
1036 arg_hostname = mfree(arg_hostname);
1037 else {
1038 if (!hostname_is_valid(optarg, 0))
1039 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1040 "Invalid hostname: %s", optarg);
1041
1042 r = free_and_strdup(&arg_hostname, optarg);
1043 if (r < 0)
1044 return log_oom();
1045 }
1046
1047 arg_settings_mask |= SETTING_HOSTNAME;
1048 break;
1049
1050 case 'Z':
1051 arg_selinux_context = optarg;
1052 break;
1053
1054 case 'L':
1055 arg_selinux_apifs_context = optarg;
1056 break;
1057
1058 case ARG_READ_ONLY:
1059 arg_read_only = true;
1060 arg_settings_mask |= SETTING_READ_ONLY;
1061 break;
1062
1063 case ARG_AMBIENT_CAPABILITY: {
1064 uint64_t m;
1065 r = parse_capability_spec(optarg, &m);
1066 if (r <= 0)
1067 return r;
1068 arg_caps_ambient |= m;
1069 arg_settings_mask |= SETTING_CAPABILITY;
1070 break;
1071 }
1072 case ARG_CAPABILITY:
1073 case ARG_DROP_CAPABILITY: {
1074 uint64_t m;
1075 r = parse_capability_spec(optarg, &m);
1076 if (r <= 0)
1077 return r;
1078
1079 if (c == ARG_CAPABILITY)
1080 plus |= m;
1081 else
1082 minus |= m;
1083 arg_settings_mask |= SETTING_CAPABILITY;
1084 break;
1085 }
1086 case ARG_NO_NEW_PRIVILEGES:
1087 r = parse_boolean(optarg);
1088 if (r < 0)
1089 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1090
1091 arg_no_new_privileges = r;
1092 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1093 break;
1094
1095 case 'j':
1096 arg_link_journal = LINK_GUEST;
1097 arg_link_journal_try = true;
1098 arg_settings_mask |= SETTING_LINK_JOURNAL;
1099 break;
1100
1101 case ARG_LINK_JOURNAL:
1102 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1103 if (r < 0)
1104 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1105
1106 arg_settings_mask |= SETTING_LINK_JOURNAL;
1107 break;
1108
1109 case ARG_BIND:
1110 case ARG_BIND_RO:
1111 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1112 if (r < 0)
1113 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1114
1115 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1116 break;
1117
1118 case ARG_TMPFS:
1119 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1120 if (r < 0)
1121 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1122
1123 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1124 break;
1125
1126 case ARG_OVERLAY:
1127 case ARG_OVERLAY_RO:
1128 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1129 if (r == -EADDRNOTAVAIL)
1130 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1131 if (r < 0)
1132 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1133
1134 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1135 break;
1136
1137 case ARG_INACCESSIBLE:
1138 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1139 if (r < 0)
1140 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1141
1142 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1143 break;
1144
1145 case 'E':
1146 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1147 if (r < 0)
1148 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1149
1150 arg_settings_mask |= SETTING_ENVIRONMENT;
1151 break;
1152
1153 case 'q':
1154 arg_quiet = true;
1155 break;
1156
1157 case ARG_SHARE_SYSTEM:
1158 /* We don't officially support this anymore, except for compat reasons. People should use the
1159 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1160 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1161 arg_clone_ns_flags = 0;
1162 break;
1163
1164 case ARG_REGISTER:
1165 r = parse_boolean(optarg);
1166 if (r < 0) {
1167 log_error("Failed to parse --register= argument: %s", optarg);
1168 return r;
1169 }
1170
1171 arg_register = r;
1172 break;
1173
1174 case ARG_KEEP_UNIT:
1175 arg_keep_unit = true;
1176 break;
1177
1178 case ARG_PERSONALITY:
1179
1180 arg_personality = personality_from_string(optarg);
1181 if (arg_personality == PERSONALITY_INVALID)
1182 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1183 "Unknown or unsupported personality '%s'.", optarg);
1184
1185 arg_settings_mask |= SETTING_PERSONALITY;
1186 break;
1187
1188 case ARG_VOLATILE:
1189
1190 if (!optarg)
1191 arg_volatile_mode = VOLATILE_YES;
1192 else if (streq(optarg, "help")) {
1193 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1194 return 0;
1195 } else {
1196 VolatileMode m;
1197
1198 m = volatile_mode_from_string(optarg);
1199 if (m < 0)
1200 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1201 "Failed to parse --volatile= argument: %s", optarg);
1202 else
1203 arg_volatile_mode = m;
1204 }
1205
1206 arg_settings_mask |= SETTING_VOLATILE_MODE;
1207 break;
1208
1209 case 'p':
1210 r = expose_port_parse(&arg_expose_ports, optarg);
1211 if (r == -EEXIST)
1212 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1213 if (r < 0)
1214 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1215
1216 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1217 break;
1218
1219 case ARG_PROPERTY:
1220 if (strv_extend(&arg_property, optarg) < 0)
1221 return log_oom();
1222
1223 break;
1224
1225 case ARG_PRIVATE_USERS: {
1226 int boolean;
1227
1228 if (!optarg)
1229 boolean = true;
1230 else if (!in_charset(optarg, DIGITS))
1231 /* do *not* parse numbers as booleans */
1232 boolean = parse_boolean(optarg);
1233 else
1234 boolean = -1;
1235
1236 if (boolean == 0) {
1237 /* no: User namespacing off */
1238 arg_userns_mode = USER_NAMESPACE_NO;
1239 arg_uid_shift = UID_INVALID;
1240 arg_uid_range = UINT32_C(0x10000);
1241 } else if (boolean > 0) {
1242 /* yes: User namespacing on, UID range is read from root dir */
1243 arg_userns_mode = USER_NAMESPACE_FIXED;
1244 arg_uid_shift = UID_INVALID;
1245 arg_uid_range = UINT32_C(0x10000);
1246 } else if (streq(optarg, "pick")) {
1247 /* pick: User namespacing on, UID range is picked randomly */
1248 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1249 * implied by USER_NAMESPACE_PICK
1250 * further down. */
1251 arg_uid_shift = UID_INVALID;
1252 arg_uid_range = UINT32_C(0x10000);
1253
1254 } else if (streq(optarg, "identity")) {
1255 /* identity: User namespacing on, the 0…0xFFFF UID range is mapped to
1256 * itself, i.e. we don't actually map anything, but still benefit from the
1257 * isolation of capability sets. */
1258 arg_userns_mode = USER_NAMESPACE_FIXED;
1259 arg_uid_shift = 0;
1260 arg_uid_range = UINT32_C(0x10000);
1261 } else {
1262 _cleanup_free_ char *buffer = NULL;
1263 const char *range, *shift;
1264
1265 /* anything else: User namespacing on, UID range is explicitly configured */
1266
1267 range = strchr(optarg, ':');
1268 if (range) {
1269 buffer = strndup(optarg, range - optarg);
1270 if (!buffer)
1271 return log_oom();
1272 shift = buffer;
1273
1274 range++;
1275 r = safe_atou32(range, &arg_uid_range);
1276 if (r < 0)
1277 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1278 } else
1279 shift = optarg;
1280
1281 r = parse_uid(shift, &arg_uid_shift);
1282 if (r < 0)
1283 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1284
1285 arg_userns_mode = USER_NAMESPACE_FIXED;
1286
1287 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1288 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1289 }
1290
1291 arg_settings_mask |= SETTING_USERNS;
1292 break;
1293 }
1294
1295 case 'U':
1296 if (userns_supported()) {
1297 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1298 * implied by USER_NAMESPACE_PICK
1299 * further down. */
1300 arg_uid_shift = UID_INVALID;
1301 arg_uid_range = UINT32_C(0x10000);
1302
1303 arg_settings_mask |= SETTING_USERNS;
1304 }
1305
1306 break;
1307
1308 case ARG_PRIVATE_USERS_CHOWN:
1309 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1310
1311 arg_settings_mask |= SETTING_USERNS;
1312 break;
1313
1314 case ARG_PRIVATE_USERS_OWNERSHIP:
1315 if (streq(optarg, "help")) {
1316 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1317 return 0;
1318 }
1319
1320 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1321 if (arg_userns_ownership < 0)
1322 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1323
1324 arg_settings_mask |= SETTING_USERNS;
1325 break;
1326
1327 case ARG_KILL_SIGNAL:
1328 if (streq(optarg, "help")) {
1329 DUMP_STRING_TABLE(signal, int, _NSIG);
1330 return 0;
1331 }
1332
1333 arg_kill_signal = signal_from_string(optarg);
1334 if (arg_kill_signal < 0)
1335 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1336
1337 arg_settings_mask |= SETTING_KILL_SIGNAL;
1338 break;
1339
1340 case ARG_SETTINGS:
1341
1342 /* no → do not read files
1343 * yes → read files, do not override cmdline, trust only subset
1344 * override → read files, override cmdline, trust only subset
1345 * trusted → read files, do not override cmdline, trust all
1346 */
1347
1348 r = parse_boolean(optarg);
1349 if (r < 0) {
1350 if (streq(optarg, "trusted")) {
1351 mask_all_settings = false;
1352 mask_no_settings = false;
1353 arg_settings_trusted = true;
1354
1355 } else if (streq(optarg, "override")) {
1356 mask_all_settings = false;
1357 mask_no_settings = true;
1358 arg_settings_trusted = -1;
1359 } else
1360 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1361 } else if (r > 0) {
1362 /* yes */
1363 mask_all_settings = false;
1364 mask_no_settings = false;
1365 arg_settings_trusted = -1;
1366 } else {
1367 /* no */
1368 mask_all_settings = true;
1369 mask_no_settings = false;
1370 arg_settings_trusted = false;
1371 }
1372
1373 break;
1374
1375 case ARG_CHDIR:
1376 if (!path_is_absolute(optarg))
1377 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1378 "Working directory %s is not an absolute path.", optarg);
1379
1380 r = free_and_strdup(&arg_chdir, optarg);
1381 if (r < 0)
1382 return log_oom();
1383
1384 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1385 break;
1386
1387 case ARG_PIVOT_ROOT:
1388 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1391
1392 arg_settings_mask |= SETTING_PIVOT_ROOT;
1393 break;
1394
1395 case ARG_NOTIFY_READY:
1396 r = parse_boolean(optarg);
1397 if (r < 0)
1398 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1399 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1400 arg_notify_ready = r;
1401 arg_settings_mask |= SETTING_NOTIFY_READY;
1402 break;
1403
1404 case ARG_ROOT_HASH: {
1405 _cleanup_free_ void *k = NULL;
1406 size_t l;
1407
1408 r = unhexmem(optarg, strlen(optarg), &k, &l);
1409 if (r < 0)
1410 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1411 if (l < sizeof(sd_id128_t))
1412 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1413
1414 free_and_replace(arg_verity_settings.root_hash, k);
1415 arg_verity_settings.root_hash_size = l;
1416 break;
1417 }
1418
1419 case ARG_ROOT_HASH_SIG: {
1420 char *value;
1421 size_t l;
1422 void *p;
1423
1424 if ((value = startswith(optarg, "base64:"))) {
1425 r = unbase64mem(value, strlen(value), &p, &l);
1426 if (r < 0)
1427 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1428
1429 } else {
1430 r = read_full_file(optarg, (char**) &p, &l);
1431 if (r < 0)
1432 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1433 }
1434
1435 free_and_replace(arg_verity_settings.root_hash_sig, p);
1436 arg_verity_settings.root_hash_sig_size = l;
1437 break;
1438 }
1439
1440 case ARG_VERITY_DATA:
1441 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1442 if (r < 0)
1443 return r;
1444 break;
1445
1446 case ARG_SYSTEM_CALL_FILTER: {
1447 bool negative;
1448 const char *items;
1449
1450 negative = optarg[0] == '~';
1451 items = negative ? optarg + 1 : optarg;
1452
1453 for (;;) {
1454 _cleanup_free_ char *word = NULL;
1455
1456 r = extract_first_word(&items, &word, NULL, 0);
1457 if (r == 0)
1458 break;
1459 if (r == -ENOMEM)
1460 return log_oom();
1461 if (r < 0)
1462 return log_error_errno(r, "Failed to parse system call filter: %m");
1463
1464 if (negative)
1465 r = strv_extend(&arg_syscall_deny_list, word);
1466 else
1467 r = strv_extend(&arg_syscall_allow_list, word);
1468 if (r < 0)
1469 return log_oom();
1470 }
1471
1472 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1473 break;
1474 }
1475
1476 case ARG_RLIMIT: {
1477 const char *eq;
1478 _cleanup_free_ char *name = NULL;
1479 int rl;
1480
1481 if (streq(optarg, "help")) {
1482 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1483 return 0;
1484 }
1485
1486 eq = strchr(optarg, '=');
1487 if (!eq)
1488 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1489 "--rlimit= expects an '=' assignment.");
1490
1491 name = strndup(optarg, eq - optarg);
1492 if (!name)
1493 return log_oom();
1494
1495 rl = rlimit_from_string_harder(name);
1496 if (rl < 0)
1497 return log_error_errno(rl, "Unknown resource limit: %s", name);
1498
1499 if (!arg_rlimit[rl]) {
1500 arg_rlimit[rl] = new0(struct rlimit, 1);
1501 if (!arg_rlimit[rl])
1502 return log_oom();
1503 }
1504
1505 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1506 if (r < 0)
1507 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1508
1509 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1510 break;
1511 }
1512
1513 case ARG_OOM_SCORE_ADJUST:
1514 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1515 if (r < 0)
1516 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1517
1518 arg_oom_score_adjust_set = true;
1519 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1520 break;
1521
1522 case ARG_CPU_AFFINITY: {
1523 CPUSet cpuset;
1524
1525 r = parse_cpu_set(optarg, &cpuset);
1526 if (r < 0)
1527 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1528
1529 cpu_set_reset(&arg_cpu_set);
1530 arg_cpu_set = cpuset;
1531 arg_settings_mask |= SETTING_CPU_AFFINITY;
1532 break;
1533 }
1534
1535 case ARG_RESOLV_CONF:
1536 if (streq(optarg, "help")) {
1537 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1538 return 0;
1539 }
1540
1541 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1542 if (arg_resolv_conf < 0)
1543 return log_error_errno(arg_resolv_conf,
1544 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1545
1546 arg_settings_mask |= SETTING_RESOLV_CONF;
1547 break;
1548
1549 case ARG_TIMEZONE:
1550 if (streq(optarg, "help")) {
1551 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1552 return 0;
1553 }
1554
1555 arg_timezone = timezone_mode_from_string(optarg);
1556 if (arg_timezone < 0)
1557 return log_error_errno(arg_timezone,
1558 "Failed to parse /etc/localtime mode: %s", optarg);
1559
1560 arg_settings_mask |= SETTING_TIMEZONE;
1561 break;
1562
1563 case ARG_CONSOLE:
1564 r = handle_arg_console(optarg);
1565 if (r <= 0)
1566 return r;
1567 break;
1568
1569 case 'P':
1570 case ARG_PIPE:
1571 r = handle_arg_console("pipe");
1572 if (r <= 0)
1573 return r;
1574 break;
1575
1576 case ARG_NO_PAGER:
1577 arg_pager_flags |= PAGER_DISABLE;
1578 break;
1579
1580 case ARG_SET_CREDENTIAL: {
1581 _cleanup_free_ char *word = NULL, *data = NULL;
1582 const char *p = optarg;
1583 Credential *a;
1584 ssize_t l;
1585
1586 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1587 if (r == -ENOMEM)
1588 return log_oom();
1589 if (r < 0)
1590 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1591 if (r == 0 || !p)
1592 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1593
1594 if (!credential_name_valid(word))
1595 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1596
1597 for (size_t i = 0; i < arg_n_credentials; i++)
1598 if (streq(arg_credentials[i].id, word))
1599 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1600
1601 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1602 if (l < 0)
1603 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1604
1605 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1606 if (!a)
1607 return log_oom();
1608
1609 a[arg_n_credentials++] = (Credential) {
1610 .id = TAKE_PTR(word),
1611 .data = TAKE_PTR(data),
1612 .size = l,
1613 };
1614
1615 arg_credentials = a;
1616
1617 arg_settings_mask |= SETTING_CREDENTIALS;
1618 break;
1619 }
1620
1621 case ARG_LOAD_CREDENTIAL: {
1622 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1623 _cleanup_(erase_and_freep) char *data = NULL;
1624 _cleanup_free_ char *word = NULL, *j = NULL;
1625 const char *p = optarg;
1626 Credential *a;
1627 size_t size, i;
1628
1629 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1630 if (r == -ENOMEM)
1631 return log_oom();
1632 if (r < 0)
1633 return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
1634 if (r == 0 || !p)
1635 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", optarg);
1636
1637 if (!credential_name_valid(word))
1638 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1639
1640 for (i = 0; i < arg_n_credentials; i++)
1641 if (streq(arg_credentials[i].id, word))
1642 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1643
1644 if (path_is_absolute(p))
1645 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1646 else {
1647 const char *e;
1648
1649 r = get_credentials_dir(&e);
1650 if (r < 0)
1651 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1652
1653 j = path_join(e, p);
1654 if (!j)
1655 return log_oom();
1656 }
1657
1658 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1659 flags,
1660 NULL,
1661 &data, &size);
1662 if (r < 0)
1663 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1664
1665 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1666 if (!a)
1667 return log_oom();
1668
1669 a[arg_n_credentials++] = (Credential) {
1670 .id = TAKE_PTR(word),
1671 .data = TAKE_PTR(data),
1672 .size = size,
1673 };
1674
1675 arg_credentials = a;
1676
1677 arg_settings_mask |= SETTING_CREDENTIALS;
1678 break;
1679 }
1680
1681 case ARG_BIND_USER:
1682 if (!valid_user_group_name(optarg, 0))
1683 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1684
1685 if (strv_extend(&arg_bind_user, optarg) < 0)
1686 return log_oom();
1687
1688 arg_settings_mask |= SETTING_BIND_USER;
1689 break;
1690
1691 case ARG_SUPPRESS_SYNC:
1692 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1693 if (r < 0)
1694 return r;
1695
1696 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1697 break;
1698
1699 case '?':
1700 return -EINVAL;
1701
1702 default:
1703 assert_not_reached();
1704 }
1705
1706 if (argc > optind) {
1707 strv_free(arg_parameters);
1708 arg_parameters = strv_copy(argv + optind);
1709 if (!arg_parameters)
1710 return log_oom();
1711
1712 arg_settings_mask |= SETTING_START_MODE;
1713 }
1714
1715 if (arg_ephemeral && arg_template && !arg_directory)
1716 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1717 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1718 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1719 * --directory=". */
1720 arg_directory = TAKE_PTR(arg_template);
1721
1722 arg_caps_retain |= plus;
1723 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1724
1725 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1726 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1727 * indicate that. */
1728 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
1729 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
1730
1731 arg_caps_retain &= ~minus;
1732
1733 /* Make sure to parse environment before we reset the settings mask below */
1734 r = parse_environment();
1735 if (r < 0)
1736 return r;
1737
1738 /* Load all settings from .nspawn files */
1739 if (mask_no_settings)
1740 arg_settings_mask = 0;
1741
1742 /* Don't load any settings from .nspawn files */
1743 if (mask_all_settings)
1744 arg_settings_mask = _SETTINGS_MASK_ALL;
1745
1746 return 1;
1747 }
1748
1749 static int verify_arguments(void) {
1750 int r;
1751
1752 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1753 /* If we are running the stub init in the container, we don't need to look at what the init
1754 * in the container supports, because we are not using it. Let's immediately pick the right
1755 * setting based on the host system configuration.
1756 *
1757 * We only do this, if the user didn't use an environment variable to override the detection.
1758 */
1759
1760 r = cg_all_unified();
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1763 if (r > 0)
1764 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1765 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1766 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1767 else
1768 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1769 }
1770
1771 if (arg_userns_mode != USER_NAMESPACE_NO)
1772 arg_mount_settings |= MOUNT_USE_USERNS;
1773
1774 if (arg_private_network)
1775 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1776
1777 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1778 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1779 arg_register = false;
1780 if (arg_start_mode != START_PID1)
1781 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1782 }
1783
1784 if (arg_userns_ownership < 0)
1785 arg_userns_ownership =
1786 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1787 USER_NAMESPACE_OWNERSHIP_OFF;
1788
1789 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1790 arg_kill_signal = SIGRTMIN+3;
1791
1792 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1793 arg_read_only = true;
1794
1795 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1796 arg_read_only = true;
1797
1798 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1799 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1800 * The latter is not technically a user session, but we don't need to labour the point. */
1801 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1802
1803 if (arg_directory && arg_image)
1804 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1805
1806 if (arg_template && arg_image)
1807 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1808
1809 if (arg_template && !(arg_directory || arg_machine))
1810 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1811
1812 if (arg_ephemeral && arg_template)
1813 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1814
1815 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1816 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1817
1818 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1819 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1820
1821 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1823 "--read-only and --private-users-ownership=chown may not be combined.");
1824
1825 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1826 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1827 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1828 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1829 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1830
1831 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1832 * we need to error out, to avoid conflicts between different network options. */
1833 if (arg_network_namespace_path &&
1834 (arg_network_interfaces || arg_network_macvlan ||
1835 arg_network_ipvlan || arg_network_veth_extra ||
1836 arg_network_bridge || arg_network_zone ||
1837 arg_network_veth))
1838 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1839
1840 if (arg_network_bridge && arg_network_zone)
1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1842 "--network-bridge= and --network-zone= may not be combined.");
1843
1844 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1846
1847 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1848 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1849
1850 if (arg_expose_ports && !arg_private_network)
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1852
1853 if (arg_caps_ambient) {
1854 if (arg_caps_ambient == UINT64_MAX)
1855 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1856
1857 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1858 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1859
1860 if (arg_start_mode == START_BOOT)
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1862 }
1863
1864 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1865 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1866
1867 /* Drop duplicate --bind-user= entries */
1868 strv_uniq(arg_bind_user);
1869
1870 r = custom_mount_check_all();
1871 if (r < 0)
1872 return r;
1873
1874 return 0;
1875 }
1876
1877 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1878 assert(p);
1879
1880 if (arg_userns_mode == USER_NAMESPACE_NO)
1881 return 0;
1882
1883 if (uid == UID_INVALID && gid == GID_INVALID)
1884 return 0;
1885
1886 if (uid != UID_INVALID) {
1887 uid += arg_uid_shift;
1888
1889 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1890 return -EOVERFLOW;
1891 }
1892
1893 if (gid != GID_INVALID) {
1894 gid += (gid_t) arg_uid_shift;
1895
1896 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1897 return -EOVERFLOW;
1898 }
1899
1900 return RET_NERRNO(lchown(p, uid, gid));
1901 }
1902
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, passes it to
 * userns_lchown() with the given uid/gid. An already existing directory is left untouched and counts as
 * success. Returns 0 on success, a negative errno-style value otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) < 0) {
                if (errno == EEXIST)
                        return 0;

                return -errno;
        }

        return userns_lchown(full, uid, gid);
}
1916
/* Returns the result of matching 'path' against the two zoneinfo prefixes (relative and absolute) via
 * PATH_STARTSWITH_SET(). Callers in this file treat a NULL result as "does not point into zoneinfo" and a
 * non-NULL result as the time zone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1923
1924 static bool etc_writable(void) {
1925 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1926 }
1927
1928 static int setup_timezone(const char *dest) {
1929 _cleanup_free_ char *p = NULL, *etc = NULL;
1930 const char *where, *check;
1931 TimezoneMode m;
1932 int r;
1933
1934 assert(dest);
1935
1936 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1937 r = readlink_malloc("/etc/localtime", &p);
1938 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1939 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1940 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1941 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1942 else if (r < 0) {
1943 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1944 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1945 * file.
1946 *
1947 * Example:
1948 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1949 */
1950 return 0;
1951 } else if (arg_timezone == TIMEZONE_AUTO)
1952 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1953 else
1954 m = arg_timezone;
1955 } else
1956 m = arg_timezone;
1957
1958 if (m == TIMEZONE_OFF)
1959 return 0;
1960
1961 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1962 if (r < 0) {
1963 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1964 return 0;
1965 }
1966
1967 where = strjoina(etc, "/localtime");
1968
1969 switch (m) {
1970
1971 case TIMEZONE_DELETE:
1972 if (unlink(where) < 0)
1973 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1974
1975 return 0;
1976
1977 case TIMEZONE_SYMLINK: {
1978 _cleanup_free_ char *q = NULL;
1979 const char *z, *what;
1980
1981 z = timezone_from_path(p);
1982 if (!z) {
1983 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1984 return 0;
1985 }
1986
1987 r = readlink_malloc(where, &q);
1988 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1989 return 0; /* Already pointing to the right place? Then do nothing .. */
1990
1991 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1992 r = chase(check, dest, 0, NULL, NULL);
1993 if (r < 0)
1994 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1995 else {
1996 if (unlink(where) < 0 && errno != ENOENT) {
1997 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1998 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1999 return 0;
2000 }
2001
2002 what = strjoina("../usr/share/zoneinfo/", z);
2003 if (symlink(what, where) < 0) {
2004 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
2005 errno, "Failed to correct timezone of container, ignoring: %m");
2006 return 0;
2007 }
2008
2009 break;
2010 }
2011
2012 _fallthrough_;
2013 }
2014
2015 case TIMEZONE_BIND: {
2016 _cleanup_free_ char *resolved = NULL;
2017 int found;
2018
2019 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2020 if (found < 0) {
2021 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2022 return 0;
2023 }
2024
2025 if (found == 0) /* missing? */
2026 (void) touch(resolved);
2027
2028 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
2029 if (r >= 0)
2030 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2031
2032 _fallthrough_;
2033 }
2034
2035 case TIMEZONE_COPY:
2036 /* If mounting failed, try to copy */
2037 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
2038 if (r < 0) {
2039 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2040 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2041 return 0;
2042 }
2043
2044 break;
2045
2046 default:
2047 assert_not_reached();
2048 }
2049
2050 /* Fix permissions of the symlink or file copy we just created */
2051 r = userns_lchown(where, 0, 0);
2052 if (r < 0)
2053 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
2054
2055 return 0;
2056 }
2057
/* Checks whether 'path' exists. Returns 1 if it does, 0 if access() reports ENOENT, and a negative errno
 * (after logging at debug level) for any other access() failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2070
2071 static int resolved_listening(void) {
2072 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2073 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2074 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2075 int r;
2076
2077 /* Check if resolved is listening */
2078
2079 r = sd_bus_open_system(&bus);
2080 if (r < 0)
2081 return log_debug_errno(r, "Failed to open system bus: %m");
2082
2083 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2084 if (r < 0)
2085 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2086 if (r == 0)
2087 return 0;
2088
2089 r = sd_bus_get_property_string(bus,
2090 "org.freedesktop.resolve1",
2091 "/org/freedesktop/resolve1",
2092 "org.freedesktop.resolve1.Manager",
2093 "DNSStubListener",
2094 &error,
2095 &dns_stub_listener_mode);
2096 if (r < 0)
2097 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2098
2099 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2100 }
2101
2102 static int setup_resolv_conf(const char *dest) {
2103 _cleanup_free_ char *etc = NULL;
2104 const char *where, *what;
2105 ResolvConfMode m;
2106 int r;
2107
2108 assert(dest);
2109
2110 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2111 if (arg_private_network)
2112 m = RESOLV_CONF_OFF;
2113 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2114 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2115 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2116 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2117 else
2118 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2119
2120 } else
2121 m = arg_resolv_conf;
2122
2123 if (m == RESOLV_CONF_OFF)
2124 return 0;
2125
2126 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2127 if (r < 0) {
2128 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2129 return 0;
2130 }
2131
2132 where = strjoina(etc, "/resolv.conf");
2133
2134 if (m == RESOLV_CONF_DELETE) {
2135 if (unlink(where) < 0)
2136 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2137
2138 return 0;
2139 }
2140
2141 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2142 what = PRIVATE_STATIC_RESOLV_CONF;
2143 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2144 what = PRIVATE_UPLINK_RESOLV_CONF;
2145 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2146 what = PRIVATE_STUB_RESOLV_CONF;
2147 else
2148 what = "/etc/resolv.conf";
2149
2150 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2151 _cleanup_free_ char *resolved = NULL;
2152 int found;
2153
2154 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2155 if (found < 0) {
2156 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2157 return 0;
2158 }
2159
2160 if (found == 0) /* missing? */
2161 (void) touch(resolved);
2162
2163 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2164 if (r >= 0)
2165 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2166
2167 /* If that didn't work, let's copy the file */
2168 }
2169
2170 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2171 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
2172 else
2173 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
2174 if (r < 0) {
2175 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2176 * resolved or something similar runs inside and the symlink points there.
2177 *
2178 * If the disk image is read-only, there's also no point in complaining.
2179 */
2180 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2181 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2182 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2183 return 0;
2184 }
2185
2186 r = userns_lchown(where, 0, 0);
2187 if (r < 0)
2188 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2189
2190 return 0;
2191 }
2192
2193 static int setup_boot_id(void) {
2194 _cleanup_(unlink_and_freep) char *from = NULL;
2195 _cleanup_free_ char *path = NULL;
2196 sd_id128_t rnd = SD_ID128_NULL;
2197 const char *to;
2198 int r;
2199
2200 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2201
2202 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2203 if (r < 0)
2204 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2205
2206 r = sd_id128_randomize(&rnd);
2207 if (r < 0)
2208 return log_error_errno(r, "Failed to generate random boot id: %m");
2209
2210 r = id128_write(path, ID128_FORMAT_UUID, rnd);
2211 if (r < 0)
2212 return log_error_errno(r, "Failed to write boot id: %m");
2213
2214 from = TAKE_PTR(path);
2215 to = "/proc/sys/kernel/random/boot_id";
2216
2217 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2218 if (r < 0)
2219 return r;
2220
2221 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2222 }
2223
2224 static int copy_devnodes(const char *dest) {
2225 static const char devnodes[] =
2226 "null\0"
2227 "zero\0"
2228 "full\0"
2229 "random\0"
2230 "urandom\0"
2231 "tty\0"
2232 "net/tun\0";
2233
2234 int r = 0;
2235
2236 assert(dest);
2237
2238 BLOCK_WITH_UMASK(0000);
2239
2240 /* Create /dev/net, so that we can create /dev/net/tun in it */
2241 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2242 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2243
2244 NULSTR_FOREACH(d, devnodes) {
2245 _cleanup_free_ char *from = NULL, *to = NULL;
2246 struct stat st;
2247
2248 from = path_join("/dev/", d);
2249 if (!from)
2250 return log_oom();
2251
2252 to = path_join(dest, from);
2253 if (!to)
2254 return log_oom();
2255
2256 if (stat(from, &st) < 0) {
2257
2258 if (errno != ENOENT)
2259 return log_error_errno(errno, "Failed to stat %s: %m", from);
2260
2261 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2262 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2263 "%s is not a char or block device, cannot copy.", from);
2264 else {
2265 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2266
2267 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2268 /* Explicitly warn the user when /dev is already populated. */
2269 if (errno == EEXIST)
2270 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2271 if (errno != EPERM)
2272 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2273
2274 /* Some systems abusively restrict mknod but allow bind mounts. */
2275 r = touch(to);
2276 if (r < 0)
2277 return log_error_errno(r, "touch (%s) failed: %m", to);
2278 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2279 if (r < 0)
2280 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2281 }
2282
2283 r = userns_lchown(to, 0, 0);
2284 if (r < 0)
2285 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2286
2287 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2288 if (!dn)
2289 return log_oom();
2290
2291 r = userns_mkdir(dest, dn, 0755, 0, 0);
2292 if (r < 0)
2293 return log_error_errno(r, "Failed to create '%s': %m", dn);
2294
2295 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2296 return log_oom();
2297
2298 prefixed = path_join(dest, sl);
2299 if (!prefixed)
2300 return log_oom();
2301
2302 t = path_join("..", d);
2303 if (!t)
2304 return log_oom();
2305
2306 if (symlink(t, prefixed) < 0)
2307 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2308 }
2309 }
2310
2311 return r;
2312 }
2313
2314 static int make_extra_nodes(const char *dest) {
2315 size_t i;
2316 int r;
2317
2318 BLOCK_WITH_UMASK(0000);
2319
2320 for (i = 0; i < arg_n_extra_nodes; i++) {
2321 _cleanup_free_ char *path = NULL;
2322 DeviceNode *n = arg_extra_nodes + i;
2323
2324 path = path_join(dest, n->path);
2325 if (!path)
2326 return log_oom();
2327
2328 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2329 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2330
2331 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2332 if (r < 0)
2333 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2334 }
2335
2336 return 0;
2337 }
2338
2339 static int setup_pts(const char *dest) {
2340 _cleanup_free_ char *options = NULL;
2341 const char *p;
2342 int r;
2343
2344 #if HAVE_SELINUX
2345 if (arg_selinux_apifs_context)
2346 (void) asprintf(&options,
2347 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2348 arg_uid_shift + TTY_GID,
2349 arg_selinux_apifs_context);
2350 else
2351 #endif
2352 (void) asprintf(&options,
2353 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2354 arg_uid_shift + TTY_GID);
2355
2356 if (!options)
2357 return log_oom();
2358
2359 /* Mount /dev/pts itself */
2360 p = prefix_roota(dest, "/dev/pts");
2361 r = RET_NERRNO(mkdir(p, 0755));
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to create /dev/pts: %m");
2364
2365 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2366 if (r < 0)
2367 return r;
2368 r = userns_lchown(p, 0, 0);
2369 if (r < 0)
2370 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2371
2372 /* Create /dev/ptmx symlink */
2373 p = prefix_roota(dest, "/dev/ptmx");
2374 if (symlink("pts/ptmx", p) < 0)
2375 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2376 r = userns_lchown(p, 0, 0);
2377 if (r < 0)
2378 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2379
2380 /* And fix /dev/pts/ptmx ownership */
2381 p = prefix_roota(dest, "/dev/pts/ptmx");
2382 r = userns_lchown(p, 0, 0);
2383 if (r < 0)
2384 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2385
2386 return 0;
2387 }
2388
2389 static int setup_stdio_as_dev_console(void) {
2390 _cleanup_close_ int terminal = -EBADF;
2391 int r;
2392
2393 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2394 * explicitly, if we are configured to. */
2395 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2396 if (terminal < 0)
2397 return log_error_errno(terminal, "Failed to open console: %m");
2398
2399 /* Make sure we can continue logging to the original stderr, even if
2400 * stderr points elsewhere now */
2401 r = log_dup_console();
2402 if (r < 0)
2403 return log_error_errno(r, "Failed to duplicate stderr: %m");
2404
2405 /* invalidates 'terminal' on success and failure */
2406 r = rearrange_stdio(terminal, terminal, terminal);
2407 TAKE_FD(terminal);
2408 if (r < 0)
2409 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2410
2411 return 0;
2412 }
2413
2414 static int setup_dev_console(const char *console) {
2415 _cleanup_free_ char *p = NULL;
2416 int r;
2417
2418 /* Create /dev/console symlink */
2419 r = path_make_relative("/dev", console, &p);
2420 if (r < 0)
2421 return log_error_errno(r, "Failed to create relative path: %m");
2422
2423 if (symlink(p, "/dev/console") < 0)
2424 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2425
2426 return 0;
2427 }
2428
2429 static int setup_keyring(void) {
2430 key_serial_t keyring;
2431
2432 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2433 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2434 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2435 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2436 * into the container. */
2437
2438 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2439 if (keyring == -1) {
2440 if (errno == ENOSYS)
2441 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2442 else if (ERRNO_IS_PRIVILEGE(errno))
2443 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2444 else
2445 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2446 }
2447
2448 return 0;
2449 }
2450
2451 static int setup_credentials(const char *root) {
2452 const char *q;
2453 int r;
2454
2455 if (arg_n_credentials <= 0)
2456 return 0;
2457
2458 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to create /run/host: %m");
2461
2462 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2465
2466 q = prefix_roota(root, "/run/host/credentials");
2467 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2468 if (r < 0)
2469 return r;
2470
2471 for (size_t i = 0; i < arg_n_credentials; i++) {
2472 _cleanup_free_ char *j = NULL;
2473 _cleanup_close_ int fd = -EBADF;
2474
2475 j = path_join(q, arg_credentials[i].id);
2476 if (!j)
2477 return log_oom();
2478
2479 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2480 if (fd < 0)
2481 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2482
2483 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2484 if (r < 0)
2485 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2486
2487 if (fchmod(fd, 0400) < 0)
2488 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2489
2490 if (arg_userns_mode != USER_NAMESPACE_NO) {
2491 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2492 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2493 }
2494 }
2495
2496 if (chmod(q, 0500) < 0)
2497 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2498
2499 r = userns_lchown(q, 0, 0);
2500 if (r < 0)
2501 return r;
2502
2503 /* Make both mount and superblock read-only now */
2504 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2505 if (r < 0)
2506 return r;
2507
2508 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2509 }
2510
2511 static int setup_kmsg(int fd_inner_socket) {
2512 _cleanup_(unlink_and_freep) char *from = NULL;
2513 _cleanup_free_ char *fifo = NULL;
2514 _cleanup_close_ int fd = -EBADF;
2515 int r;
2516
2517 assert(fd_inner_socket >= 0);
2518
2519 BLOCK_WITH_UMASK(0000);
2520
2521 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2522 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2523 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2524 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2525
2526 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2527 if (r < 0)
2528 return log_error_errno(r, "Failed to generate kmsg path: %m");
2529
2530 if (mkfifo(fifo, 0600) < 0)
2531 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2532
2533 from = TAKE_PTR(fifo);
2534
2535 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2536 if (r < 0)
2537 return r;
2538
2539 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2540 if (fd < 0)
2541 return log_error_errno(errno, "Failed to open fifo: %m");
2542
2543 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2544 r = send_one_fd(fd_inner_socket, fd, 0);
2545 if (r < 0)
2546 return log_error_errno(r, "Failed to send FIFO fd: %m");
2547
2548 return 0;
2549 }
2550
2551 struct ExposeArgs {
2552 union in_addr_union address4;
2553 union in_addr_union address6;
2554 struct FirewallContext *fw_ctx;
2555 };
2556
2557 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2558 struct ExposeArgs *args = ASSERT_PTR(userdata);
2559
2560 assert(rtnl);
2561 assert(m);
2562
2563 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2564 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2565 return 0;
2566 }
2567
2568 static int setup_hostname(void) {
2569 int r;
2570
2571 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2572 return 0;
2573
2574 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2575 if (r < 0)
2576 return log_error_errno(r, "Failed to set hostname: %m");
2577
2578 return 0;
2579 }
2580
2581 static int setup_journal(const char *directory) {
2582 _cleanup_free_ char *d = NULL;
2583 const char *p, *q;
2584 sd_id128_t this_id;
2585 bool try;
2586 int r;
2587
2588 /* Don't link journals in ephemeral mode */
2589 if (arg_ephemeral)
2590 return 0;
2591
2592 if (arg_link_journal == LINK_NO)
2593 return 0;
2594
2595 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2596
2597 r = sd_id128_get_machine(&this_id);
2598 if (r < 0)
2599 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2600
2601 if (sd_id128_equal(arg_uuid, this_id)) {
2602 log_full(try ? LOG_WARNING : LOG_ERR,
2603 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2604 if (try)
2605 return 0;
2606 return -EEXIST;
2607 }
2608
2609 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2610 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2611 if (r < 0) {
2612 bool ignore = r == -EROFS && try;
2613 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2614 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2615 return ignore ? 0 : r;
2616 }
2617 }
2618
2619 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2620 q = prefix_roota(directory, p);
2621
2622 if (path_is_mount_point(p, NULL, 0) > 0) {
2623 if (try)
2624 return 0;
2625
2626 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2627 "%s: already a mount point, refusing to use for journal", p);
2628 }
2629
2630 if (path_is_mount_point(q, NULL, 0) > 0) {
2631 if (try)
2632 return 0;
2633
2634 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2635 "%s: already a mount point, refusing to use for journal", q);
2636 }
2637
2638 r = readlink_and_make_absolute(p, &d);
2639 if (r >= 0) {
2640 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2641 path_equal(d, q)) {
2642
2643 r = userns_mkdir(directory, p, 0755, 0, 0);
2644 if (r < 0)
2645 log_warning_errno(r, "Failed to create directory %s: %m", q);
2646 return 0;
2647 }
2648
2649 if (unlink(p) < 0)
2650 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2651 } else if (r == -EINVAL) {
2652
2653 if (arg_link_journal == LINK_GUEST &&
2654 rmdir(p) < 0) {
2655
2656 if (errno == ENOTDIR) {
2657 log_error("%s already exists and is neither a symlink nor a directory", p);
2658 return r;
2659 } else
2660 return log_error_errno(errno, "Failed to remove %s: %m", p);
2661 }
2662 } else if (r != -ENOENT)
2663 return log_error_errno(r, "readlink(%s) failed: %m", p);
2664
2665 if (arg_link_journal == LINK_GUEST) {
2666
2667 if (symlink(q, p) < 0) {
2668 if (try) {
2669 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2670 return 0;
2671 } else
2672 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2673 }
2674
2675 r = userns_mkdir(directory, p, 0755, 0, 0);
2676 if (r < 0)
2677 log_warning_errno(r, "Failed to create directory %s: %m", q);
2678 return 0;
2679 }
2680
2681 if (arg_link_journal == LINK_HOST) {
2682 /* don't create parents here — if the host doesn't have
2683 * permanent journal set up, don't force it here */
2684
2685 r = RET_NERRNO(mkdir(p, 0755));
2686 if (r < 0 && r != -EEXIST) {
2687 if (try) {
2688 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2689 return 0;
2690 } else
2691 return log_error_errno(r, "Failed to create %s: %m", p);
2692 }
2693
2694 } else if (access(p, F_OK) < 0)
2695 return 0;
2696
2697 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2698 log_warning("%s is not empty, proceeding anyway.", q);
2699
2700 r = userns_mkdir(directory, p, 0755, 0, 0);
2701 if (r < 0)
2702 return log_error_errno(r, "Failed to create %s: %m", q);
2703
2704 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2705 if (r < 0)
2706 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2707
2708 return 0;
2709 }
2710
2711 static int drop_capabilities(uid_t uid) {
2712 CapabilityQuintet q;
2713
2714 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2715 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2716 * arg_caps_retain. */
2717
2718 if (capability_quintet_is_set(&arg_full_capabilities)) {
2719 q = arg_full_capabilities;
2720
2721 if (q.bounding == UINT64_MAX)
2722 q.bounding = uid == 0 ? arg_caps_retain : 0;
2723
2724 if (q.effective == UINT64_MAX)
2725 q.effective = uid == 0 ? q.bounding : 0;
2726
2727 if (q.inheritable == UINT64_MAX)
2728 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2729
2730 if (q.permitted == UINT64_MAX)
2731 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2732
2733 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2734 q.ambient = arg_caps_ambient;
2735
2736 if (capability_quintet_mangle(&q))
2737 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2738
2739 } else {
2740 q = (CapabilityQuintet) {
2741 .bounding = arg_caps_retain,
2742 .effective = uid == 0 ? arg_caps_retain : 0,
2743 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2744 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2745 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2746 };
2747
2748 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2749 * in order to maintain the same behavior as systemd < 242. */
2750 if (capability_quintet_mangle(&q))
2751 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2752 "Some capabilities will not be set because they are not in the current bounding set.");
2753
2754 }
2755
2756 return capability_quintet_enforce(&q);
2757 }
2758
2759 static int reset_audit_loginuid(void) {
2760 _cleanup_free_ char *p = NULL;
2761 int r;
2762
2763 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2764 return 0;
2765
2766 r = read_one_line_file("/proc/self/loginuid", &p);
2767 if (r == -ENOENT)
2768 return 0;
2769 if (r < 0)
2770 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2771
2772 /* Already reset? */
2773 if (streq(p, "4294967295"))
2774 return 0;
2775
2776 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2777 if (r < 0) {
2778 log_error_errno(r,
2779 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2780 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2781 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2782 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2783 "using systemd-nspawn. Sleeping for 5s... (%m)");
2784
2785 sleep(5);
2786 }
2787
2788 return 0;
2789 }
2790
2791 static int mount_tunnel_dig(const char *root) {
2792 const char *p, *q;
2793 int r;
2794
2795 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2796 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2797 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2798 (void) mkdir_p(p, 0600);
2799
2800 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2801 if (r < 0)
2802 return log_error_errno(r, "Failed to create /run/host: %m");
2803
2804 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
2805 if (r < 0)
2806 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
2807
2808 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
2809 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2810 if (r < 0)
2811 return r;
2812
2813 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2814 if (r < 0)
2815 return r;
2816
2817 return 0;
2818 }
2819
2820 static int mount_tunnel_open(void) {
2821 int r;
2822
2823 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2824 if (r < 0)
2825 return r;
2826
2827 return 0;
2828 }
2829
2830 static int setup_machine_id(const char *directory) {
2831 int r;
2832
2833 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2834 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2835 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2836 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2837 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2838 * container behaves nicely). */
2839
2840 r = id128_get_machine(directory, &arg_uuid);
2841 if (r < 0) {
2842 if (!ERRNO_IS_MACHINE_ID_UNSET(r)) /* If the file is missing, empty, or uninitialized, we don't mind */
2843 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2844
2845 if (sd_id128_is_null(arg_uuid)) {
2846 r = sd_id128_randomize(&arg_uuid);
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2849 }
2850 }
2851
2852 return 0;
2853 }
2854
2855 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2856 int r;
2857
2858 assert(directory);
2859
2860 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2861 return 0;
2862
2863 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2864 if (r == -EOPNOTSUPP)
2865 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2866 if (r == -EBADE)
2867 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2868 if (r < 0)
2869 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2870 if (r == 0)
2871 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2872 else
2873 log_debug("Patched directory tree to match UID/GID range.");
2874
2875 return r;
2876 }
2877
2878 /*
2879 * Return values:
2880 * < 0 : wait_for_terminate() failed to get the state of the
2881 * container, the container was terminated by a signal, or
2882 * failed for an unknown reason. No change is made to the
2883 * container argument.
2884 * > 0 : The program executed in the container terminated with an
2885 * error. The exit code of the program executed in the
2886 * container is returned. The container argument has been set
2887 * to CONTAINER_TERMINATED.
2888 * 0 : The container is being rebooted, has been shut down or exited
2889 * successfully. The container argument has been set to either
2890 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2891 *
2892 * That is, success is indicated by a return value of zero, and an
2893 * error is indicated by a non-zero value.
2894 */
2895 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2896 siginfo_t status;
2897 int r;
2898
2899 r = wait_for_terminate(pid, &status);
2900 if (r < 0)
2901 return log_warning_errno(r, "Failed to wait for container: %m");
2902
2903 switch (status.si_code) {
2904
2905 case CLD_EXITED:
2906 if (status.si_status == 0)
2907 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2908 else
2909 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2910
2911 *container = CONTAINER_TERMINATED;
2912 return status.si_status;
2913
2914 case CLD_KILLED:
2915 if (status.si_status == SIGINT) {
2916 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2917 *container = CONTAINER_TERMINATED;
2918 return 0;
2919
2920 } else if (status.si_status == SIGHUP) {
2921 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2922 *container = CONTAINER_REBOOTED;
2923 return 0;
2924 }
2925
2926 _fallthrough_;
2927 case CLD_DUMPED:
2928 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2929 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2930
2931 default:
2932 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2933 "Container %s failed due to unknown reason.", arg_machine);
2934 }
2935 }
2936
2937 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2938 pid_t pid;
2939
2940 pid = PTR_TO_PID(userdata);
2941 if (pid > 0) {
2942 if (kill(pid, arg_kill_signal) >= 0) {
2943 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2944 sd_event_source_set_userdata(s, NULL);
2945 return 0;
2946 }
2947 }
2948
2949 sd_event_exit(sd_event_source_get_event(s), 0);
2950 return 0;
2951 }
2952
2953 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2954 pid_t pid;
2955
2956 assert(s);
2957 assert(ssi);
2958
2959 pid = PTR_TO_PID(userdata);
2960
2961 for (;;) {
2962 siginfo_t si = {};
2963
2964 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2965 return log_error_errno(errno, "Failed to waitid(): %m");
2966 if (si.si_pid == 0) /* No pending children. */
2967 break;
2968 if (si.si_pid == pid) {
2969 /* The main process we care for has exited. Return from
2970 * signal handler but leave the zombie. */
2971 sd_event_exit(sd_event_source_get_event(s), 0);
2972 break;
2973 }
2974
2975 /* Reap all other children. */
2976 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2977 }
2978
2979 return 0;
2980 }
2981
2982 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2983 pid_t pid;
2984
2985 assert(m);
2986
2987 pid = PTR_TO_PID(userdata);
2988
2989 if (arg_kill_signal > 0) {
2990 log_info("Container termination requested. Attempting to halt container.");
2991 (void) kill(pid, arg_kill_signal);
2992 } else {
2993 log_info("Container termination requested. Exiting.");
2994 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2995 }
2996
2997 return 0;
2998 }
2999
3000 static int determine_names(void) {
3001 int r;
3002
3003 if (arg_template && !arg_directory && arg_machine) {
3004
3005 /* If --template= was specified then we should not
3006 * search for a machine, but instead create a new one
3007 * in /var/lib/machine. */
3008
3009 arg_directory = path_join("/var/lib/machines", arg_machine);
3010 if (!arg_directory)
3011 return log_oom();
3012 }
3013
3014 if (!arg_image && !arg_directory) {
3015 if (arg_machine) {
3016 _cleanup_(image_unrefp) Image *i = NULL;
3017
3018 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3019 if (r == -ENOENT)
3020 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3021 if (r < 0)
3022 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3023
3024 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3025 r = free_and_strdup(&arg_image, i->path);
3026 else
3027 r = free_and_strdup(&arg_directory, i->path);
3028 if (r < 0)
3029 return log_oom();
3030
3031 if (!arg_ephemeral)
3032 arg_read_only = arg_read_only || i->read_only;
3033 } else {
3034 r = safe_getcwd(&arg_directory);
3035 if (r < 0)
3036 return log_error_errno(r, "Failed to determine current directory: %m");
3037 }
3038
3039 if (!arg_directory && !arg_image)
3040 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3041 }
3042
3043 if (!arg_machine) {
3044 if (arg_directory && path_equal(arg_directory, "/"))
3045 arg_machine = gethostname_malloc();
3046 else if (arg_image) {
3047 char *e;
3048
3049 r = path_extract_filename(arg_image, &arg_machine);
3050 if (r < 0)
3051 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
3052
3053 /* Truncate suffix if there is one */
3054 e = endswith(arg_machine, ".raw");
3055 if (e)
3056 *e = 0;
3057 } else {
3058 r = path_extract_filename(arg_directory, &arg_machine);
3059 if (r < 0)
3060 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3061 }
3062
3063 hostname_cleanup(arg_machine);
3064 if (!hostname_is_valid(arg_machine, 0))
3065 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3066
3067 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3068 * to match fixed config file names. */
3069 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3070 if (!arg_settings_filename)
3071 return log_oom();
3072
3073 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3074 * instances at once without manually having to specify -M each time. */
3075 if (arg_ephemeral)
3076 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3077 return log_oom();
3078 } else {
3079 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3080 if (!arg_settings_filename)
3081 return log_oom();
3082 }
3083
3084 return 0;
3085 }
3086
/* Resolves the path stored in *p via chase() with the given flags and replaces *p with the result.
 * Does nothing if *p is NULL.
 *
 * Returns the result of free_and_replace() on success, 0 if there was nothing to do, or a negative error
 * (already logged) if the path could not be resolved. */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (!*p)
                return 0;

        r = chase(*p, NULL, flags, &resolved, NULL);
        if (r >= 0)
                return free_and_replace(*p, resolved);

        return log_error_errno(r, "Failed to resolve path %s: %m", *p);
}
3102
3103 static int determine_uid_shift(const char *directory) {
3104
3105 if (arg_userns_mode == USER_NAMESPACE_NO) {
3106 arg_uid_shift = 0;
3107 return 0;
3108 }
3109
3110 if (arg_uid_shift == UID_INVALID) {
3111 struct stat st;
3112
3113 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3114
3115 if (stat(directory, &st) < 0)
3116 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3117
3118 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3119
3120 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3121 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3122 "UID and GID base of %s don't match.", directory);
3123
3124 arg_uid_range = UINT32_C(0x10000);
3125
3126 if (arg_uid_shift != 0) {
3127 /* If the image is shifted already, then we'll fall back to classic chowning, for
3128 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3129
3130 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3131 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3132 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3133 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3134 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3135 "UID base of %s is not zero, UID mapping not supported.", directory);
3136 }
3137 }
3138
3139 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3141
3142 return 0;
3143 }
3144
3145 static unsigned long effective_clone_ns_flags(void) {
3146 unsigned long flags = arg_clone_ns_flags;
3147
3148 if (arg_private_network)
3149 flags |= CLONE_NEWNET;
3150 if (arg_use_cgns)
3151 flags |= CLONE_NEWCGROUP;
3152 if (arg_userns_mode != USER_NAMESPACE_NO)
3153 flags |= CLONE_NEWUSER;
3154
3155 return flags;
3156 }
3157
3158 static int patch_sysctl(void) {
3159
3160 /* This table is inspired by runc's sysctl() function */
3161 static const struct {
3162 const char *key;
3163 bool prefix;
3164 unsigned long clone_flags;
3165 } safe_sysctl[] = {
3166 { "kernel.hostname", false, CLONE_NEWUTS },
3167 { "kernel.domainname", false, CLONE_NEWUTS },
3168 { "kernel.msgmax", false, CLONE_NEWIPC },
3169 { "kernel.msgmnb", false, CLONE_NEWIPC },
3170 { "kernel.msgmni", false, CLONE_NEWIPC },
3171 { "kernel.sem", false, CLONE_NEWIPC },
3172 { "kernel.shmall", false, CLONE_NEWIPC },
3173 { "kernel.shmmax", false, CLONE_NEWIPC },
3174 { "kernel.shmmni", false, CLONE_NEWIPC },
3175 { "fs.mqueue.", true, CLONE_NEWIPC },
3176 { "net.", true, CLONE_NEWNET },
3177 };
3178
3179 unsigned long flags;
3180 int r;
3181
3182 flags = effective_clone_ns_flags();
3183
3184 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3185 bool good = false;
3186 size_t i;
3187
3188 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3189
3190 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3191 continue;
3192
3193 if (safe_sysctl[i].prefix)
3194 good = startswith(*k, safe_sysctl[i].key);
3195 else
3196 good = streq(*k, safe_sysctl[i].key);
3197
3198 if (good)
3199 break;
3200 }
3201
3202 if (!good)
3203 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3204
3205 r = sysctl_write(*k, *v);
3206 if (r < 0)
3207 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3208 }
3209
3210 return 0;
3211 }
3212
3213 static int inner_child(
3214 Barrier *barrier,
3215 int fd_inner_socket,
3216 FDSet *fds,
3217 char **os_release_pairs) {
3218
3219 _cleanup_free_ char *home = NULL;
3220 size_t n_env = 1;
3221 char *envp[] = {
3222 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3223 NULL, /* container */
3224 NULL, /* TERM */
3225 NULL, /* HOME */
3226 NULL, /* USER */
3227 NULL, /* LOGNAME */
3228 NULL, /* container_uuid */
3229 NULL, /* LISTEN_FDS */
3230 NULL, /* LISTEN_PID */
3231 NULL, /* NOTIFY_SOCKET */
3232 NULL, /* CREDENTIALS_DIRECTORY */
3233 NULL, /* LANG */
3234 NULL
3235 };
3236 const char *exec_target;
3237 _cleanup_strv_free_ char **env_use = NULL;
3238 int r, which_failed;
3239
3240 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3241 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3242 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3243 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3244 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3245 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3246 * namespace.
3247 *
3248 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3249 * unshare(). See below. */
3250
3251 assert(barrier);
3252 assert(fd_inner_socket >= 0);
3253
3254 log_debug("Inner child is initializing.");
3255
3256 if (arg_userns_mode != USER_NAMESPACE_NO) {
3257 /* Tell the parent, that it now can write the UID map. */
3258 (void) barrier_place(barrier); /* #1 */
3259
3260 /* Wait until the parent wrote the UID map */
3261 if (!barrier_place_and_sync(barrier)) /* #2 */
3262 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3263
3264 /* Become the new root user inside our namespace */
3265 r = reset_uid_gid();
3266 if (r < 0)
3267 return log_error_errno(r, "Couldn't become new root: %m");
3268
3269 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3270 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3271 * propagation, but simply create new peer groups for all our mounts). */
3272 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3273 if (r < 0)
3274 return r;
3275 }
3276
3277 r = mount_all(NULL,
3278 arg_mount_settings | MOUNT_IN_USERNS,
3279 arg_uid_shift,
3280 arg_selinux_apifs_context);
3281 if (r < 0)
3282 return r;
3283
3284 if (!arg_network_namespace_path && arg_private_network) {
3285 r = unshare(CLONE_NEWNET);
3286 if (r < 0)
3287 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3288
3289 /* Tell the parent that it can setup network interfaces. */
3290 (void) barrier_place(barrier); /* #3 */
3291 }
3292
3293 r = mount_sysfs(NULL, arg_mount_settings);
3294 if (r < 0)
3295 return r;
3296
3297 /* Wait until we are cgroup-ified, so that we
3298 * can mount the right cgroup path writable */
3299 if (!barrier_place_and_sync(barrier)) /* #4 */
3300 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3301 "Parent died too early");
3302
3303 if (arg_use_cgns) {
3304 r = unshare(CLONE_NEWCGROUP);
3305 if (r < 0)
3306 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3307 r = mount_cgroups(
3308 "",
3309 arg_unified_cgroup_hierarchy,
3310 arg_userns_mode != USER_NAMESPACE_NO,
3311 arg_uid_shift,
3312 arg_uid_range,
3313 arg_selinux_apifs_context,
3314 true);
3315 } else
3316 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3317 if (r < 0)
3318 return r;
3319
3320 r = setup_boot_id();
3321 if (r < 0)
3322 return r;
3323
3324 r = setup_kmsg(fd_inner_socket);
3325 if (r < 0)
3326 return r;
3327
3328 r = mount_custom(
3329 "/",
3330 arg_custom_mounts,
3331 arg_n_custom_mounts,
3332 0,
3333 0,
3334 arg_selinux_apifs_context,
3335 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3336 if (r < 0)
3337 return r;
3338
3339 if (setsid() < 0)
3340 return log_error_errno(errno, "setsid() failed: %m");
3341
3342 if (arg_private_network)
3343 (void) loopback_setup();
3344
3345 if (arg_expose_ports) {
3346 r = expose_port_send_rtnl(fd_inner_socket);
3347 if (r < 0)
3348 return r;
3349 }
3350
3351 if (arg_console_mode != CONSOLE_PIPE) {
3352 _cleanup_close_ int master = -EBADF;
3353 _cleanup_free_ char *console = NULL;
3354
3355 /* Allocate a pty and make it available as /dev/console. */
3356 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3357 if (master < 0)
3358 return log_error_errno(master, "Failed to allocate a pty: %m");
3359
3360 r = setup_dev_console(console);
3361 if (r < 0)
3362 return log_error_errno(r, "Failed to set up /dev/console: %m");
3363
3364 r = send_one_fd(fd_inner_socket, master, 0);
3365 if (r < 0)
3366 return log_error_errno(r, "Failed to send master fd: %m");
3367
3368 r = setup_stdio_as_dev_console();
3369 if (r < 0)
3370 return r;
3371 }
3372
3373 r = patch_sysctl();
3374 if (r < 0)
3375 return r;
3376
3377 if (arg_oom_score_adjust_set) {
3378 r = set_oom_score_adjust(arg_oom_score_adjust);
3379 if (r < 0)
3380 return log_error_errno(r, "Failed to adjust OOM score: %m");
3381 }
3382
3383 if (arg_cpu_set.set)
3384 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3385 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3386
3387 (void) setup_hostname();
3388
3389 if (arg_personality != PERSONALITY_INVALID) {
3390 r = safe_personality(arg_personality);
3391 if (r < 0)
3392 return log_error_errno(r, "personality() failed: %m");
3393 #ifdef ARCHITECTURE_SECONDARY
3394 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
3395 r = safe_personality(PER_LINUX32);
3396 if (r < 0)
3397 return log_error_errno(r, "personality() failed: %m");
3398 #endif
3399 } else if (arg_architecture >= 0 && arg_architecture != native_architecture())
3400 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3401 "Selected architecture '%s' not supported locally, refusing.",
3402 architecture_to_string(arg_architecture));
3403
3404 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3405 if (r < 0)
3406 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3407
3408 #if HAVE_SECCOMP
3409 if (arg_seccomp) {
3410
3411 if (is_seccomp_available()) {
3412
3413 r = seccomp_load(arg_seccomp);
3414 if (ERRNO_IS_SECCOMP_FATAL(r))
3415 return log_error_errno(r, "Failed to install seccomp filter: %m");
3416 if (r < 0)
3417 log_debug_errno(r, "Failed to install seccomp filter: %m");
3418 }
3419 } else
3420 #endif
3421 {
3422 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3423 if (r < 0)
3424 return r;
3425 }
3426
3427 if (arg_suppress_sync) {
3428 #if HAVE_SECCOMP
3429 r = seccomp_suppress_sync();
3430 if (r < 0)
3431 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3432 #else
3433 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3434 #endif
3435 }
3436
3437 #if HAVE_SELINUX
3438 if (arg_selinux_context)
3439 if (setexeccon(arg_selinux_context) < 0)
3440 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3441 #endif
3442
3443 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3444 * if we need to later on. */
3445 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3446 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3447
3448 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3449 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3450 else
3451 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3452 if (r < 0)
3453 return r;
3454
3455 r = drop_capabilities(getuid());
3456 if (r < 0)
3457 return log_error_errno(r, "Dropping capabilities failed: %m");
3458
3459 if (arg_no_new_privileges)
3460 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3461 return log_error_errno(errno, "Failed to disable new privileges: %m");
3462
3463 /* LXC sets container=lxc, so follow the scheme here */
3464 envp[n_env++] = strjoina("container=", arg_container_service_name);
3465
3466 envp[n_env] = strv_find_prefix(environ, "TERM=");
3467 if (envp[n_env])
3468 n_env++;
3469
3470 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3471 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3472 return log_oom();
3473
3474 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3475 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3476 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
3477 return log_oom();
3478
3479 assert(!sd_id128_is_null(arg_uuid));
3480
3481 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3482 return log_oom();
3483
3484 if (fdset_size(fds) > 0) {
3485 r = fdset_cloexec(fds, false);
3486 if (r < 0)
3487 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3488
3489 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3490 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3491 return log_oom();
3492 }
3493 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3494 return log_oom();
3495
3496 if (arg_n_credentials > 0) {
3497 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3498 if (!envp[n_env])
3499 return log_oom();
3500 n_env++;
3501 }
3502
3503 if (arg_start_mode != START_BOOT) {
3504 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
3505 if (!envp[n_env])
3506 return log_oom();
3507 n_env++;
3508 }
3509
3510 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3511 if (!env_use)
3512 return log_oom();
3513
3514 /* Let the parent know that we are ready and
3515 * wait until the parent is ready with the
3516 * setup, too... */
3517 if (!barrier_place_and_sync(barrier)) /* #5 */
3518 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3519
3520 if (arg_chdir)
3521 if (chdir(arg_chdir) < 0)
3522 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3523
3524 if (arg_start_mode == START_PID2) {
3525 r = stub_pid1(arg_uuid);
3526 if (r < 0)
3527 return r;
3528 }
3529
3530 if (arg_console_mode != CONSOLE_PIPE) {
3531 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3532 * are configured for that. Acquire it as controlling tty. */
3533 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3534 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3535 }
3536
3537 log_debug("Inner child completed, invoking payload.");
3538
3539 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3540 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3541 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3542 log_close();
3543 log_set_open_when_needed(true);
3544 log_settle_target();
3545
3546 (void) fdset_close_others(fds);
3547
3548 if (arg_start_mode == START_BOOT) {
3549 char **a;
3550 size_t m;
3551
3552 /* Automatically search for the init system */
3553
3554 m = strv_length(arg_parameters);
3555 a = newa(char*, m + 2);
3556 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3557 a[1 + m] = NULL;
3558
3559 FOREACH_STRING(init,
3560 "/usr/lib/systemd/systemd",
3561 "/lib/systemd/systemd",
3562 "/sbin/init") {
3563 a[0] = (char*) init;
3564 execve(a[0], a, env_use);
3565 }
3566
3567 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3568 } else if (!strv_isempty(arg_parameters)) {
3569 const char *dollar_path;
3570
3571 exec_target = arg_parameters[0];
3572
3573 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3574 * binary. */
3575 dollar_path = strv_env_get(env_use, "PATH");
3576 if (dollar_path) {
3577 if (setenv("PATH", dollar_path, 1) < 0)
3578 return log_error_errno(errno, "Failed to update $PATH: %m");
3579 }
3580
3581 execvpe(arg_parameters[0], arg_parameters, env_use);
3582 } else {
3583 if (!arg_chdir)
3584 /* If we cannot change the directory, we'll end up in /, that is expected. */
3585 (void) chdir(home ?: "/root");
3586
3587 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3588 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3589 execle("/bin/bash", "-bash", NULL, env_use);
3590 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3591 execle("/bin/sh", "-sh", NULL, env_use);
3592
3593 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3594 }
3595
3596 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3597 }
3598
3599 static int setup_notify_child(void) {
3600 _cleanup_close_ int fd = -EBADF;
3601 static const union sockaddr_union sa = {
3602 .un.sun_family = AF_UNIX,
3603 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3604 };
3605 int r;
3606
3607 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3608 if (fd < 0)
3609 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3610
3611 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3612 (void) sockaddr_un_unlink(&sa.un);
3613
3614 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3615 if (r < 0)
3616 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3617
3618 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3619 if (r < 0)
3620 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3621
3622 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3623 if (r < 0)
3624 return log_error_errno(r, "SO_PASSCRED failed: %m");
3625
3626 return TAKE_FD(fd);
3627 }
3628
3629 static int outer_child(
3630 Barrier *barrier,
3631 const char *directory,
3632 DissectedImage *dissected_image,
3633 int fd_outer_socket,
3634 int fd_inner_socket,
3635 FDSet *fds,
3636 int netns_fd) {
3637
3638 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3639 _cleanup_strv_free_ char **os_release_pairs = NULL;
3640 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
3641 bool idmap = false;
3642 const char *p;
3643 pid_t pid;
3644 ssize_t l;
3645 int r;
3646
3647 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3648 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3649 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3650 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3651 * forked off it, and it exits. */
3652
3653 assert(barrier);
3654 assert(directory);
3655 assert(fd_outer_socket >= 0);
3656 assert(fd_inner_socket >= 0);
3657
3658 log_debug("Outer child is initializing.");
3659
3660 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3661 if (r < 0)
3662 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3663
3664 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3665 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3666
3667 r = reset_audit_loginuid();
3668 if (r < 0)
3669 return r;
3670
3671 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3672 * mounts to the real root. */
3673 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3674 if (r < 0)
3675 return r;
3676
3677 if (dissected_image) {
3678 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3679 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3680 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3681 * right place right away. This makes sure ESP partitions and userns are compatible. */
3682
3683 r = dissected_image_mount_and_warn(
3684 dissected_image,
3685 directory,
3686 arg_uid_shift,
3687 arg_uid_range,
3688 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3689 DISSECT_IMAGE_DISCARD_ON_LOOP|
3690 DISSECT_IMAGE_USR_NO_ROOT|
3691 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3692 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3693 if (r < 0)
3694 return r;
3695 }
3696
3697 r = determine_uid_shift(directory);
3698 if (r < 0)
3699 return r;
3700
3701 if (arg_userns_mode != USER_NAMESPACE_NO) {
3702 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3703 if (r < 0)
3704 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3705
3706 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
3707 if (l < 0)
3708 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3709 mntns_fd = safe_close(mntns_fd);
3710
3711 /* Let the parent know which UID shift we read from the image */
3712 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3713 if (l < 0)
3714 return log_error_errno(errno, "Failed to send UID shift: %m");
3715 if (l != sizeof(arg_uid_shift))
3716 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3717 "Short write while sending UID shift.");
3718
3719 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3720 /* When we are supposed to pick the UID shift, the parent will check now whether the
3721 * UID shift we just read from the image is available. If yes, it will send the UID
3722 * shift back to us, if not it will pick a different one, and send it back to us. */
3723
3724 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3725 if (l < 0)
3726 return log_error_errno(errno, "Failed to recv UID shift: %m");
3727 if (l != sizeof(arg_uid_shift))
3728 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3729 "Short read while receiving UID shift.");
3730 }
3731
3732 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3733 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3734 }
3735
3736 if (path_equal(directory, "/")) {
3737 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3738 * place, so that we can make changes to its mount structure (for example, to implement
3739 * --volatile=) without this interfering with our ability to access files such as
3740 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3741 * (instead of a temporary directory, since we are living in our own mount namespace here
3742 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3743 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3744
3745 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3746 if (r < 0)
3747 return r;
3748
3749 directory = "/run/systemd/nspawn-root";
3750 }
3751
3752 /* Make sure we always have a mount that we can move to root later on. */
3753 r = make_mount_point(directory);
3754 if (r < 0)
3755 return r;
3756
3757 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3758 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3759 * we'll live in our own little world from now on, and propagation from the host may only happen via
3760 * the mount tunnel dir, or not at all. */
3761 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3762 if (r < 0)
3763 return r;
3764
3765 r = setup_pivot_root(
3766 directory,
3767 arg_pivot_root_new,
3768 arg_pivot_root_old);
3769 if (r < 0)
3770 return r;
3771
3772 r = setup_volatile_mode(
3773 directory,
3774 arg_volatile_mode,
3775 arg_uid_shift,
3776 arg_selinux_apifs_context);
3777 if (r < 0)
3778 return r;
3779
3780 r = bind_user_prepare(
3781 directory,
3782 arg_bind_user,
3783 arg_uid_shift,
3784 arg_uid_range,
3785 &arg_custom_mounts, &arg_n_custom_mounts,
3786 &bind_user_context);
3787 if (r < 0)
3788 return r;
3789
3790 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3791 /* Send the user maps we determined to the parent, so that it installs it in our user
3792 * namespace UID map table */
3793
3794 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3795 uid_t map[] = {
3796 bind_user_context->data[i].payload_user->uid,
3797 bind_user_context->data[i].host_user->uid,
3798 (uid_t) bind_user_context->data[i].payload_group->gid,
3799 (uid_t) bind_user_context->data[i].host_group->gid,
3800 };
3801
3802 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
3803 if (l < 0)
3804 return log_error_errno(errno, "Failed to send user UID map: %m");
3805 if (l != sizeof(map))
3806 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3807 "Short write while sending user UID map.");
3808 }
3809 }
3810
3811 r = mount_custom(
3812 directory,
3813 arg_custom_mounts,
3814 arg_n_custom_mounts,
3815 arg_uid_shift,
3816 arg_uid_range,
3817 arg_selinux_apifs_context,
3818 MOUNT_ROOT_ONLY);
3819 if (r < 0)
3820 return r;
3821
3822 if (arg_userns_mode != USER_NAMESPACE_NO &&
3823 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3824 arg_uid_shift != 0) {
3825
3826 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
3827 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3828 /* This might fail because the kernel or file system doesn't support idmapping. We
3829 * can't really distinguish this nicely, nor do we have any guarantees about the
3830 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3831 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3832 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3833 "ID mapped mounts are apparently not available, sorry.");
3834
3835 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3836 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3837 } else if (r < 0)
3838 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3839 else {
3840 log_debug("ID mapped mounts available, making use of them.");
3841 idmap = true;
3842 }
3843 }
3844
3845 if (dissected_image) {
3846 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3847 r = dissected_image_mount(
3848 dissected_image,
3849 directory,
3850 arg_uid_shift,
3851 arg_uid_range,
3852 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3853 DISSECT_IMAGE_DISCARD_ON_LOOP|
3854 DISSECT_IMAGE_USR_NO_ROOT|
3855 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3856 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3857 if (r == -EUCLEAN)
3858 return log_error_errno(r, "File system check for image failed: %m");
3859 if (r < 0)
3860 return log_error_errno(r, "Failed to mount image file system: %m");
3861 }
3862
3863 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3864 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3865
3866 r = detect_unified_cgroup_hierarchy_from_image(directory);
3867 if (r < 0)
3868 return r;
3869
3870 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3871 if (l < 0)
3872 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3873 if (l != sizeof(arg_unified_cgroup_hierarchy))
3874 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3875 "Short write while sending cgroup mode.");
3876 }
3877
3878 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3879 if (r < 0)
3880 return r;
3881
3882 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3883 if (r < 0)
3884 return r;
3885
3886 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3887 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3888 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3889 if (r < 0)
3890 return log_error_errno(r, "Failed to make tree read-only: %m");
3891 }
3892
3893 r = mount_all(directory,
3894 arg_mount_settings,
3895 arg_uid_shift,
3896 arg_selinux_apifs_context);
3897 if (r < 0)
3898 return r;
3899
3900 r = copy_devnodes(directory);
3901 if (r < 0)
3902 return r;
3903
3904 r = make_extra_nodes(directory);
3905 if (r < 0)
3906 return r;
3907
3908 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3909
3910 p = prefix_roota(directory, "/run/host");
3911 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3912
3913 r = setup_pts(directory);
3914 if (r < 0)
3915 return r;
3916
3917 r = mount_tunnel_dig(directory);
3918 if (r < 0)
3919 return r;
3920
3921 r = setup_keyring();
3922 if (r < 0)
3923 return r;
3924
3925 r = setup_credentials(directory);
3926 if (r < 0)
3927 return r;
3928
3929 r = bind_user_setup(bind_user_context, directory);
3930 if (r < 0)
3931 return r;
3932
3933 r = mount_custom(
3934 directory,
3935 arg_custom_mounts,
3936 arg_n_custom_mounts,
3937 arg_uid_shift,
3938 arg_uid_range,
3939 arg_selinux_apifs_context,
3940 MOUNT_NON_ROOT_ONLY);
3941 if (r < 0)
3942 return r;
3943
3944 r = setup_timezone(directory);
3945 if (r < 0)
3946 return r;
3947
3948 r = setup_resolv_conf(directory);
3949 if (r < 0)
3950 return r;
3951
3952 r = setup_machine_id(directory);
3953 if (r < 0)
3954 return r;
3955
3956 r = setup_journal(directory);
3957 if (r < 0)
3958 return r;
3959
3960 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3961 p = prefix_roota(directory, "/run/host/container-manager");
3962 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3963
3964 /* The same stuff as the $container_uuid env var */
3965 p = prefix_roota(directory, "/run/host/container-uuid");
3966 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3967
3968 if (!arg_use_cgns) {
3969 r = mount_cgroups(
3970 directory,
3971 arg_unified_cgroup_hierarchy,
3972 arg_userns_mode != USER_NAMESPACE_NO,
3973 arg_uid_shift,
3974 arg_uid_range,
3975 arg_selinux_apifs_context,
3976 false);
3977 if (r < 0)
3978 return r;
3979 }
3980
3981 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3982 * mounts available in systemd services inside the container that create a new mount namespace. See
3983 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3984 * will inherit the shared propagation mode.
3985 *
3986 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3987 * directory mount to root later on.
3988 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3989 */
3990 r = mount_switch_root(directory, MS_SHARED);
3991 if (r < 0)
3992 return log_error_errno(r, "Failed to move root directory: %m");
3993
3994 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
3995 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
3996 * the container. */
3997 r = mount_tunnel_open();
3998 if (r < 0)
3999 return r;
4000
4001 if (arg_userns_mode != USER_NAMESPACE_NO) {
4002 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4003 * requires that a fully visible instance is already present in the target mount
4004 * namespace. Mount one here so the inner child can mount its own instances. Later
4005 * we umount the temporary instances created here before we actually exec the
4006 * payload. Since the rootfs is shared the umount will propagate into the container.
4007 * Note, the inner child wouldn't be able to unmount the instances on its own since
4008 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4009 * this. */
4010 r = pin_fully_visible_fs();
4011 if (r < 0)
4012 return r;
4013 }
4014
4015 fd = setup_notify_child();
4016 if (fd < 0)
4017 return fd;
4018
4019 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4020 arg_clone_ns_flags |
4021 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
4022 if (pid < 0)
4023 return log_error_errno(errno, "Failed to fork inner child: %m");
4024 if (pid == 0) {
4025 fd_outer_socket = safe_close(fd_outer_socket);
4026
4027 /* The inner child has all namespaces that are requested, so that we all are owned by the
4028 * user if user namespaces are turned on. */
4029
4030 if (arg_network_namespace_path) {
4031 r = namespace_enter(-1, -1, netns_fd, -1, -1);
4032 if (r < 0)
4033 return log_error_errno(r, "Failed to join network namespace: %m");
4034 }
4035
4036 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
4037 if (r < 0)
4038 _exit(EXIT_FAILURE);
4039
4040 _exit(EXIT_SUCCESS);
4041 }
4042
4043 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4044 if (l < 0)
4045 return log_error_errno(errno, "Failed to send PID: %m");
4046 if (l != sizeof(pid))
4047 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4048 "Short write while sending PID.");
4049
4050 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4051 if (l < 0)
4052 return log_error_errno(errno, "Failed to send machine ID: %m");
4053 if (l != sizeof(arg_uuid))
4054 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4055 "Short write while sending machine ID.");
4056
4057 l = send_one_fd(fd_outer_socket, fd, 0);
4058 if (l < 0)
4059 return log_error_errno(l, "Failed to send notify fd: %m");
4060
4061 fd_outer_socket = safe_close(fd_outer_socket);
4062 fd_inner_socket = safe_close(fd_inner_socket);
4063 netns_fd = safe_close(netns_fd);
4064
4065 return 0;
4066 }
4067
4068 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4069 bool tried_hashed = false;
4070 unsigned n_tries = 100;
4071 uid_t candidate;
4072 int r;
4073
4074 assert(shift);
4075 assert(ret_lock_file);
4076 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4077 assert(arg_uid_range == 0x10000U);
4078
4079 candidate = *shift;
4080
4081 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4082
4083 for (;;) {
4084 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4085 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4086
4087 if (--n_tries <= 0)
4088 return -EBUSY;
4089
4090 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4091 goto next;
4092 if ((candidate & UINT32_C(0xFFFF)) != 0)
4093 goto next;
4094
4095 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4096 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4097 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4098 goto next;
4099 if (r < 0)
4100 return r;
4101
4102 /* Make some superficial checks whether the range is currently known in the user database */
4103 if (getpwuid(candidate))
4104 goto next;
4105 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4106 goto next;
4107 if (getgrgid(candidate))
4108 goto next;
4109 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4110 goto next;
4111
4112 *ret_lock_file = lf;
4113 lf = (struct LockFile) LOCK_FILE_INIT;
4114 *shift = candidate;
4115 return 0;
4116
4117 next:
4118 if (arg_machine && !tried_hashed) {
4119 /* Try to hash the base from the container name */
4120
4121 static const uint8_t hash_key[] = {
4122 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4123 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4124 };
4125
4126 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4127
4128 tried_hashed = true;
4129 } else
4130 random_bytes(&candidate, sizeof(candidate));
4131
4132 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4133 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4134 }
4135 }
4136
4137 static int add_one_uid_map(
4138 char **p,
4139 uid_t container_uid,
4140 uid_t host_uid,
4141 uid_t range) {
4142
4143 return strextendf(p,
4144 UID_FMT " " UID_FMT " " UID_FMT "\n",
4145 container_uid, host_uid, range);
4146 }
4147
4148 static int make_uid_map_string(
4149 const uid_t bind_user_uid[],
4150 size_t n_bind_user_uid,
4151 size_t offset,
4152 char **ret) {
4153
4154 _cleanup_free_ char *s = NULL;
4155 uid_t previous_uid = 0;
4156 int r;
4157
4158 assert(n_bind_user_uid == 0 || bind_user_uid);
4159 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4160 assert(ret);
4161
4162 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4163 * quadruplet, consisting of host and container UID + GID. */
4164
4165 for (size_t i = 0; i < n_bind_user_uid; i++) {
4166 uid_t payload_uid = bind_user_uid[i*4+offset],
4167 host_uid = bind_user_uid[i*4+offset+1];
4168
4169 assert(previous_uid <= payload_uid);
4170 assert(payload_uid < arg_uid_range);
4171
4172 /* Add a range to close the gap to previous entry */
4173 if (payload_uid > previous_uid) {
4174 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4175 if (r < 0)
4176 return r;
4177 }
4178
4179 /* Map this specific user */
4180 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4181 if (r < 0)
4182 return r;
4183
4184 previous_uid = payload_uid + 1;
4185 }
4186
4187 /* And add a range to close the gap to finish the range */
4188 if (arg_uid_range > previous_uid) {
4189 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4190 if (r < 0)
4191 return r;
4192 }
4193
4194 assert(s);
4195
4196 *ret = TAKE_PTR(s);
4197 return 0;
4198 }
4199
4200 static int setup_uid_map(
4201 pid_t pid,
4202 const uid_t bind_user_uid[],
4203 size_t n_bind_user_uid) {
4204
4205 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4206 _cleanup_free_ char *s = NULL;
4207 int r;
4208
4209 assert(pid > 1);
4210
4211 /* Build the UID map string */
4212 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4213 return log_oom();
4214
4215 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4216 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4217 if (r < 0)
4218 return log_error_errno(r, "Failed to write UID map: %m");
4219
4220 /* And now build the GID map string */
4221 s = mfree(s);
4222 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4223 return log_oom();
4224
4225 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4226 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4227 if (r < 0)
4228 return log_error_errno(r, "Failed to write GID map: %m");
4229
4230 return 0;
4231 }
4232
4233 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4234 char buf[NOTIFY_BUFFER_MAX+1];
4235 char *p = NULL;
4236 struct iovec iovec = {
4237 .iov_base = buf,
4238 .iov_len = sizeof(buf)-1,
4239 };
4240 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4241 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4242 struct msghdr msghdr = {
4243 .msg_iov = &iovec,
4244 .msg_iovlen = 1,
4245 .msg_control = &control,
4246 .msg_controllen = sizeof(control),
4247 };
4248 struct ucred *ucred;
4249 ssize_t n;
4250 pid_t inner_child_pid;
4251 _cleanup_strv_free_ char **tags = NULL;
4252 int r;
4253
4254 assert(userdata);
4255
4256 inner_child_pid = PTR_TO_PID(userdata);
4257
4258 if (revents != EPOLLIN) {
4259 log_warning("Got unexpected poll event for notify fd.");
4260 return 0;
4261 }
4262
4263 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4264 if (n < 0) {
4265 if (ERRNO_IS_TRANSIENT(n))
4266 return 0;
4267 if (n == -EXFULL) {
4268 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4269 return 0;
4270 }
4271 return log_warning_errno(n, "Couldn't read notification socket: %m");
4272 }
4273
4274 cmsg_close_all(&msghdr);
4275
4276 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4277 if (!ucred || ucred->pid != inner_child_pid) {
4278 log_debug("Received notify message without valid credentials. Ignoring.");
4279 return 0;
4280 }
4281
4282 if ((size_t) n >= sizeof(buf)) {
4283 log_warning("Received notify message exceeded maximum size. Ignoring.");
4284 return 0;
4285 }
4286
4287 buf[n] = 0;
4288 tags = strv_split(buf, "\n\r");
4289 if (!tags)
4290 return log_oom();
4291
4292 if (strv_contains(tags, "READY=1")) {
4293 r = sd_notify(false, "READY=1\n");
4294 if (r < 0)
4295 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4296 }
4297
4298 p = strv_find_startswith(tags, "STATUS=");
4299 if (p)
4300 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4301
4302 return 0;
4303 }
4304
4305 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4306 int r;
4307
4308 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4309 if (r < 0)
4310 return log_error_errno(r, "Failed to allocate notify event source: %m");
4311
4312 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4313
4314 return 0;
4315 }
4316
4317 static int merge_settings(Settings *settings, const char *path) {
4318 int rl;
4319
4320 assert(settings);
4321 assert(path);
4322
4323 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4324 * that this steals the fields of the Settings* structure, and hence modifies it. */
4325
4326 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4327 settings->start_mode >= 0) {
4328 arg_start_mode = settings->start_mode;
4329 strv_free_and_replace(arg_parameters, settings->parameters);
4330 }
4331
4332 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4333 settings->ephemeral >= 0)
4334 arg_ephemeral = settings->ephemeral;
4335
4336 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4337 settings->root) {
4338
4339 if (!arg_settings_trusted)
4340 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4341 else
4342 free_and_replace(arg_directory, settings->root);
4343 }
4344
4345 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4346 settings->pivot_root_new) {
4347 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4348 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4349 }
4350
4351 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4352 settings->working_directory)
4353 free_and_replace(arg_chdir, settings->working_directory);
4354
4355 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4356 settings->environment)
4357 strv_free_and_replace(arg_setenv, settings->environment);
4358
4359 if ((arg_settings_mask & SETTING_USER) == 0) {
4360
4361 if (settings->user)
4362 free_and_replace(arg_user, settings->user);
4363
4364 if (uid_is_valid(settings->uid))
4365 arg_uid = settings->uid;
4366 if (gid_is_valid(settings->gid))
4367 arg_gid = settings->gid;
4368 if (settings->n_supplementary_gids > 0) {
4369 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4370 arg_n_supplementary_gids = settings->n_supplementary_gids;
4371 }
4372 }
4373
4374 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4375 uint64_t plus, minus;
4376 uint64_t network_minus = 0;
4377 uint64_t ambient;
4378
4379 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4380 * Settings structure */
4381
4382 plus = settings->capability;
4383 minus = settings->drop_capability;
4384
4385 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4386 settings_network_configured(settings)) {
4387 if (settings_private_network(settings))
4388 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4389 else
4390 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4391 }
4392
4393 if (!arg_settings_trusted && plus != 0) {
4394 if (settings->capability != 0)
4395 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4396 } else {
4397 arg_caps_retain &= ~network_minus;
4398 arg_caps_retain |= plus;
4399 }
4400
4401 arg_caps_retain &= ~minus;
4402
4403 /* Copy the full capabilities over too */
4404 if (capability_quintet_is_set(&settings->full_capabilities)) {
4405 if (!arg_settings_trusted)
4406 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4407 else
4408 arg_full_capabilities = settings->full_capabilities;
4409 }
4410
4411 ambient = settings->ambient_capability;
4412 if (!arg_settings_trusted && ambient != 0)
4413 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4414 else
4415 arg_caps_ambient |= ambient;
4416 }
4417
4418 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4419 settings->kill_signal > 0)
4420 arg_kill_signal = settings->kill_signal;
4421
4422 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4423 settings->personality != PERSONALITY_INVALID)
4424 arg_personality = settings->personality;
4425
4426 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4427 !sd_id128_is_null(settings->machine_id)) {
4428
4429 if (!arg_settings_trusted)
4430 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4431 else
4432 arg_uuid = settings->machine_id;
4433 }
4434
4435 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4436 settings->read_only >= 0)
4437 arg_read_only = settings->read_only;
4438
4439 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4440 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4441 arg_volatile_mode = settings->volatile_mode;
4442
4443 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4444 settings->n_custom_mounts > 0) {
4445
4446 if (!arg_settings_trusted)
4447 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4448 else {
4449 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4450 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4451 arg_n_custom_mounts = settings->n_custom_mounts;
4452 settings->n_custom_mounts = 0;
4453 }
4454 }
4455
4456 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4457 settings_network_configured(settings)) {
4458
4459 if (!arg_settings_trusted)
4460 log_warning("Ignoring network settings, file %s is not trusted.", path);
4461 else {
4462 arg_network_veth = settings_network_veth(settings);
4463 arg_private_network = settings_private_network(settings);
4464
4465 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4466 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4467 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4468 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4469
4470 free_and_replace(arg_network_bridge, settings->network_bridge);
4471 free_and_replace(arg_network_zone, settings->network_zone);
4472
4473 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4474 }
4475 }
4476
4477 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4478 settings->expose_ports) {
4479
4480 if (!arg_settings_trusted)
4481 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4482 else {
4483 expose_port_free_all(arg_expose_ports);
4484 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4485 }
4486 }
4487
4488 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4489 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4490
4491 if (!arg_settings_trusted)
4492 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4493 else {
4494 arg_userns_mode = settings->userns_mode;
4495 arg_uid_shift = settings->uid_shift;
4496 arg_uid_range = settings->uid_range;
4497 arg_userns_ownership = settings->userns_ownership;
4498 }
4499 }
4500
4501 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4502 !strv_isempty(settings->bind_user))
4503 strv_free_and_replace(arg_bind_user, settings->bind_user);
4504
4505 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4506 settings->notify_ready >= 0)
4507 arg_notify_ready = settings->notify_ready;
4508
4509 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4510
4511 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4512 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4513 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4514 else {
4515 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4516 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4517 }
4518 }
4519
4520 #if HAVE_SECCOMP
4521 if (settings->seccomp) {
4522 if (!arg_settings_trusted)
4523 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4524 else {
4525 seccomp_release(arg_seccomp);
4526 arg_seccomp = TAKE_PTR(settings->seccomp);
4527 }
4528 }
4529 #endif
4530 }
4531
4532 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4533 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4534 continue;
4535
4536 if (!settings->rlimit[rl])
4537 continue;
4538
4539 if (!arg_settings_trusted) {
4540 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4541 continue;
4542 }
4543
4544 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4545 }
4546
4547 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4548 settings->hostname)
4549 free_and_replace(arg_hostname, settings->hostname);
4550
4551 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4552 settings->no_new_privileges >= 0)
4553 arg_no_new_privileges = settings->no_new_privileges;
4554
4555 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4556 settings->oom_score_adjust_set) {
4557
4558 if (!arg_settings_trusted)
4559 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4560 else {
4561 arg_oom_score_adjust = settings->oom_score_adjust;
4562 arg_oom_score_adjust_set = true;
4563 }
4564 }
4565
4566 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4567 settings->cpu_set.set) {
4568
4569 if (!arg_settings_trusted)
4570 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4571 else {
4572 cpu_set_reset(&arg_cpu_set);
4573 arg_cpu_set = settings->cpu_set;
4574 settings->cpu_set = (CPUSet) {};
4575 }
4576 }
4577
4578 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4579 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4580 arg_resolv_conf = settings->resolv_conf;
4581
4582 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4583 settings->link_journal != _LINK_JOURNAL_INVALID) {
4584
4585 if (!arg_settings_trusted)
4586 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4587 else {
4588 arg_link_journal = settings->link_journal;
4589 arg_link_journal_try = settings->link_journal_try;
4590 }
4591 }
4592
4593 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4594 settings->timezone != _TIMEZONE_MODE_INVALID)
4595 arg_timezone = settings->timezone;
4596
4597 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4598 settings->slice) {
4599
4600 if (!arg_settings_trusted)
4601 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4602 else
4603 free_and_replace(arg_slice, settings->slice);
4604 }
4605
4606 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4607 settings->use_cgns >= 0) {
4608
4609 if (!arg_settings_trusted)
4610 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4611 else
4612 arg_use_cgns = settings->use_cgns;
4613 }
4614
4615 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4616 settings->clone_ns_flags != ULONG_MAX) {
4617
4618 if (!arg_settings_trusted)
4619 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4620 else
4621 arg_clone_ns_flags = settings->clone_ns_flags;
4622 }
4623
4624 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4625 settings->console_mode >= 0) {
4626
4627 if (!arg_settings_trusted)
4628 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4629 else
4630 arg_console_mode = settings->console_mode;
4631 }
4632
4633 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4634 settings->suppress_sync >= 0)
4635 arg_suppress_sync = settings->suppress_sync;
4636
4637 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4638 * don't consult arg_settings_mask for them. */
4639
4640 sd_bus_message_unref(arg_property_message);
4641 arg_property_message = TAKE_PTR(settings->properties);
4642
4643 arg_console_width = settings->console_width;
4644 arg_console_height = settings->console_height;
4645
4646 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4647 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4648 arg_n_extra_nodes = settings->n_extra_nodes;
4649
4650 return 0;
4651 }
4652
4653 static int load_settings(void) {
4654 _cleanup_(settings_freep) Settings *settings = NULL;
4655 _cleanup_fclose_ FILE *f = NULL;
4656 _cleanup_free_ char *p = NULL;
4657 int r;
4658
4659 if (arg_oci_bundle)
4660 return 0;
4661
4662 /* If all settings are masked, there's no point in looking for
4663 * the settings file */
4664 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4665 return 0;
4666
4667 /* We first look in the admin's directories in /etc and /run */
4668 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4669 _cleanup_free_ char *j = NULL;
4670
4671 j = path_join(i, arg_settings_filename);
4672 if (!j)
4673 return log_oom();
4674
4675 f = fopen(j, "re");
4676 if (f) {
4677 p = TAKE_PTR(j);
4678
4679 /* By default, we trust configuration from /etc and /run */
4680 if (arg_settings_trusted < 0)
4681 arg_settings_trusted = true;
4682
4683 break;
4684 }
4685
4686 if (errno != ENOENT)
4687 return log_error_errno(errno, "Failed to open %s: %m", j);
4688 }
4689
4690 if (!f) {
4691 /* After that, let's look for a file next to the
4692 * actual image we shall boot. */
4693
4694 if (arg_image) {
4695 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4696 if (r < 0)
4697 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4698 } else if (arg_directory) {
4699 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4700 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4701 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
4702 }
4703
4704 if (p) {
4705 f = fopen(p, "re");
4706 if (!f && errno != ENOENT)
4707 return log_error_errno(errno, "Failed to open %s: %m", p);
4708
4709 /* By default, we do not trust configuration from /var/lib/machines */
4710 if (arg_settings_trusted < 0)
4711 arg_settings_trusted = false;
4712 }
4713 }
4714
4715 if (!f)
4716 return 0;
4717
4718 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4719
4720 r = settings_load(f, p, &settings);
4721 if (r < 0)
4722 return r;
4723
4724 return merge_settings(settings, p);
4725 }
4726
4727 static int load_oci_bundle(void) {
4728 _cleanup_(settings_freep) Settings *settings = NULL;
4729 int r;
4730
4731 if (!arg_oci_bundle)
4732 return 0;
4733
4734 /* By default let's trust OCI bundles */
4735 if (arg_settings_trusted < 0)
4736 arg_settings_trusted = true;
4737
4738 r = oci_load(NULL, arg_oci_bundle, &settings);
4739 if (r < 0)
4740 return r;
4741
4742 return merge_settings(settings, arg_oci_bundle);
4743 }
4744
4745 static int run_container(
4746 DissectedImage *dissected_image,
4747 FDSet *fds,
4748 char veth_name[IFNAMSIZ], bool *veth_created,
4749 struct ExposeArgs *expose_args,
4750 int *master, pid_t *pid, int *ret) {
4751
4752 static const struct sigaction sa = {
4753 .sa_handler = nop_signal_handler,
4754 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4755 };
4756
4757 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4758 _cleanup_close_ int etc_passwd_lock = -EBADF;
4759 _cleanup_close_pair_ int
4760 fd_inner_socket_pair[2] = PIPE_EBADF,
4761 fd_outer_socket_pair[2] = PIPE_EBADF;
4762
4763 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
4764 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4765 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4766 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4767 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4768 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4769 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4770 _cleanup_free_ uid_t *bind_user_uid = NULL;
4771 size_t n_bind_user_uid = 0;
4772 ContainerStatus container_status = 0;
4773 int ifi = 0, r;
4774 ssize_t l;
4775 sigset_t mask_chld;
4776 _cleanup_close_ int child_netns_fd = -EBADF;
4777
4778 assert_se(sigemptyset(&mask_chld) == 0);
4779 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4780
4781 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4782 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4783 * check with getpwuid() if the specific user already exists. Note that /etc might be
4784 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4785 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4786 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4787 * really ours. */
4788
4789 etc_passwd_lock = take_etc_passwd_lock(NULL);
4790 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4791 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4792 }
4793
4794 r = barrier_create(&barrier);
4795 if (r < 0)
4796 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4797
4798 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4799 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4800
4801 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4802 return log_error_errno(errno, "Failed to create outer socket pair: %m");
4803
4804 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4805 * parent's blocking calls and give it a chance to call wait() and terminate. */
4806 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4807 if (r < 0)
4808 return log_error_errno(errno, "Failed to change the signal mask: %m");
4809
4810 r = sigaction(SIGCHLD, &sa, NULL);
4811 if (r < 0)
4812 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4813
4814 if (arg_network_namespace_path) {
4815 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4816 if (child_netns_fd < 0)
4817 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4818
4819 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4820 if (r == -EUCLEAN)
4821 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4822 else if (r < 0)
4823 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4824 else if (r == 0)
4825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4826 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4827 }
4828
4829 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4830 if (*pid < 0)
4831 return log_error_errno(errno, "clone() failed%s: %m",
4832 errno == EINVAL ?
4833 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4834
4835 if (*pid == 0) {
4836 /* The outer child only has a file system namespace. */
4837 barrier_set_role(&barrier, BARRIER_CHILD);
4838
4839 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
4840 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
4841
4842 (void) reset_all_signal_handlers();
4843 (void) reset_signal_mask();
4844
4845 r = outer_child(&barrier,
4846 arg_directory,
4847 dissected_image,
4848 fd_outer_socket_pair[1],
4849 fd_inner_socket_pair[1],
4850 fds,
4851 child_netns_fd);
4852 if (r < 0)
4853 _exit(EXIT_FAILURE);
4854
4855 _exit(EXIT_SUCCESS);
4856 }
4857
4858 barrier_set_role(&barrier, BARRIER_PARENT);
4859
4860 fdset_close(fds);
4861
4862 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
4863 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
4864
4865 if (arg_userns_mode != USER_NAMESPACE_NO) {
4866 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
4867 if (mntns_fd < 0)
4868 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4869
4870 /* The child just let us know the UID shift it might have read from the image. */
4871 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4872 if (l < 0)
4873 return log_error_errno(errno, "Failed to read UID shift: %m");
4874 if (l != sizeof arg_uid_shift)
4875 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4876
4877 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4878 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4879 * image, but if that's already in use, pick a new one, and report back to the child,
4880 * which one we now picked. */
4881
4882 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4883 if (r < 0)
4884 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4885
4886 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4887 if (l < 0)
4888 return log_error_errno(errno, "Failed to send UID shift: %m");
4889 if (l != sizeof arg_uid_shift)
4890 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4891 }
4892
4893 n_bind_user_uid = strv_length(arg_bind_user);
4894 if (n_bind_user_uid > 0) {
4895 /* Right after the UID shift, we'll receive the list of UID mappings for the
4896 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4897
4898 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4899 if (!bind_user_uid)
4900 return log_oom();
4901
4902 for (size_t i = 0; i < n_bind_user_uid; i++) {
4903 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4904 if (l < 0)
4905 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4906 if (l != sizeof(uid_t)*4)
4907 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4908 SYNTHETIC_ERRNO(EIO),
4909 "Short read while reading bind user UID pairs.");
4910 }
4911 }
4912 }
4913
4914 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4915 /* The child let us know the support cgroup mode it might have read from the image. */
4916 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4917 if (l < 0)
4918 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4919 if (l != sizeof(arg_unified_cgroup_hierarchy))
4920 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
4921 l, l == 0 ? " The child is most likely dead." : "");
4922 }
4923
4924 /* Wait for the outer child. */
4925 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4926 if (r < 0)
4927 return r;
4928 if (r != EXIT_SUCCESS)
4929 return -EIO;
4930
4931 /* And now retrieve the PID of the inner child. */
4932 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
4933 if (l < 0)
4934 return log_error_errno(errno, "Failed to read inner child PID: %m");
4935 if (l != sizeof *pid)
4936 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4937
4938 /* We also retrieve container UUID in case it was generated by outer child */
4939 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4940 if (l < 0)
4941 return log_error_errno(errno, "Failed to read container machine ID: %m");
4942 if (l != sizeof(arg_uuid))
4943 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4944
4945 /* We also retrieve the socket used for notifications generated by outer child */
4946 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
4947 if (notify_socket < 0)
4948 return log_error_errno(notify_socket,
4949 "Failed to receive notification socket from the outer child: %m");
4950
4951 log_debug("Init process invoked as PID "PID_FMT, *pid);
4952
4953 if (arg_userns_mode != USER_NAMESPACE_NO) {
4954 if (!barrier_place_and_sync(&barrier)) /* #1 */
4955 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4956
4957 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4958 if (r < 0)
4959 return r;
4960
4961 (void) barrier_place(&barrier); /* #2 */
4962 }
4963
4964 if (arg_private_network) {
4965 if (!arg_network_namespace_path) {
4966 /* Wait until the child has unshared its network namespace. */
4967 if (!barrier_place_and_sync(&barrier)) /* #3 */
4968 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4969 }
4970
4971 if (child_netns_fd < 0) {
4972 /* Make sure we have an open file descriptor to the child's network
4973 * namespace so it stays alive even if the child exits. */
4974 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4975 if (r < 0)
4976 return log_error_errno(r, "Failed to open child network namespace: %m");
4977 }
4978
4979 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4980 if (r < 0)
4981 return r;
4982
4983 if (arg_network_veth) {
4984 r = setup_veth(arg_machine, *pid, veth_name,
4985 arg_network_bridge || arg_network_zone);
4986 if (r < 0)
4987 return r;
4988 else if (r > 0)
4989 ifi = r;
4990
4991 if (arg_network_bridge) {
4992 /* Add the interface to a bridge */
4993 r = setup_bridge(veth_name, arg_network_bridge, false);
4994 if (r < 0)
4995 return r;
4996 if (r > 0)
4997 ifi = r;
4998 } else if (arg_network_zone) {
4999 /* Add the interface to a bridge, possibly creating it */
5000 r = setup_bridge(veth_name, arg_network_zone, true);
5001 if (r < 0)
5002 return r;
5003 if (r > 0)
5004 ifi = r;
5005 }
5006 }
5007
5008 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5009 if (r < 0)
5010 return r;
5011
5012 /* We created the primary and extra veth links now; let's remember this, so that we know to
5013 remove them later on. Note that we don't bother with removing veth links that were created
5014 here when their setup failed half-way, because in that case the kernel should be able to
5015 remove them on its own, since they cannot be referenced by anything yet. */
5016 *veth_created = true;
5017
5018 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5019 if (r < 0)
5020 return r;
5021
5022 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5023 if (r < 0)
5024 return r;
5025 }
5026
5027 if (arg_register || !arg_keep_unit) {
5028 r = sd_bus_default_system(&bus);
5029 if (r < 0)
5030 return log_error_errno(r, "Failed to open system bus: %m");
5031
5032 r = sd_bus_set_close_on_exit(bus, false);
5033 if (r < 0)
5034 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5035 }
5036
5037 if (!arg_keep_unit) {
5038 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5039 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5040 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5041
5042 r = sd_bus_match_signal_async(
5043 bus,
5044 NULL,
5045 "org.freedesktop.systemd1",
5046 NULL,
5047 "org.freedesktop.systemd1.Scope",
5048 "RequestStop",
5049 on_request_stop, NULL, PID_TO_PTR(*pid));
5050 if (r < 0)
5051 return log_error_errno(r, "Failed to request RequestStop match: %m");
5052 }
5053
5054 if (arg_register) {
5055 r = register_machine(
5056 bus,
5057 arg_machine,
5058 *pid,
5059 arg_directory,
5060 arg_uuid,
5061 ifi,
5062 arg_slice,
5063 arg_custom_mounts, arg_n_custom_mounts,
5064 arg_kill_signal,
5065 arg_property,
5066 arg_property_message,
5067 arg_keep_unit,
5068 arg_container_service_name);
5069 if (r < 0)
5070 return r;
5071
5072 } else if (!arg_keep_unit) {
5073 r = allocate_scope(
5074 bus,
5075 arg_machine,
5076 *pid,
5077 arg_slice,
5078 arg_custom_mounts, arg_n_custom_mounts,
5079 arg_kill_signal,
5080 arg_property,
5081 arg_property_message);
5082 if (r < 0)
5083 return r;
5084
5085 } else if (arg_slice || arg_property)
5086 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5087
5088 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5089 if (r < 0)
5090 return r;
5091
5092 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5093 if (r < 0)
5094 return r;
5095
5096 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5097 if (r < 0)
5098 return r;
5099
5100 /* Notify the child that the parent is ready with all
5101 * its setup (including cgroup-ification), and that
5102 * the child can now hand over control to the code to
5103 * run inside the container. */
5104 (void) barrier_place(&barrier); /* #4 */
5105
5106 /* Block SIGCHLD here, before notifying child.
5107 * process_pty() will handle it with the other signals. */
5108 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5109
5110 /* Reset signal to default */
5111 r = default_signals(SIGCHLD);
5112 if (r < 0)
5113 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5114
5115 r = sd_event_new(&event);
5116 if (r < 0)
5117 return log_error_errno(r, "Failed to get default event source: %m");
5118
5119 (void) sd_event_set_watchdog(event, true);
5120
5121 if (bus) {
5122 r = sd_bus_attach_event(bus, event, 0);
5123 if (r < 0)
5124 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5125 }
5126
5127 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5128 if (r < 0)
5129 return r;
5130
5131 if (arg_userns_mode != USER_NAMESPACE_NO) {
5132 r = wipe_fully_visible_fs(mntns_fd);
5133 if (r < 0)
5134 return r;
5135 mntns_fd = safe_close(mntns_fd);
5136 }
5137
5138 /* Let the child know that we are ready and wait that the child is completely ready now. */
5139 if (!barrier_place_and_sync(&barrier)) /* #5 */
5140 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5141
5142 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5143 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5144 etc_passwd_lock = safe_close(etc_passwd_lock);
5145
5146 (void) sd_notifyf(false,
5147 "STATUS=Container running.\n"
5148 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5149 if (!arg_notify_ready) {
5150 r = sd_notify(false, "READY=1\n");
5151 if (r < 0)
5152 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5153 }
5154
5155 if (arg_kill_signal > 0) {
5156 /* Try to kill the init system on SIGINT or SIGTERM */
5157 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5158 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5159 } else {
5160 /* Immediately exit */
5161 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5162 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5163 }
5164
5165 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5166
5167 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5168 if (r < 0)
5169 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5170
5171 /* Exit when the child exits */
5172 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5173
5174 /* Retrieve the kmsg fifo allocated by inner child */
5175 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5176 if (fd_kmsg_fifo < 0)
5177 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5178
5179 if (arg_expose_ports) {
5180 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
5181 if (r < 0)
5182 return r;
5183
5184 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5185 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5186 }
5187
5188 if (arg_console_mode != CONSOLE_PIPE) {
5189 _cleanup_close_ int fd = -EBADF;
5190 PTYForwardFlags flags = 0;
5191
5192 /* Retrieve the master pty allocated by inner child */
5193 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5194 if (fd < 0)
5195 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5196
5197 switch (arg_console_mode) {
5198
5199 case CONSOLE_READ_ONLY:
5200 flags |= PTY_FORWARD_READ_ONLY;
5201
5202 _fallthrough_;
5203
5204 case CONSOLE_INTERACTIVE:
5205 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5206
5207 r = pty_forward_new(event, fd, flags, &forward);
5208 if (r < 0)
5209 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5210
5211 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5212 (void) pty_forward_set_width_height(forward,
5213 arg_console_width,
5214 arg_console_height);
5215 break;
5216
5217 default:
5218 assert(arg_console_mode == CONSOLE_PASSIVE);
5219 }
5220
5221 *master = TAKE_FD(fd);
5222 }
5223
5224 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5225
5226 r = sd_event_loop(event);
5227 if (r < 0)
5228 return log_error_errno(r, "Failed to run event loop: %m");
5229
5230 if (forward) {
5231 char last_char = 0;
5232
5233 (void) pty_forward_get_last_char(forward, &last_char);
5234 forward = pty_forward_free(forward);
5235
5236 if (!arg_quiet && last_char != '\n')
5237 putc('\n', stdout);
5238 }
5239
5240 /* Kill if it is not dead yet anyway */
5241 if (!arg_register && !arg_keep_unit && bus)
5242 terminate_scope(bus, arg_machine);
5243
5244 /* Normally redundant, but better safe than sorry */
5245 (void) kill(*pid, SIGKILL);
5246
5247 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5248
5249 if (arg_private_network) {
5250 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5251 * to avoid having to move the parent to the child network namespace. */
5252 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5253 if (r < 0)
5254 return r;
5255
5256 if (r == 0) {
5257 _cleanup_close_ int parent_netns_fd = -EBADF;
5258
5259 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5260 if (r < 0) {
5261 log_error_errno(r, "Failed to open parent network namespace: %m");
5262 _exit(EXIT_FAILURE);
5263 }
5264
5265 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5266 if (r < 0) {
5267 log_error_errno(r, "Failed to enter child network namespace: %m");
5268 _exit(EXIT_FAILURE);
5269 }
5270
5271 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5272 if (r < 0)
5273 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5274
5275 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5276 }
5277 }
5278
5279 r = wait_for_container(TAKE_PID(*pid), &container_status);
5280
5281 /* Tell machined that we are gone. */
5282 if (bus)
5283 (void) unregister_machine(bus, arg_machine);
5284
5285 if (r < 0)
5286 /* We failed to wait for the container, or the container exited abnormally. */
5287 return r;
5288 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5289 /* r > 0 → The container exited with a non-zero status.
5290 * As a special case, we need to replace 133 with a different value,
5291 * because 133 is special-cased in the service file to reboot the container.
5292 * otherwise → The container exited with zero status and a reboot was not requested.
5293 */
5294 if (r == EXIT_FORCE_RESTART)
5295 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5296 *ret = r;
5297 return 0; /* finito */
5298 }
5299
5300 /* CONTAINER_REBOOTED, loop again */
5301
5302 if (arg_keep_unit) {
5303 /* Special handling if we are running as a service: instead of simply
5304 * restarting the machine we want to restart the entire service, so let's
5305 * inform systemd about this with the special exit code 133. The service
5306 * file uses RestartForceExitStatus=133 so that this results in a full
5307 * nspawn restart. This is necessary since we might have cgroup parameters
5308 * set we want to have flushed out. */
5309 *ret = EXIT_FORCE_RESTART;
5310 return 0; /* finito */
5311 }
5312
5313 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5314 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5315
5316 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5317 *veth_created = false;
5318 return 1; /* loop again */
5319 }
5320
5321 static int initialize_rlimits(void) {
5322 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5323 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5324 * container execution environments. */
5325
5326 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5327 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5328 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5329 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5330 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5331 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5332 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5333 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5334 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5335 [RLIMIT_NICE] = { 0, 0 },
5336 [RLIMIT_NOFILE] = { 1024, 4096 },
5337 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5338 [RLIMIT_RTPRIO] = { 0, 0 },
5339 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5340 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5341
5342 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5343 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5344 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5345 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5346 * that PID 1 changes a number of other resource limits during early initialization which is why we
5347 * don't read the other limits from PID 1 but prefer the static table above. */
5348 };
5349
5350 int rl;
5351
5352 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5353 /* Let's only fill in what the user hasn't explicitly configured anyway */
5354 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5355 const struct rlimit *v;
5356 struct rlimit buffer;
5357
5358 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5359 /* For these two let's read the limits off PID 1. See above for an explanation. */
5360
5361 if (prlimit(1, rl, NULL, &buffer) < 0)
5362 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5363
5364 v = &buffer;
5365 } else if (rl == RLIMIT_NOFILE) {
5366 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5367 * userspace. Given that nspawn containers are often run without our PID 1,
5368 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5369 * so that container userspace gets similar resources as host userspace
5370 * gets. */
5371 buffer = kernel_defaults[rl];
5372 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5373 v = &buffer;
5374 } else
5375 v = kernel_defaults + rl;
5376
5377 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5378 if (!arg_rlimit[rl])
5379 return log_oom();
5380 }
5381
5382 if (DEBUG_LOGGING) {
5383 _cleanup_free_ char *k = NULL;
5384
5385 (void) rlimit_format(arg_rlimit[rl], &k);
5386 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5387 }
5388 }
5389
5390 return 0;
5391 }
5392
5393 static int cant_be_in_netns(void) {
5394 _cleanup_close_ int fd = -EBADF;
5395 struct ucred ucred;
5396 int r;
5397
5398 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5399 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5400 * nice message. */
5401
5402 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5403 return 0;
5404
5405 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5406 if (fd < 0)
5407 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5408
5409 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5410 if (r < 0) {
5411 if (r == -ENOENT || ERRNO_IS_DISCONNECT(r))
5412 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5413 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5414
5415 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5416 }
5417
5418 r = getpeercred(fd, &ucred);
5419 if (r < 0)
5420 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5421
5422 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
5423 if (r < 0)
5424 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5425 if (r == 0)
5426 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5427 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5428 return 0;
5429 }
5430
5431 static int run(int argc, char *argv[]) {
5432 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5433 _cleanup_close_ int master = -EBADF;
5434 _cleanup_fdset_free_ FDSet *fds = NULL;
5435 int r, n_fd_passed, ret = EXIT_SUCCESS;
5436 char veth_name[IFNAMSIZ] = "";
5437 struct ExposeArgs expose_args = {};
5438 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5439 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5440 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5441 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5442 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5443 pid_t pid = 0;
5444
5445 log_parse_environment();
5446 log_open();
5447
5448 r = parse_argv(argc, argv);
5449 if (r <= 0)
5450 goto finish;
5451
5452 if (geteuid() != 0) {
5453 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5454 argc >= 2 ? "Need to be root." :
5455 "Need to be root (and some arguments are usually required).\nHint: try --help");
5456 goto finish;
5457 }
5458
5459 r = cant_be_in_netns();
5460 if (r < 0)
5461 goto finish;
5462
5463 r = initialize_rlimits();
5464 if (r < 0)
5465 goto finish;
5466
5467 r = load_oci_bundle();
5468 if (r < 0)
5469 goto finish;
5470
5471 r = determine_names();
5472 if (r < 0)
5473 goto finish;
5474
5475 r = load_settings();
5476 if (r < 0)
5477 goto finish;
5478
5479 r = cg_unified();
5480 if (r < 0) {
5481 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5482 goto finish;
5483 }
5484
5485 r = verify_arguments();
5486 if (r < 0)
5487 goto finish;
5488
5489 /* Reapply environment settings. */
5490 (void) detect_unified_cgroup_hierarchy_from_environment();
5491
5492 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5493 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5494 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5495 (void) ignore_signals(SIGPIPE);
5496
5497 n_fd_passed = sd_listen_fds(false);
5498 if (n_fd_passed > 0) {
5499 r = fdset_new_listen_fds(&fds, false);
5500 if (r < 0) {
5501 log_error_errno(r, "Failed to collect file descriptors: %m");
5502 goto finish;
5503 }
5504 }
5505
5506 /* The "default" umask. This is appropriate for most file and directory
5507 * operations performed by nspawn, and is the umask that will be used for
5508 * the child. Functions like copy_devnodes() change the umask temporarily. */
5509 umask(0022);
5510
5511 if (arg_directory) {
5512 assert(!arg_image);
5513
5514 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5515 * /var from the host will propagate into container dynamically (because bad things happen if
5516 * two systems write to the same /var). Let's allow it for the special cases where /var is
5517 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5518 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5519 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5520 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5521 goto finish;
5522 }
5523
5524 if (arg_ephemeral) {
5525 _cleanup_free_ char *np = NULL;
5526
5527 r = chase_and_update(&arg_directory, 0);
5528 if (r < 0)
5529 goto finish;
5530
5531 /* If the specified path is a mount point we generate the new snapshot immediately
5532 * inside it under a random name. However if the specified is not a mount point we
5533 * create the new snapshot in the parent directory, just next to it. */
5534 r = path_is_mount_point(arg_directory, NULL, 0);
5535 if (r < 0) {
5536 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5537 goto finish;
5538 }
5539 if (r > 0)
5540 r = tempfn_random_child(arg_directory, "machine.", &np);
5541 else
5542 r = tempfn_random(arg_directory, "machine.", &np);
5543 if (r < 0) {
5544 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5545 goto finish;
5546 }
5547
5548 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5549 * only owned by us and no one else. */
5550 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5551 if (r < 0) {
5552 log_error_errno(r, "Failed to lock %s: %m", np);
5553 goto finish;
5554 }
5555
5556 {
5557 BLOCK_SIGNALS(SIGINT);
5558 r = btrfs_subvol_snapshot(arg_directory, np,
5559 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5560 BTRFS_SNAPSHOT_FALLBACK_COPY |
5561 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5562 BTRFS_SNAPSHOT_RECURSIVE |
5563 BTRFS_SNAPSHOT_QUOTA |
5564 BTRFS_SNAPSHOT_SIGINT);
5565 }
5566 if (r == -EINTR) {
5567 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5568 goto finish;
5569 }
5570 if (r < 0) {
5571 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5572 goto finish;
5573 }
5574
5575 free_and_replace(arg_directory, np);
5576 remove_directory = true;
5577 } else {
5578 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5579 if (r < 0)
5580 goto finish;
5581
5582 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5583 if (r == -EBUSY) {
5584 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5585 goto finish;
5586 }
5587 if (r < 0) {
5588 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5589 goto finish;
5590 }
5591
5592 if (arg_template) {
5593 r = chase_and_update(&arg_template, 0);
5594 if (r < 0)
5595 goto finish;
5596
5597 {
5598 BLOCK_SIGNALS(SIGINT);
5599 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5600 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5601 BTRFS_SNAPSHOT_FALLBACK_COPY |
5602 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5603 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5604 BTRFS_SNAPSHOT_RECURSIVE |
5605 BTRFS_SNAPSHOT_QUOTA |
5606 BTRFS_SNAPSHOT_SIGINT);
5607 }
5608 if (r == -EEXIST)
5609 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5610 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5611 else if (r == -EINTR) {
5612 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5613 goto finish;
5614 } else if (r < 0) {
5615 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5616 goto finish;
5617 } else
5618 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5619 "Populated %s from template %s.", arg_directory, arg_template);
5620 }
5621 }
5622
5623 if (arg_start_mode == START_BOOT) {
5624 _cleanup_free_ char *b = NULL;
5625 const char *p;
5626
5627 if (arg_pivot_root_new) {
5628 b = path_join(arg_directory, arg_pivot_root_new);
5629 if (!b)
5630 return log_oom();
5631
5632 p = b;
5633 } else
5634 p = arg_directory;
5635
5636 if (path_is_os_tree(p) <= 0) {
5637 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5638 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5639 goto finish;
5640 }
5641 } else {
5642 _cleanup_free_ char *p = NULL;
5643
5644 if (arg_pivot_root_new)
5645 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5646 else
5647 p = path_join(arg_directory, "/usr/");
5648 if (!p)
5649 return log_oom();
5650
5651 if (laccess(p, F_OK) < 0) {
5652 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5653 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5654 goto finish;
5655 }
5656 }
5657
5658 } else {
5659 DissectImageFlags dissect_image_flags =
5660 DISSECT_IMAGE_GENERIC_ROOT |
5661 DISSECT_IMAGE_REQUIRE_ROOT |
5662 DISSECT_IMAGE_RELAX_VAR_CHECK |
5663 DISSECT_IMAGE_USR_NO_ROOT |
5664 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5665 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
5666 assert(arg_image);
5667 assert(!arg_template);
5668
5669 r = chase_and_update(&arg_image, 0);
5670 if (r < 0)
5671 goto finish;
5672
5673 if (arg_ephemeral) {
5674 _cleanup_free_ char *np = NULL;
5675
5676 r = tempfn_random(arg_image, "machine.", &np);
5677 if (r < 0) {
5678 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5679 goto finish;
5680 }
5681
5682 /* Always take an exclusive lock on our own ephemeral copy. */
5683 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5684 if (r < 0) {
5685 r = log_error_errno(r, "Failed to create image lock: %m");
5686 goto finish;
5687 }
5688
5689 {
5690 BLOCK_SIGNALS(SIGINT);
5691 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5692 FS_NOCOW_FL, FS_NOCOW_FL,
5693 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5694 NULL, NULL);
5695 }
5696 if (r == -EINTR) {
5697 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5698 goto finish;
5699 }
5700 if (r < 0) {
5701 r = log_error_errno(r, "Failed to copy image file: %m");
5702 goto finish;
5703 }
5704
5705 free_and_replace(arg_image, np);
5706 remove_image = true;
5707 } else {
5708 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5709 if (r == -EBUSY) {
5710 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5711 goto finish;
5712 }
5713 if (r < 0) {
5714 r = log_error_errno(r, "Failed to create image lock: %m");
5715 goto finish;
5716 }
5717
5718 r = verity_settings_load(
5719 &arg_verity_settings,
5720 arg_image, NULL, NULL);
5721 if (r < 0) {
5722 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5723 goto finish;
5724 }
5725
5726 if (arg_verity_settings.data_path)
5727 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5728 }
5729
5730 if (!mkdtemp(tmprootdir)) {
5731 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5732 goto finish;
5733 }
5734
5735 remove_tmprootdir = true;
5736
5737 arg_directory = strdup(tmprootdir);
5738 if (!arg_directory) {
5739 r = log_oom();
5740 goto finish;
5741 }
5742
5743 r = loop_device_make_by_path(
5744 arg_image,
5745 arg_read_only ? O_RDONLY : O_RDWR,
5746 /* sector_size= */ UINT32_MAX,
5747 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5748 LOCK_SH,
5749 &loop);
5750 if (r < 0) {
5751 log_error_errno(r, "Failed to set up loopback block device: %m");
5752 goto finish;
5753 }
5754
5755 r = dissect_loop_device_and_warn(
5756 loop,
5757 &arg_verity_settings,
5758 NULL,
5759 dissect_image_flags,
5760 &dissected_image);
5761 if (r == -ENOPKG) {
5762 /* dissect_loop_device_and_warn() already printed a brief error message. Expand on that with more details. */
5763 log_notice("Note that the disk image needs to\n"
5764 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5765 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5766 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5767 " d) or contain a file system without a partition table\n"
5768 "in order to be bootable with systemd-nspawn.");
5769 goto finish;
5770 }
5771 if (r < 0)
5772 goto finish;
5773
5774 r = dissected_image_load_verity_sig_partition(
5775 dissected_image,
5776 loop->fd,
5777 &arg_verity_settings);
5778 if (r < 0)
5779 goto finish;
5780
5781 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5782 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5783 "root hash signature found! Proceeding without integrity checking.", arg_image);
5784
5785 r = dissected_image_decrypt_interactively(
5786 dissected_image,
5787 NULL,
5788 &arg_verity_settings,
5789 0);
5790 if (r < 0)
5791 goto finish;
5792
5793 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5794 if (remove_image && unlink(arg_image) >= 0)
5795 remove_image = false;
5796
5797 if (arg_architecture < 0)
5798 arg_architecture = dissected_image_architecture(dissected_image);
5799 }
5800
5801 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5802 if (r < 0)
5803 goto finish;
5804
5805 if (arg_console_mode < 0)
5806 arg_console_mode =
5807 isatty(STDIN_FILENO) > 0 &&
5808 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5809
5810 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5811 arg_quiet = true;
5812
5813 if (!arg_quiet)
5814 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
5815 arg_machine, arg_image ?: arg_directory);
5816
5817 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
5818
5819 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5820 r = log_error_errno(errno, "Failed to become subreaper: %m");
5821 goto finish;
5822 }
5823
5824 if (arg_expose_ports) {
5825 r = fw_ctx_new(&fw_ctx);
5826 if (r < 0) {
5827 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5828 goto finish;
5829 }
5830 expose_args.fw_ctx = fw_ctx;
5831 }
5832 for (;;) {
5833 r = run_container(dissected_image,
5834 fds,
5835 veth_name, &veth_created,
5836 &expose_args, &master,
5837 &pid, &ret);
5838 if (r <= 0)
5839 break;
5840 }
5841
5842 finish:
5843 (void) sd_notify(false,
5844 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5845 "STOPPING=1\nSTATUS=Terminating...");
5846
5847 if (pid > 0)
5848 (void) kill(pid, SIGKILL);
5849
5850 /* Try to flush whatever is still queued in the pty */
5851 if (master >= 0) {
5852 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5853 master = safe_close(master);
5854 }
5855
5856 if (pid > 0)
5857 (void) wait_for_terminate(pid, NULL);
5858
5859 pager_close();
5860
5861 if (remove_directory && arg_directory) {
5862 int k;
5863
5864 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5865 if (k < 0)
5866 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5867 }
5868
5869 if (remove_image && arg_image) {
5870 if (unlink(arg_image) < 0)
5871 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5872 }
5873
5874 if (remove_tmprootdir) {
5875 if (rmdir(tmprootdir) < 0)
5876 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5877 }
5878
5879 if (arg_machine) {
5880 const char *p;
5881
5882 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5883 (void) rm_rf(p, REMOVE_ROOT);
5884 }
5885
5886 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5887 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5888
5889 if (veth_created)
5890 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5891 (void) remove_bridge(arg_network_zone);
5892
5893 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5894 expose_port_free_all(arg_expose_ports);
5895 rlimit_free_all(arg_rlimit);
5896 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5897 credential_free_all(arg_credentials, arg_n_credentials);
5898
5899 if (r < 0)
5900 return r;
5901
5902 return ret;
5903 }
5904
5905 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); /* Program entry point built around run(), which returns its negative error code r on failure and otherwise the status stored in 'ret'. NOTE(review): the macro is presumably provided by main-func.h; confirm there how these values are mapped to the process exit code. */