]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tree-wide: "a" -> "an"
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "barrier.h"
28 #include "base-filesystem.h"
29 #include "blkid-util.h"
30 #include "btrfs-util.h"
31 #include "bus-error.h"
32 #include "bus-util.h"
33 #include "cap-list.h"
34 #include "capability-util.h"
35 #include "cgroup-util.h"
36 #include "copy.h"
37 #include "cpu-set-util.h"
38 #include "creds-util.h"
39 #include "dev-setup.h"
40 #include "discover-image.h"
41 #include "dissect-image.h"
42 #include "env-util.h"
43 #include "escape.h"
44 #include "fd-util.h"
45 #include "fdset.h"
46 #include "fileio.h"
47 #include "format-util.h"
48 #include "fs-util.h"
49 #include "gpt.h"
50 #include "hexdecoct.h"
51 #include "hostname-setup.h"
52 #include "hostname-util.h"
53 #include "id128-util.h"
54 #include "io-util.h"
55 #include "log.h"
56 #include "loop-util.h"
57 #include "loopback-setup.h"
58 #include "macro.h"
59 #include "main-func.h"
60 #include "missing_sched.h"
61 #include "mkdir.h"
62 #include "mount-util.h"
63 #include "mountpoint-util.h"
64 #include "namespace-util.h"
65 #include "netlink-util.h"
66 #include "nspawn-bind-user.h"
67 #include "nspawn-cgroup.h"
68 #include "nspawn-creds.h"
69 #include "nspawn-def.h"
70 #include "nspawn-expose-ports.h"
71 #include "nspawn-mount.h"
72 #include "nspawn-network.h"
73 #include "nspawn-oci.h"
74 #include "nspawn-patch-uid.h"
75 #include "nspawn-register.h"
76 #include "nspawn-seccomp.h"
77 #include "nspawn-settings.h"
78 #include "nspawn-setuid.h"
79 #include "nspawn-stub-pid1.h"
80 #include "nspawn.h"
81 #include "nulstr-util.h"
82 #include "os-util.h"
83 #include "pager.h"
84 #include "parse-argument.h"
85 #include "parse-util.h"
86 #include "path-util.h"
87 #include "pretty-print.h"
88 #include "process-util.h"
89 #include "ptyfwd.h"
90 #include "random-util.h"
91 #include "raw-clone.h"
92 #include "resolve-util.h"
93 #include "rlimit-util.h"
94 #include "rm-rf.h"
95 #if HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98 #include "selinux-util.h"
99 #include "signal-util.h"
100 #include "socket-util.h"
101 #include "stat-util.h"
102 #include "stdio-util.h"
103 #include "string-table.h"
104 #include "string-util.h"
105 #include "strv.h"
106 #include "sysctl-util.h"
107 #include "terminal-util.h"
108 #include "tmpfile-util.h"
109 #include "umask-util.h"
110 #include "unit-name.h"
111 #include "user-util.h"
112 #include "util.h"
113
/* Path of the notify socket inside the container, which the payload can use to talk to nspawn using the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"

/* NOTE(review): not referenced in this part of the file; going by the name this is the process exit status
 * that requests a forced restart — confirm at the use site. */
#define EXIT_FORCE_RESTART 133
118
/* Outcome of a container run. NOTE(review): the consumers of this type are outside this part of the file;
 * the meanings below are taken from the enumerator names only — confirm at the use site. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED, /* presumably: the container ended and is not to be started again */
        CONTAINER_REBOOTED,   /* presumably: the container asked for a reboot */
} ContainerStatus;
123
/* Run-time configuration of this nspawn invocation, together with the built-in defaults. The values are
 * overwritten by parse_environment() and parse_argv() below; arg_settings_mask records which settings
 * were set explicitly that way. */

/* Root directory / working directory / identity of the payload */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;

/* Capability bit mask, one bit per CAP_* number. NOTE(review): presumably this is the "default set" that
 * the --capability= and --drop-capability= help texts refer to — confirm in parse_argv(). */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static uint64_t arg_caps_ambient = 0;
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;

/* Mounts, environment, registration */
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;

/* Networking */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL;
static char *arg_network_namespace_path = NULL;

static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;

/* User namespacing; the default range of 0x10000 is 65536 IDs */
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;

static int arg_kill_signal = 0;
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn"; /* overridable via $SYSTEMD_NSPAWN_CONTAINER_SERVICE */
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;

/* System call filtering and resource limits */
static char **arg_syscall_allow_list = NULL;
static char **arg_syscall_deny_list = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};

/* Integration with the host, console, credentials */
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; /* selected by handle_arg_console() */
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
231
/* Pair each of the settings above that holds a resource with the function that releases it.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * functions are invoked when the program exits — confirm against the macro's definition.
 *
 * Some pointer-typed settings (e.g. arg_custom_mounts, arg_expose_ports, arg_extra_nodes,
 * arg_credentials) are not listed here; whether they are released elsewhere is not visible in this part
 * of the file. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
264
265 static int handle_arg_console(const char *arg) {
266 if (streq(arg, "help")) {
267 puts("autopipe\n"
268 "interactive\n"
269 "passive\n"
270 "pipe\n"
271 "read-only");
272 return 0;
273 }
274
275 if (streq(arg, "interactive"))
276 arg_console_mode = CONSOLE_INTERACTIVE;
277 else if (streq(arg, "read-only"))
278 arg_console_mode = CONSOLE_READ_ONLY;
279 else if (streq(arg, "passive"))
280 arg_console_mode = CONSOLE_PASSIVE;
281 else if (streq(arg, "pipe")) {
282 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
283 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
284 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
285 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
286 "Proceeding anyway.");
287
288 arg_console_mode = CONSOLE_PIPE;
289 } else if (streq(arg, "autopipe")) {
290 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
291 arg_console_mode = CONSOLE_INTERACTIVE;
292 else
293 arg_console_mode = CONSOLE_PIPE;
294 } else
295 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
296
297 arg_settings_mask |= SETTING_CONSOLE_MODE;
298 return 1;
299 }
300
301 static int help(void) {
302 _cleanup_free_ char *link = NULL;
303 int r;
304
305 (void) pager_open(arg_pager_flags);
306
307 r = terminal_urlify_man("systemd-nspawn", "1", &link);
308 if (r < 0)
309 return log_oom();
310
311 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
312 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
313 " -h --help Show this help\n"
314 " --version Print version string\n"
315 " -q --quiet Do not show status information\n"
316 " --no-pager Do not pipe output into a pager\n"
317 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
318 "%3$sImage:%4$s\n"
319 " -D --directory=PATH Root directory for the container\n"
320 " --template=PATH Initialize root directory from template directory,\n"
321 " if missing\n"
322 " -x --ephemeral Run container with snapshot of root directory, and\n"
323 " remove it after exit\n"
324 " -i --image=PATH Root file system disk image (or device node) for\n"
325 " the container\n"
326 " --oci-bundle=PATH OCI bundle directory\n"
327 " --read-only Mount the root directory read-only\n"
328 " --volatile[=MODE] Run the system in volatile mode\n"
329 " --root-hash=HASH Specify verity root hash for root disk image\n"
330 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
331 " as a DER encoded PKCS7, either as a path to a file\n"
332 " or as an ASCII base64 encoded string prefixed by\n"
333 " 'base64:'\n"
334 " --verity-data=PATH Specify hash device for verity\n"
335 " --pivot-root=PATH[:PATH]\n"
336 " Pivot root to given directory in the container\n\n"
337 "%3$sExecution:%4$s\n"
338 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
339 " -b --boot Boot up full system (i.e. invoke init)\n"
340 " --chdir=PATH Set working directory in the container\n"
341 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
342 " -u --user=USER Run the command under specified user or UID\n"
343 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
344 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
345 "%3$sSystem Identity:%4$s\n"
346 " -M --machine=NAME Set the machine name for the container\n"
347 " --hostname=NAME Override the hostname for the container\n"
348 " --uuid=UUID Set a specific machine UUID for the container\n\n"
349 "%3$sProperties:%4$s\n"
350 " -S --slice=SLICE Place the container in the specified slice\n"
351 " --property=NAME=VALUE Set scope unit property\n"
352 " --register=BOOLEAN Register container as machine\n"
353 " --keep-unit Do not register a scope for the machine, reuse\n"
354 " the service unit nspawn is running in\n\n"
355 "%3$sUser Namespacing:%4$s\n"
356 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
357 " --private-users[=UIDBASE[:NUIDS]]\n"
358 " Similar, but with user configured UID/GID range\n"
359 " --private-users-ownership=MODE\n"
360 " Adjust ('chown') or map ('map') OS tree ownership\n"
361 " to private UID/GID range\n\n"
362 "%3$sNetworking:%4$s\n"
363 " --private-network Disable network in container\n"
364 " --network-interface=INTERFACE\n"
365 " Assign an existing network interface to the\n"
366 " container\n"
367 " --network-macvlan=INTERFACE\n"
368 " Create a macvlan network interface based on an\n"
369 " existing network interface to the container\n"
370 " --network-ipvlan=INTERFACE\n"
371 " Create an ipvlan network interface based on an\n"
372 " existing network interface to the container\n"
373 " -n --network-veth Add a virtual Ethernet connection between host\n"
374 " and container\n"
375 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
376 " Add an additional virtual Ethernet link between\n"
377 " host and container\n"
378 " --network-bridge=INTERFACE\n"
379 " Add a virtual Ethernet connection to the container\n"
380 " and attach it to an existing bridge on the host\n"
381 " --network-zone=NAME Similar, but attach the new interface to an\n"
382 " an automatically managed bridge interface\n"
383 " --network-namespace-path=PATH\n"
384 " Set network namespace to the one represented by\n"
385 " the specified kernel namespace file node\n"
386 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
387 " Expose a container IP port on the host\n\n"
388 "%3$sSecurity:%4$s\n"
389 " --capability=CAP In addition to the default, retain specified\n"
390 " capability\n"
391 " --drop-capability=CAP Drop the specified capability from the default set\n"
392 " --ambient-capability=CAP\n"
393 " Sets the specified capability for the started\n"
394 " process. Not useful if booting a machine.\n"
395 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
396 " --system-call-filter=LIST|~LIST\n"
397 " Permit/prohibit specific system calls\n"
398 " -Z --selinux-context=SECLABEL\n"
399 " Set the SELinux security context to be used by\n"
400 " processes in the container\n"
401 " -L --selinux-apifs-context=SECLABEL\n"
402 " Set the SELinux security context to be used by\n"
403 " API/tmpfs file systems in the container\n\n"
404 "%3$sResources:%4$s\n"
405 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
406 " --oom-score-adjust=VALUE\n"
407 " Adjust the OOM score value for the payload\n"
408 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
409 " --personality=ARCH Pick personality for this container\n\n"
410 "%3$sIntegration:%4$s\n"
411 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
412 " --timezone=MODE Select mode of /etc/localtime initialization\n"
413 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
414 " host, try-guest, try-host\n"
415 " -j Equivalent to --link-journal=try-guest\n\n"
416 "%3$sMounts:%4$s\n"
417 " --bind=PATH[:PATH[:OPTIONS]]\n"
418 " Bind mount a file or directory from the host into\n"
419 " the container\n"
420 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
421 " Similar, but creates a read-only bind mount\n"
422 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
423 " it\n"
424 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
425 " --overlay=PATH[:PATH...]:PATH\n"
426 " Create an overlay mount from the host to \n"
427 " the container\n"
428 " --overlay-ro=PATH[:PATH...]:PATH\n"
429 " Similar, but creates a read-only overlay mount\n"
430 " --bind-user=NAME Bind user from host to container\n\n"
431 "%3$sInput/Output:%4$s\n"
432 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
433 " set up for the container.\n"
434 " -P --pipe Equivalent to --console=pipe\n\n"
435 "%3$sCredentials:%4$s\n"
436 " --set-credential=ID:VALUE\n"
437 " Pass a credential with literal value to container.\n"
438 " --load-credential=ID:PATH\n"
439 " Load credential to pass to container from file or\n"
440 " AF_UNIX stream socket.\n"
441 "\nSee the %2$s for details.\n",
442 program_invocation_short_name,
443 link,
444 ansi_underline(),
445 ansi_normal(),
446 ansi_highlight(),
447 ansi_normal());
448
449 return 0;
450 }
451
452 static int custom_mount_check_all(void) {
453 size_t i;
454
455 for (i = 0; i < arg_n_custom_mounts; i++) {
456 CustomMount *m = &arg_custom_mounts[i];
457
458 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
459 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
460 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
461 "--private-users-ownership=own may not be combined with custom root mounts.");
462 if (arg_uid_shift == UID_INVALID)
463 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
464 "--private-users with automatic UID shift may not be combined with custom root mounts.");
465 }
466 }
467
468 return 0;
469 }
470
471 static int detect_unified_cgroup_hierarchy_from_environment(void) {
472 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
473 int r;
474
475 /* Allow the user to control whether the unified hierarchy is used */
476
477 e = getenv(var);
478 if (!e) {
479 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
480 var = "UNIFIED_CGROUP_HIERARCHY";
481 e = getenv(var);
482 }
483
484 if (!isempty(e)) {
485 r = parse_boolean(e);
486 if (r < 0)
487 return log_error_errno(r, "Failed to parse $%s: %m", var);
488 if (r > 0)
489 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
490 else
491 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
492 }
493
494 return 0;
495 }
496
497 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
498 int r;
499
500 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
501 * in the image actually supports. */
502 r = cg_all_unified();
503 if (r < 0)
504 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
505 if (r > 0) {
506 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
507 * routine only detects 231, so we'll have a false negative here for 230. */
508 r = systemd_installation_has_version(directory, 230);
509 if (r < 0)
510 return log_error_errno(r, "Failed to determine systemd version in container: %m");
511 if (r > 0)
512 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
513 else
514 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
515 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
516 /* Mixed cgroup hierarchy support was added in 233 */
517 r = systemd_installation_has_version(directory, 233);
518 if (r < 0)
519 return log_error_errno(r, "Failed to determine systemd version in container: %m");
520 if (r > 0)
521 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
522 else
523 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
524 } else
525 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
526
527 log_debug("Using %s hierarchy for container.",
528 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
529 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
530
531 return 0;
532 }
533
534 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
535 uint64_t mask = 0;
536 int r;
537
538 for (;;) {
539 _cleanup_free_ char *t = NULL;
540
541 r = extract_first_word(&spec, &t, ",", 0);
542 if (r < 0)
543 return log_error_errno(r, "Failed to parse capability %s.", t);
544 if (r == 0)
545 break;
546
547 if (streq(t, "help")) {
548 for (int i = 0; i < capability_list_length(); i++) {
549 const char *name;
550
551 name = capability_to_name(i);
552 if (name)
553 puts(name);
554 }
555
556 return 0; /* quit */
557 }
558
559 if (streq(t, "all"))
560 mask = UINT64_MAX;
561 else {
562 r = capability_from_name(t);
563 if (r < 0)
564 return log_error_errno(r, "Failed to parse capability %s.", t);
565
566 mask |= 1ULL << r;
567 }
568 }
569
570 *ret_mask = mask;
571 return 1; /* continue */
572 }
573
574 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
575 int r;
576
577 r = getenv_bool(name);
578 if (r == -ENXIO)
579 return 0;
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse $%s: %m", name);
582
583 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
584 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
585 return 0;
586 }
587
588 static int parse_mount_settings_env(void) {
589 const char *e;
590 int r;
591
592 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
593 if (r < 0 && r != -ENXIO)
594 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
595 if (r >= 0)
596 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
597
598 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
599 if (streq_ptr(e, "network"))
600 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
601
602 else if (e) {
603 r = parse_boolean(e);
604 if (r < 0)
605 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
606
607 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
608 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
609 }
610
611 return 0;
612 }
613
614 static int parse_environment(void) {
615 const char *e;
616 int r;
617
618 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
619 if (r < 0)
620 return r;
621 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
622 if (r < 0)
623 return r;
624 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
625 if (r < 0)
626 return r;
627 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
628 if (r < 0)
629 return r;
630
631 r = parse_mount_settings_env();
632 if (r < 0)
633 return r;
634
635 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
636 * even if it is supported. If not supported, it has no effect. */
637 if (!cg_ns_supported())
638 arg_use_cgns = false;
639 else {
640 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
641 if (r < 0) {
642 if (r != -ENXIO)
643 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
644
645 arg_use_cgns = true;
646 } else {
647 arg_use_cgns = r > 0;
648 arg_settings_mask |= SETTING_USE_CGNS;
649 }
650 }
651
652 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
653 if (e)
654 arg_container_service_name = e;
655
656 return detect_unified_cgroup_hierarchy_from_environment();
657 }
658
659 static int parse_argv(int argc, char *argv[]) {
660 enum {
661 ARG_VERSION = 0x100,
662 ARG_PRIVATE_NETWORK,
663 ARG_UUID,
664 ARG_READ_ONLY,
665 ARG_CAPABILITY,
666 ARG_AMBIENT_CAPABILITY,
667 ARG_DROP_CAPABILITY,
668 ARG_LINK_JOURNAL,
669 ARG_BIND,
670 ARG_BIND_RO,
671 ARG_TMPFS,
672 ARG_OVERLAY,
673 ARG_OVERLAY_RO,
674 ARG_INACCESSIBLE,
675 ARG_SHARE_SYSTEM,
676 ARG_REGISTER,
677 ARG_KEEP_UNIT,
678 ARG_NETWORK_INTERFACE,
679 ARG_NETWORK_MACVLAN,
680 ARG_NETWORK_IPVLAN,
681 ARG_NETWORK_BRIDGE,
682 ARG_NETWORK_ZONE,
683 ARG_NETWORK_VETH_EXTRA,
684 ARG_NETWORK_NAMESPACE_PATH,
685 ARG_PERSONALITY,
686 ARG_VOLATILE,
687 ARG_TEMPLATE,
688 ARG_PROPERTY,
689 ARG_PRIVATE_USERS,
690 ARG_KILL_SIGNAL,
691 ARG_SETTINGS,
692 ARG_CHDIR,
693 ARG_PIVOT_ROOT,
694 ARG_PRIVATE_USERS_CHOWN,
695 ARG_PRIVATE_USERS_OWNERSHIP,
696 ARG_NOTIFY_READY,
697 ARG_ROOT_HASH,
698 ARG_ROOT_HASH_SIG,
699 ARG_VERITY_DATA,
700 ARG_SYSTEM_CALL_FILTER,
701 ARG_RLIMIT,
702 ARG_HOSTNAME,
703 ARG_NO_NEW_PRIVILEGES,
704 ARG_OOM_SCORE_ADJUST,
705 ARG_CPU_AFFINITY,
706 ARG_RESOLV_CONF,
707 ARG_TIMEZONE,
708 ARG_CONSOLE,
709 ARG_PIPE,
710 ARG_OCI_BUNDLE,
711 ARG_NO_PAGER,
712 ARG_SET_CREDENTIAL,
713 ARG_LOAD_CREDENTIAL,
714 ARG_BIND_USER,
715 };
716
717 static const struct option options[] = {
718 { "help", no_argument, NULL, 'h' },
719 { "version", no_argument, NULL, ARG_VERSION },
720 { "directory", required_argument, NULL, 'D' },
721 { "template", required_argument, NULL, ARG_TEMPLATE },
722 { "ephemeral", no_argument, NULL, 'x' },
723 { "user", required_argument, NULL, 'u' },
724 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
725 { "as-pid2", no_argument, NULL, 'a' },
726 { "boot", no_argument, NULL, 'b' },
727 { "uuid", required_argument, NULL, ARG_UUID },
728 { "read-only", no_argument, NULL, ARG_READ_ONLY },
729 { "capability", required_argument, NULL, ARG_CAPABILITY },
730 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
731 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
732 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
733 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
734 { "bind", required_argument, NULL, ARG_BIND },
735 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
736 { "tmpfs", required_argument, NULL, ARG_TMPFS },
737 { "overlay", required_argument, NULL, ARG_OVERLAY },
738 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
739 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
740 { "machine", required_argument, NULL, 'M' },
741 { "hostname", required_argument, NULL, ARG_HOSTNAME },
742 { "slice", required_argument, NULL, 'S' },
743 { "setenv", required_argument, NULL, 'E' },
744 { "selinux-context", required_argument, NULL, 'Z' },
745 { "selinux-apifs-context", required_argument, NULL, 'L' },
746 { "quiet", no_argument, NULL, 'q' },
747 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
748 { "register", required_argument, NULL, ARG_REGISTER },
749 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
750 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
751 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
752 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
753 { "network-veth", no_argument, NULL, 'n' },
754 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
755 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
756 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
757 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
758 { "personality", required_argument, NULL, ARG_PERSONALITY },
759 { "image", required_argument, NULL, 'i' },
760 { "volatile", optional_argument, NULL, ARG_VOLATILE },
761 { "port", required_argument, NULL, 'p' },
762 { "property", required_argument, NULL, ARG_PROPERTY },
763 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
764 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
765 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
766 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
767 { "settings", required_argument, NULL, ARG_SETTINGS },
768 { "chdir", required_argument, NULL, ARG_CHDIR },
769 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
770 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
771 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
772 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
773 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
774 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
775 { "rlimit", required_argument, NULL, ARG_RLIMIT },
776 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
777 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
778 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
779 { "timezone", required_argument, NULL, ARG_TIMEZONE },
780 { "console", required_argument, NULL, ARG_CONSOLE },
781 { "pipe", no_argument, NULL, ARG_PIPE },
782 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
783 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
784 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
785 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
786 { "bind-user", required_argument, NULL, ARG_BIND_USER },
787 {}
788 };
789
790 int c, r;
791 uint64_t plus = 0, minus = 0;
792 bool mask_all_settings = false, mask_no_settings = false;
793
794 assert(argc >= 0);
795 assert(argv);
796
797 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
798 switch (c) {
799
800 case 'h':
801 return help();
802
803 case ARG_VERSION:
804 return version();
805
806 case 'D':
807 r = parse_path_argument(optarg, false, &arg_directory);
808 if (r < 0)
809 return r;
810
811 arg_settings_mask |= SETTING_DIRECTORY;
812 break;
813
814 case ARG_TEMPLATE:
815 r = parse_path_argument(optarg, false, &arg_template);
816 if (r < 0)
817 return r;
818
819 arg_settings_mask |= SETTING_DIRECTORY;
820 break;
821
822 case 'i':
823 r = parse_path_argument(optarg, false, &arg_image);
824 if (r < 0)
825 return r;
826
827 arg_settings_mask |= SETTING_DIRECTORY;
828 break;
829
830 case ARG_OCI_BUNDLE:
831 r = parse_path_argument(optarg, false, &arg_oci_bundle);
832 if (r < 0)
833 return r;
834
835 break;
836
837 case 'x':
838 arg_ephemeral = true;
839 arg_settings_mask |= SETTING_EPHEMERAL;
840 break;
841
842 case 'u':
843 r = free_and_strdup(&arg_user, optarg);
844 if (r < 0)
845 return log_oom();
846
847 arg_settings_mask |= SETTING_USER;
848 break;
849
850 case ARG_NETWORK_ZONE: {
851 char *j;
852
853 j = strjoin("vz-", optarg);
854 if (!j)
855 return log_oom();
856
857 if (!ifname_valid(j)) {
858 log_error("Network zone name not valid: %s", j);
859 free(j);
860 return -EINVAL;
861 }
862
863 free_and_replace(arg_network_zone, j);
864
865 arg_network_veth = true;
866 arg_private_network = true;
867 arg_settings_mask |= SETTING_NETWORK;
868 break;
869 }
870
871 case ARG_NETWORK_BRIDGE:
872
873 if (!ifname_valid(optarg))
874 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
875 "Bridge interface name not valid: %s", optarg);
876
877 r = free_and_strdup(&arg_network_bridge, optarg);
878 if (r < 0)
879 return log_oom();
880
881 _fallthrough_;
882 case 'n':
883 arg_network_veth = true;
884 arg_private_network = true;
885 arg_settings_mask |= SETTING_NETWORK;
886 break;
887
888 case ARG_NETWORK_VETH_EXTRA:
889 r = veth_extra_parse(&arg_network_veth_extra, optarg);
890 if (r < 0)
891 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
892
893 arg_private_network = true;
894 arg_settings_mask |= SETTING_NETWORK;
895 break;
896
897 case ARG_NETWORK_INTERFACE:
898 if (!ifname_valid(optarg))
899 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
900 "Network interface name not valid: %s", optarg);
901
902 r = test_network_interface_initialized(optarg);
903 if (r < 0)
904 return r;
905
906 if (strv_extend(&arg_network_interfaces, optarg) < 0)
907 return log_oom();
908
909 arg_private_network = true;
910 arg_settings_mask |= SETTING_NETWORK;
911 break;
912
913 case ARG_NETWORK_MACVLAN:
914
915 if (!ifname_valid(optarg))
916 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
917 "MACVLAN network interface name not valid: %s", optarg);
918
919 r = test_network_interface_initialized(optarg);
920 if (r < 0)
921 return r;
922
923 if (strv_extend(&arg_network_macvlan, optarg) < 0)
924 return log_oom();
925
926 arg_private_network = true;
927 arg_settings_mask |= SETTING_NETWORK;
928 break;
929
930 case ARG_NETWORK_IPVLAN:
931
932 if (!ifname_valid(optarg))
933 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
934 "IPVLAN network interface name not valid: %s", optarg);
935
936 r = test_network_interface_initialized(optarg);
937 if (r < 0)
938 return r;
939
940 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
941 return log_oom();
942
943 _fallthrough_;
944 case ARG_PRIVATE_NETWORK:
945 arg_private_network = true;
946 arg_settings_mask |= SETTING_NETWORK;
947 break;
948
949 case ARG_NETWORK_NAMESPACE_PATH:
950 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
951 if (r < 0)
952 return r;
953
954 arg_settings_mask |= SETTING_NETWORK;
955 break;
956
957 case 'b':
958 if (arg_start_mode == START_PID2)
959 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
960 "--boot and --as-pid2 may not be combined.");
961
962 arg_start_mode = START_BOOT;
963 arg_settings_mask |= SETTING_START_MODE;
964 break;
965
966 case 'a':
967 if (arg_start_mode == START_BOOT)
968 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
969 "--boot and --as-pid2 may not be combined.");
970
971 arg_start_mode = START_PID2;
972 arg_settings_mask |= SETTING_START_MODE;
973 break;
974
975 case ARG_UUID:
976 r = sd_id128_from_string(optarg, &arg_uuid);
977 if (r < 0)
978 return log_error_errno(r, "Invalid UUID: %s", optarg);
979
980 if (sd_id128_is_null(arg_uuid))
981 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
982 "Machine UUID may not be all zeroes.");
983
984 arg_settings_mask |= SETTING_MACHINE_ID;
985 break;
986
987 case 'S': {
988 _cleanup_free_ char *mangled = NULL;
989
990 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
991 if (r < 0)
992 return log_oom();
993
994 free_and_replace(arg_slice, mangled);
995 arg_settings_mask |= SETTING_SLICE;
996 break;
997 }
998
999 case 'M':
1000 if (isempty(optarg))
1001 arg_machine = mfree(arg_machine);
1002 else {
1003 if (!hostname_is_valid(optarg, 0))
1004 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1005 "Invalid machine name: %s", optarg);
1006
1007 r = free_and_strdup(&arg_machine, optarg);
1008 if (r < 0)
1009 return log_oom();
1010 }
1011 break;
1012
1013 case ARG_HOSTNAME:
1014 if (isempty(optarg))
1015 arg_hostname = mfree(arg_hostname);
1016 else {
1017 if (!hostname_is_valid(optarg, 0))
1018 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019 "Invalid hostname: %s", optarg);
1020
1021 r = free_and_strdup(&arg_hostname, optarg);
1022 if (r < 0)
1023 return log_oom();
1024 }
1025
1026 arg_settings_mask |= SETTING_HOSTNAME;
1027 break;
1028
1029 case 'Z':
1030 arg_selinux_context = optarg;
1031 break;
1032
1033 case 'L':
1034 arg_selinux_apifs_context = optarg;
1035 break;
1036
1037 case ARG_READ_ONLY:
1038 arg_read_only = true;
1039 arg_settings_mask |= SETTING_READ_ONLY;
1040 break;
1041
1042 case ARG_AMBIENT_CAPABILITY: {
1043 uint64_t m;
1044 r = parse_capability_spec(optarg, &m);
1045 if (r <= 0)
1046 return r;
1047 arg_caps_ambient |= m;
1048 arg_settings_mask |= SETTING_CAPABILITY;
1049 break;
1050 }
1051 case ARG_CAPABILITY:
1052 case ARG_DROP_CAPABILITY: {
1053 uint64_t m;
1054 r = parse_capability_spec(optarg, &m);
1055 if (r <= 0)
1056 return r;
1057
1058 if (c == ARG_CAPABILITY)
1059 plus |= m;
1060 else
1061 minus |= m;
1062 arg_settings_mask |= SETTING_CAPABILITY;
1063 break;
1064 }
1065 case ARG_NO_NEW_PRIVILEGES:
1066 r = parse_boolean(optarg);
1067 if (r < 0)
1068 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1069
1070 arg_no_new_privileges = r;
1071 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1072 break;
1073
1074 case 'j':
1075 arg_link_journal = LINK_GUEST;
1076 arg_link_journal_try = true;
1077 arg_settings_mask |= SETTING_LINK_JOURNAL;
1078 break;
1079
1080 case ARG_LINK_JOURNAL:
1081 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1082 if (r < 0)
1083 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1084
1085 arg_settings_mask |= SETTING_LINK_JOURNAL;
1086 break;
1087
1088 case ARG_BIND:
1089 case ARG_BIND_RO:
1090 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1091 if (r < 0)
1092 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1093
1094 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1095 break;
1096
1097 case ARG_TMPFS:
1098 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1099 if (r < 0)
1100 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1101
1102 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1103 break;
1104
1105 case ARG_OVERLAY:
1106 case ARG_OVERLAY_RO:
1107 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1108 if (r == -EADDRNOTAVAIL)
1109 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1112
1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1114 break;
1115
1116 case ARG_INACCESSIBLE:
1117 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1118 if (r < 0)
1119 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1120
1121 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1122 break;
1123
1124 case 'E': {
1125 if (!env_assignment_is_valid(optarg))
1126 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1127 "Environment variable assignment '%s' is not valid.", optarg);
1128 r = strv_env_replace_strdup(&arg_setenv, optarg);
1129 if (r < 0)
1130 return r;
1131
1132 arg_settings_mask |= SETTING_ENVIRONMENT;
1133 break;
1134 }
1135
1136 case 'q':
1137 arg_quiet = true;
1138 break;
1139
1140 case ARG_SHARE_SYSTEM:
1141 /* We don't officially support this anymore, except for compat reasons. People should use the
1142 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1143 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1144 arg_clone_ns_flags = 0;
1145 break;
1146
1147 case ARG_REGISTER:
1148 r = parse_boolean(optarg);
1149 if (r < 0) {
1150 log_error("Failed to parse --register= argument: %s", optarg);
1151 return r;
1152 }
1153
1154 arg_register = r;
1155 break;
1156
1157 case ARG_KEEP_UNIT:
1158 arg_keep_unit = true;
1159 break;
1160
1161 case ARG_PERSONALITY:
1162
1163 arg_personality = personality_from_string(optarg);
1164 if (arg_personality == PERSONALITY_INVALID)
1165 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1166 "Unknown or unsupported personality '%s'.", optarg);
1167
1168 arg_settings_mask |= SETTING_PERSONALITY;
1169 break;
1170
1171 case ARG_VOLATILE:
1172
1173 if (!optarg)
1174 arg_volatile_mode = VOLATILE_YES;
1175 else if (streq(optarg, "help")) {
1176 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1177 return 0;
1178 } else {
1179 VolatileMode m;
1180
1181 m = volatile_mode_from_string(optarg);
1182 if (m < 0)
1183 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1184 "Failed to parse --volatile= argument: %s", optarg);
1185 else
1186 arg_volatile_mode = m;
1187 }
1188
1189 arg_settings_mask |= SETTING_VOLATILE_MODE;
1190 break;
1191
1192 case 'p':
1193 r = expose_port_parse(&arg_expose_ports, optarg);
1194 if (r == -EEXIST)
1195 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1196 if (r < 0)
1197 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1198
1199 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1200 break;
1201
1202 case ARG_PROPERTY:
1203 if (strv_extend(&arg_property, optarg) < 0)
1204 return log_oom();
1205
1206 break;
1207
1208 case ARG_PRIVATE_USERS: {
1209 int boolean;
1210
1211 if (!optarg)
1212 boolean = true;
1213 else if (!in_charset(optarg, DIGITS))
1214 /* do *not* parse numbers as booleans */
1215 boolean = parse_boolean(optarg);
1216 else
1217 boolean = -1;
1218
1219 if (boolean == 0) {
1220 /* no: User namespacing off */
1221 arg_userns_mode = USER_NAMESPACE_NO;
1222 arg_uid_shift = UID_INVALID;
1223 arg_uid_range = UINT32_C(0x10000);
1224 } else if (boolean > 0) {
1225 /* yes: User namespacing on, UID range is read from root dir */
1226 arg_userns_mode = USER_NAMESPACE_FIXED;
1227 arg_uid_shift = UID_INVALID;
1228 arg_uid_range = UINT32_C(0x10000);
1229 } else if (streq(optarg, "pick")) {
1230 /* pick: User namespacing on, UID range is picked randomly */
1231 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1232 * implied by USER_NAMESPACE_PICK
1233 * further down. */
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
1236
1237 } else if (streq(optarg, "identity")) {
1238 /* identity: User namespaces on, the UID range 0…0xFFFF is mapped to
1239 * itself, i.e. we don't actually map anything, but do take benefit of
1240 * isolation of capability sets. */
1241 arg_userns_mode = USER_NAMESPACE_FIXED;
1242 arg_uid_shift = 0;
1243 arg_uid_range = UINT32_C(0x10000);
1244 } else {
1245 _cleanup_free_ char *buffer = NULL;
1246 const char *range, *shift;
1247
1248 /* anything else: User namespacing on, UID range is explicitly configured */
1249
1250 range = strchr(optarg, ':');
1251 if (range) {
1252 buffer = strndup(optarg, range - optarg);
1253 if (!buffer)
1254 return log_oom();
1255 shift = buffer;
1256
1257 range++;
1258 r = safe_atou32(range, &arg_uid_range);
1259 if (r < 0)
1260 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1261 } else
1262 shift = optarg;
1263
1264 r = parse_uid(shift, &arg_uid_shift);
1265 if (r < 0)
1266 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1267
1268 arg_userns_mode = USER_NAMESPACE_FIXED;
1269
1270 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1271 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1272 }
1273
1274 arg_settings_mask |= SETTING_USERNS;
1275 break;
1276 }
1277
1278 case 'U':
1279 if (userns_supported()) {
1280 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1281 * implied by USER_NAMESPACE_PICK
1282 * further down. */
1283 arg_uid_shift = UID_INVALID;
1284 arg_uid_range = UINT32_C(0x10000);
1285
1286 arg_settings_mask |= SETTING_USERNS;
1287 }
1288
1289 break;
1290
1291 case ARG_PRIVATE_USERS_CHOWN:
1292 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1293
1294 arg_settings_mask |= SETTING_USERNS;
1295 break;
1296
1297 case ARG_PRIVATE_USERS_OWNERSHIP:
1298 if (streq(optarg, "help")) {
1299 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1300 return 0;
1301 }
1302
1303 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1304 if (arg_userns_ownership < 0)
1305 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1306
1307 arg_settings_mask |= SETTING_USERNS;
1308 break;
1309
1310 case ARG_KILL_SIGNAL:
1311 if (streq(optarg, "help")) {
1312 DUMP_STRING_TABLE(signal, int, _NSIG);
1313 return 0;
1314 }
1315
1316 arg_kill_signal = signal_from_string(optarg);
1317 if (arg_kill_signal < 0)
1318 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1319
1320 arg_settings_mask |= SETTING_KILL_SIGNAL;
1321 break;
1322
1323 case ARG_SETTINGS:
1324
1325 /* no → do not read files
1326 * yes → read files, do not override cmdline, trust only subset
1327 * override → read files, override cmdline, trust only subset
1328 * trusted → read files, do not override cmdline, trust all
1329 */
1330
1331 r = parse_boolean(optarg);
1332 if (r < 0) {
1333 if (streq(optarg, "trusted")) {
1334 mask_all_settings = false;
1335 mask_no_settings = false;
1336 arg_settings_trusted = true;
1337
1338 } else if (streq(optarg, "override")) {
1339 mask_all_settings = false;
1340 mask_no_settings = true;
1341 arg_settings_trusted = -1;
1342 } else
1343 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1344 } else if (r > 0) {
1345 /* yes */
1346 mask_all_settings = false;
1347 mask_no_settings = false;
1348 arg_settings_trusted = -1;
1349 } else {
1350 /* no */
1351 mask_all_settings = true;
1352 mask_no_settings = false;
1353 arg_settings_trusted = false;
1354 }
1355
1356 break;
1357
1358 case ARG_CHDIR:
1359 if (!path_is_absolute(optarg))
1360 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1361 "Working directory %s is not an absolute path.", optarg);
1362
1363 r = free_and_strdup(&arg_chdir, optarg);
1364 if (r < 0)
1365 return log_oom();
1366
1367 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1368 break;
1369
1370 case ARG_PIVOT_ROOT:
1371 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1372 if (r < 0)
1373 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1374
1375 arg_settings_mask |= SETTING_PIVOT_ROOT;
1376 break;
1377
1378 case ARG_NOTIFY_READY:
1379 r = parse_boolean(optarg);
1380 if (r < 0)
1381 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1382 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1383 arg_notify_ready = r;
1384 arg_settings_mask |= SETTING_NOTIFY_READY;
1385 break;
1386
1387 case ARG_ROOT_HASH: {
1388 _cleanup_free_ void *k = NULL;
1389 size_t l;
1390
1391 r = unhexmem(optarg, strlen(optarg), &k, &l);
1392 if (r < 0)
1393 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1394 if (l < sizeof(sd_id128_t))
1395 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1396
1397 free_and_replace(arg_verity_settings.root_hash, k);
1398 arg_verity_settings.root_hash_size = l;
1399 break;
1400 }
1401
1402 case ARG_ROOT_HASH_SIG: {
1403 char *value;
1404 size_t l;
1405 void *p;
1406
1407 if ((value = startswith(optarg, "base64:"))) {
1408 r = unbase64mem(value, strlen(value), &p, &l);
1409 if (r < 0)
1410 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1411
1412 } else {
1413 r = read_full_file(optarg, (char**) &p, &l);
1414 if (r < 0)
1415 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1416 }
1417
1418 free_and_replace(arg_verity_settings.root_hash_sig, p);
1419 arg_verity_settings.root_hash_sig_size = l;
1420 break;
1421 }
1422
1423 case ARG_VERITY_DATA:
1424 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1425 if (r < 0)
1426 return r;
1427 break;
1428
1429 case ARG_SYSTEM_CALL_FILTER: {
1430 bool negative;
1431 const char *items;
1432
1433 negative = optarg[0] == '~';
1434 items = negative ? optarg + 1 : optarg;
1435
1436 for (;;) {
1437 _cleanup_free_ char *word = NULL;
1438
1439 r = extract_first_word(&items, &word, NULL, 0);
1440 if (r == 0)
1441 break;
1442 if (r == -ENOMEM)
1443 return log_oom();
1444 if (r < 0)
1445 return log_error_errno(r, "Failed to parse system call filter: %m");
1446
1447 if (negative)
1448 r = strv_extend(&arg_syscall_deny_list, word);
1449 else
1450 r = strv_extend(&arg_syscall_allow_list, word);
1451 if (r < 0)
1452 return log_oom();
1453 }
1454
1455 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1456 break;
1457 }
1458
1459 case ARG_RLIMIT: {
1460 const char *eq;
1461 _cleanup_free_ char *name = NULL;
1462 int rl;
1463
1464 if (streq(optarg, "help")) {
1465 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1466 return 0;
1467 }
1468
1469 eq = strchr(optarg, '=');
1470 if (!eq)
1471 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1472 "--rlimit= expects an '=' assignment.");
1473
1474 name = strndup(optarg, eq - optarg);
1475 if (!name)
1476 return log_oom();
1477
1478 rl = rlimit_from_string_harder(name);
1479 if (rl < 0)
1480 return log_error_errno(rl, "Unknown resource limit: %s", name);
1481
1482 if (!arg_rlimit[rl]) {
1483 arg_rlimit[rl] = new0(struct rlimit, 1);
1484 if (!arg_rlimit[rl])
1485 return log_oom();
1486 }
1487
1488 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1489 if (r < 0)
1490 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1491
1492 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1493 break;
1494 }
1495
1496 case ARG_OOM_SCORE_ADJUST:
1497 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1500
1501 arg_oom_score_adjust_set = true;
1502 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1503 break;
1504
1505 case ARG_CPU_AFFINITY: {
1506 CPUSet cpuset;
1507
1508 r = parse_cpu_set(optarg, &cpuset);
1509 if (r < 0)
1510 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1511
1512 cpu_set_reset(&arg_cpu_set);
1513 arg_cpu_set = cpuset;
1514 arg_settings_mask |= SETTING_CPU_AFFINITY;
1515 break;
1516 }
1517
1518 case ARG_RESOLV_CONF:
1519 if (streq(optarg, "help")) {
1520 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1521 return 0;
1522 }
1523
1524 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1525 if (arg_resolv_conf < 0)
1526 return log_error_errno(arg_resolv_conf,
1527 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1528
1529 arg_settings_mask |= SETTING_RESOLV_CONF;
1530 break;
1531
1532 case ARG_TIMEZONE:
1533 if (streq(optarg, "help")) {
1534 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1535 return 0;
1536 }
1537
1538 arg_timezone = timezone_mode_from_string(optarg);
1539 if (arg_timezone < 0)
1540 return log_error_errno(arg_timezone,
1541 "Failed to parse /etc/localtime mode: %s", optarg);
1542
1543 arg_settings_mask |= SETTING_TIMEZONE;
1544 break;
1545
1546 case ARG_CONSOLE:
1547 r = handle_arg_console(optarg);
1548 if (r <= 0)
1549 return r;
1550 break;
1551
1552 case 'P':
1553 case ARG_PIPE:
1554 r = handle_arg_console("pipe");
1555 if (r <= 0)
1556 return r;
1557 break;
1558
1559 case ARG_NO_PAGER:
1560 arg_pager_flags |= PAGER_DISABLE;
1561 break;
1562
1563 case ARG_SET_CREDENTIAL: {
1564 _cleanup_free_ char *word = NULL, *data = NULL;
1565 const char *p = optarg;
1566 Credential *a;
1567 size_t i;
1568 int l;
1569
1570 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1571 if (r == -ENOMEM)
1572 return log_oom();
1573 if (r < 0)
1574 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1575 if (r == 0 || !p)
1576 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1577
1578 if (!credential_name_valid(word))
1579 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1580
1581 for (i = 0; i < arg_n_credentials; i++)
1582 if (streq(arg_credentials[i].id, word))
1583 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1584
1585 l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1586 if (l < 0)
1587 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1588
1589 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1590 if (!a)
1591 return log_oom();
1592
1593 a[arg_n_credentials++] = (Credential) {
1594 .id = TAKE_PTR(word),
1595 .data = TAKE_PTR(data),
1596 .size = l,
1597 };
1598
1599 arg_credentials = a;
1600
1601 arg_settings_mask |= SETTING_CREDENTIALS;
1602 break;
1603 }
1604
1605 case ARG_LOAD_CREDENTIAL: {
1606 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1607 _cleanup_(erase_and_freep) char *data = NULL;
1608 _cleanup_free_ char *word = NULL, *j = NULL;
1609 const char *p = optarg;
1610 Credential *a;
1611 size_t size, i;
1612
1613 r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1614 if (r == -ENOMEM)
1615 return log_oom();
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1618 if (r == 0 || !p)
1619 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1620
1621 if (!credential_name_valid(word))
1622 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1623
1624 for (i = 0; i < arg_n_credentials; i++)
1625 if (streq(arg_credentials[i].id, word))
1626 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1627
1628 if (path_is_absolute(p))
1629 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1630 else {
1631 const char *e;
1632
1633 r = get_credentials_dir(&e);
1634 if (r < 0)
1635 return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1636
1637 j = path_join(e, p);
1638 if (!j)
1639 return log_oom();
1640 }
1641
1642 r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1643 flags,
1644 NULL,
1645 &data, &size);
1646 if (r < 0)
1647 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1648
1649 a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1650 if (!a)
1651 return log_oom();
1652
1653 a[arg_n_credentials++] = (Credential) {
1654 .id = TAKE_PTR(word),
1655 .data = TAKE_PTR(data),
1656 .size = size,
1657 };
1658
1659 arg_credentials = a;
1660
1661 arg_settings_mask |= SETTING_CREDENTIALS;
1662 break;
1663 }
1664
1665 case ARG_BIND_USER:
1666 if (!valid_user_group_name(optarg, 0))
1667 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1668
1669 if (strv_extend(&arg_bind_user, optarg) < 0)
1670 return log_oom();
1671
1672 arg_settings_mask |= SETTING_BIND_USER;
1673 break;
1674
1675 case '?':
1676 return -EINVAL;
1677
1678 default:
1679 assert_not_reached("Unhandled option");
1680 }
1681
1682 if (argc > optind) {
1683 strv_free(arg_parameters);
1684 arg_parameters = strv_copy(argv + optind);
1685 if (!arg_parameters)
1686 return log_oom();
1687
1688 arg_settings_mask |= SETTING_START_MODE;
1689 }
1690
1691 if (arg_ephemeral && arg_template && !arg_directory)
1692 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1693 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1694 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1695 * --directory=". */
1696 arg_directory = TAKE_PTR(arg_template);
1697
1698 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1699
1700 /* Make sure to parse environment before we reset the settings mask below */
1701 r = parse_environment();
1702 if (r < 0)
1703 return r;
1704
1705 /* Load all settings from .nspawn files */
1706 if (mask_no_settings)
1707 arg_settings_mask = 0;
1708
1709 /* Don't load any settings from .nspawn files */
1710 if (mask_all_settings)
1711 arg_settings_mask = _SETTINGS_MASK_ALL;
1712
1713 return 1;
1714 }
1715
1716 static int verify_arguments(void) {
1717 int r;
1718
1719 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1720 /* If we are running the stub init in the container, we don't need to look at what the init
1721 * in the container supports, because we are not using it. Let's immediately pick the right
1722 * setting based on the host system configuration.
1723 *
1724 * We only do this, if the user didn't use an environment variable to override the detection.
1725 */
1726
1727 r = cg_all_unified();
1728 if (r < 0)
1729 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1730 if (r > 0)
1731 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1732 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1733 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1734 else
1735 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1736 }
1737
1738 if (arg_userns_mode != USER_NAMESPACE_NO)
1739 arg_mount_settings |= MOUNT_USE_USERNS;
1740
1741 if (arg_private_network)
1742 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1743
1744 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1745 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1746 arg_register = false;
1747 if (arg_start_mode != START_PID1)
1748 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1749 }
1750
1751 if (arg_userns_ownership < 0)
1752 arg_userns_ownership =
1753 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1754 USER_NAMESPACE_OWNERSHIP_OFF;
1755
1756 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1757 arg_kill_signal = SIGRTMIN+3;
1758
1759 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1760 arg_read_only = true;
1761
1762 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1763 arg_read_only = true;
1764
1765 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1766 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1767 * The latter is not technically a user session, but we don't need to labour the point. */
1768 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1769
1770 if (arg_directory && arg_image)
1771 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1772
1773 if (arg_template && arg_image)
1774 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1775
1776 if (arg_template && !(arg_directory || arg_machine))
1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1778
1779 if (arg_ephemeral && arg_template)
1780 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1781
1782 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1784
1785 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1786 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1787
1788 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1789 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1790 "--read-only and --private-users-ownership=chown may not be combined.");
1791
1792 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1793 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1794 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1795 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1796 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1797
1798 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1799 * we need to error out, to avoid conflicts between different network options. */
1800 if (arg_network_namespace_path &&
1801 (arg_network_interfaces || arg_network_macvlan ||
1802 arg_network_ipvlan || arg_network_veth_extra ||
1803 arg_network_bridge || arg_network_zone ||
1804 arg_network_veth))
1805 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1806
1807 if (arg_network_bridge && arg_network_zone)
1808 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1809 "--network-bridge= and --network-zone= may not be combined.");
1810
1811 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1813
1814 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1815 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1816
1817 if (arg_expose_ports && !arg_private_network)
1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1819
1820 if (arg_caps_ambient) {
1821 if (arg_caps_ambient == UINT64_MAX)
1822 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1823
1824 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1826
1827 if (arg_start_mode == START_BOOT)
1828 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1829 }
1830
1831 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1833
1834 /* Drop duplicate --bind-user= entries */
1835 strv_uniq(arg_bind_user);
1836
1837 r = custom_mount_check_all();
1838 if (r < 0)
1839 return r;
1840
1841 return 0;
1842 }
1843
1844 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1845 assert(p);
1846
1847 if (arg_userns_mode == USER_NAMESPACE_NO)
1848 return 0;
1849
1850 if (uid == UID_INVALID && gid == GID_INVALID)
1851 return 0;
1852
1853 if (uid != UID_INVALID) {
1854 uid += arg_uid_shift;
1855
1856 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1857 return -EOVERFLOW;
1858 }
1859
1860 if (gid != GID_INVALID) {
1861 gid += (gid_t) arg_uid_shift;
1862
1863 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1864 return -EOVERFLOW;
1865 }
1866
1867 if (lchown(p, uid, gid) < 0)
1868 return -errno;
1869
1870 return 0;
1871 }
1872
/* Creates a directory with mode 'mode' at the location prefix_roota(root, path) yields, then passes
 * that location to userns_lchown() together with 'uid'/'gid'.
 *
 * If the directory already exists (-EEXIST), returns 0 and ownership is left alone. Any other negative
 * result of the mkdir step is returned unchanged; otherwise the result of userns_lchown() is returned. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k < 0)
                return k == -EEXIST ? 0 : k;

        return userns_lchown(full, uid, gid);
}
1886
/* Matches 'path' against the prefixes "../usr/share/zoneinfo/" and "/usr/share/zoneinfo/" by means of
 * PATH_STARTSWITH_SET() and returns whatever that macro yields. setup_timezone() treats a NULL result
 * as "does not point into the zoneinfo directory" and otherwise uses the result as the zone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1893
1894 static bool etc_writable(void) {
1895 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1896 }
1897
1898 static int setup_timezone(const char *dest) {
1899 _cleanup_free_ char *p = NULL, *etc = NULL;
1900 const char *where, *check;
1901 TimezoneMode m;
1902 int r;
1903
1904 assert(dest);
1905
1906 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1907 r = readlink_malloc("/etc/localtime", &p);
1908 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1909 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1910 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1911 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1912 else if (r < 0) {
1913 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1914 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1915 * file.
1916 *
1917 * Example:
1918 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1919 */
1920 return 0;
1921 } else if (arg_timezone == TIMEZONE_AUTO)
1922 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1923 else
1924 m = arg_timezone;
1925 } else
1926 m = arg_timezone;
1927
1928 if (m == TIMEZONE_OFF)
1929 return 0;
1930
1931 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1932 if (r < 0) {
1933 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1934 return 0;
1935 }
1936
1937 where = strjoina(etc, "/localtime");
1938
1939 switch (m) {
1940
1941 case TIMEZONE_DELETE:
1942 if (unlink(where) < 0)
1943 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1944
1945 return 0;
1946
1947 case TIMEZONE_SYMLINK: {
1948 _cleanup_free_ char *q = NULL;
1949 const char *z, *what;
1950
1951 z = timezone_from_path(p);
1952 if (!z) {
1953 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1954 return 0;
1955 }
1956
1957 r = readlink_malloc(where, &q);
1958 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1959 return 0; /* Already pointing to the right place? Then do nothing .. */
1960
1961 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1962 r = chase_symlinks(check, dest, 0, NULL, NULL);
1963 if (r < 0)
1964 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1965 else {
1966 if (unlink(where) < 0 && errno != ENOENT) {
1967 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1968 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1969 return 0;
1970 }
1971
1972 what = strjoina("../usr/share/zoneinfo/", z);
1973 if (symlink(what, where) < 0) {
1974 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1975 errno, "Failed to correct timezone of container, ignoring: %m");
1976 return 0;
1977 }
1978
1979 break;
1980 }
1981
1982 _fallthrough_;
1983 }
1984
1985 case TIMEZONE_BIND: {
1986 _cleanup_free_ char *resolved = NULL;
1987 int found;
1988
1989 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1990 if (found < 0) {
1991 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1992 return 0;
1993 }
1994
1995 if (found == 0) /* missing? */
1996 (void) touch(resolved);
1997
1998 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1999 if (r >= 0)
2000 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2001
2002 _fallthrough_;
2003 }
2004
2005 case TIMEZONE_COPY:
2006 /* If mounting failed, try to copy */
2007 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2008 if (r < 0) {
2009 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2010 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
2011 return 0;
2012 }
2013
2014 break;
2015
2016 default:
2017 assert_not_reached("unexpected mode");
2018 }
2019
2020 /* Fix permissions of the symlink or file copy we just created */
2021 r = userns_lchown(where, 0, 0);
2022 if (r < 0)
2023 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
2024
2025 return 0;
2026 }
2027
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing, and a negative errno-style value
 * (after logging at debug level) if the check itself failed for any other reason. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2040
2041 static int resolved_listening(void) {
2042 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2043 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2044 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2045 int r;
2046
2047 /* Check if resolved is listening */
2048
2049 r = sd_bus_open_system(&bus);
2050 if (r < 0)
2051 return log_debug_errno(r, "Failed to open system bus: %m");
2052
2053 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2054 if (r < 0)
2055 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2056 if (r == 0)
2057 return 0;
2058
2059 r = sd_bus_get_property_string(bus,
2060 "org.freedesktop.resolve1",
2061 "/org/freedesktop/resolve1",
2062 "org.freedesktop.resolve1.Manager",
2063 "DNSStubListener",
2064 &error,
2065 &dns_stub_listener_mode);
2066 if (r < 0)
2067 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2068
2069 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2070 }
2071
2072 static int setup_resolv_conf(const char *dest) {
2073 _cleanup_free_ char *etc = NULL;
2074 const char *where, *what;
2075 ResolvConfMode m;
2076 int r;
2077
2078 assert(dest);
2079
2080 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2081 if (arg_private_network)
2082 m = RESOLV_CONF_OFF;
2083 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2084 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2085 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2086 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2087 else
2088 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2089
2090 } else
2091 m = arg_resolv_conf;
2092
2093 if (m == RESOLV_CONF_OFF)
2094 return 0;
2095
2096 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2097 if (r < 0) {
2098 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2099 return 0;
2100 }
2101
2102 where = strjoina(etc, "/resolv.conf");
2103
2104 if (m == RESOLV_CONF_DELETE) {
2105 if (unlink(where) < 0)
2106 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2107
2108 return 0;
2109 }
2110
2111 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2112 what = PRIVATE_STATIC_RESOLV_CONF;
2113 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2114 what = PRIVATE_UPLINK_RESOLV_CONF;
2115 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2116 what = PRIVATE_STUB_RESOLV_CONF;
2117 else
2118 what = "/etc/resolv.conf";
2119
2120 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2121 _cleanup_free_ char *resolved = NULL;
2122 int found;
2123
2124 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2125 if (found < 0) {
2126 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2127 return 0;
2128 }
2129
2130 if (found == 0) /* missing? */
2131 (void) touch(resolved);
2132
2133 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2134 if (r >= 0)
2135 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2136
2137 /* If that didn't work, let's copy the file */
2138 }
2139
2140 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2141 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2142 else
2143 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
2144 if (r < 0) {
2145 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2146 * resolved or something similar runs inside and the symlink points there.
2147 *
2148 * If the disk image is read-only, there's also no point in complaining.
2149 */
2150 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2151 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2152 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2153 return 0;
2154 }
2155
2156 r = userns_lchown(where, 0, 0);
2157 if (r < 0)
2158 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2159
2160 return 0;
2161 }
2162
2163 static int setup_boot_id(void) {
2164 _cleanup_(unlink_and_freep) char *from = NULL;
2165 _cleanup_free_ char *path = NULL;
2166 sd_id128_t rnd = SD_ID128_NULL;
2167 const char *to;
2168 int r;
2169
2170 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2171
2172 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2175
2176 r = sd_id128_randomize(&rnd);
2177 if (r < 0)
2178 return log_error_errno(r, "Failed to generate random boot id: %m");
2179
2180 r = id128_write(path, ID128_UUID, rnd, false);
2181 if (r < 0)
2182 return log_error_errno(r, "Failed to write boot id: %m");
2183
2184 from = TAKE_PTR(path);
2185 to = "/proc/sys/kernel/random/boot_id";
2186
2187 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2188 if (r < 0)
2189 return r;
2190
2191 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2192 }
2193
2194 static int copy_devnodes(const char *dest) {
2195 static const char devnodes[] =
2196 "null\0"
2197 "zero\0"
2198 "full\0"
2199 "random\0"
2200 "urandom\0"
2201 "tty\0"
2202 "net/tun\0";
2203
2204 _cleanup_umask_ mode_t u;
2205 const char *d;
2206 int r = 0;
2207
2208 assert(dest);
2209
2210 u = umask(0000);
2211
2212 /* Create /dev/net, so that we can create /dev/net/tun in it */
2213 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2214 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2215
2216 NULSTR_FOREACH(d, devnodes) {
2217 _cleanup_free_ char *from = NULL, *to = NULL;
2218 struct stat st;
2219
2220 from = path_join("/dev/", d);
2221 if (!from)
2222 return log_oom();
2223
2224 to = path_join(dest, from);
2225 if (!to)
2226 return log_oom();
2227
2228 if (stat(from, &st) < 0) {
2229
2230 if (errno != ENOENT)
2231 return log_error_errno(errno, "Failed to stat %s: %m", from);
2232
2233 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2234 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2235 "%s is not a char or block device, cannot copy.", from);
2236 else {
2237 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2238
2239 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2240 /* Explicitly warn the user when /dev is already populated. */
2241 if (errno == EEXIST)
2242 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2243 if (errno != EPERM)
2244 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2245
2246 /* Some systems abusively restrict mknod but allow bind mounts. */
2247 r = touch(to);
2248 if (r < 0)
2249 return log_error_errno(r, "touch (%s) failed: %m", to);
2250 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2251 if (r < 0)
2252 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2253 }
2254
2255 r = userns_lchown(to, 0, 0);
2256 if (r < 0)
2257 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2258
2259 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2260 if (!dn)
2261 return log_oom();
2262
2263 r = userns_mkdir(dest, dn, 0755, 0, 0);
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to create '%s': %m", dn);
2266
2267 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2268 return log_oom();
2269
2270 prefixed = path_join(dest, sl);
2271 if (!prefixed)
2272 return log_oom();
2273
2274 t = path_join("..", d);
2275 if (!t)
2276 return log_oom();
2277
2278 if (symlink(t, prefixed) < 0)
2279 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2280 }
2281 }
2282
2283 return r;
2284 }
2285
2286 static int make_extra_nodes(const char *dest) {
2287 _cleanup_umask_ mode_t u;
2288 size_t i;
2289 int r;
2290
2291 u = umask(0000);
2292
2293 for (i = 0; i < arg_n_extra_nodes; i++) {
2294 _cleanup_free_ char *path = NULL;
2295 DeviceNode *n = arg_extra_nodes + i;
2296
2297 path = path_join(dest, n->path);
2298 if (!path)
2299 return log_oom();
2300
2301 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2302 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2303
2304 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2307 }
2308
2309 return 0;
2310 }
2311
2312 static int setup_pts(const char *dest) {
2313 _cleanup_free_ char *options = NULL;
2314 const char *p;
2315 int r;
2316
2317 #if HAVE_SELINUX
2318 if (arg_selinux_apifs_context)
2319 (void) asprintf(&options,
2320 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2321 arg_uid_shift + TTY_GID,
2322 arg_selinux_apifs_context);
2323 else
2324 #endif
2325 (void) asprintf(&options,
2326 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2327 arg_uid_shift + TTY_GID);
2328
2329 if (!options)
2330 return log_oom();
2331
2332 /* Mount /dev/pts itself */
2333 p = prefix_roota(dest, "/dev/pts");
2334 r = mkdir_errno_wrapper(p, 0755);
2335 if (r < 0)
2336 return log_error_errno(r, "Failed to create /dev/pts: %m");
2337
2338 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2339 if (r < 0)
2340 return r;
2341 r = userns_lchown(p, 0, 0);
2342 if (r < 0)
2343 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2344
2345 /* Create /dev/ptmx symlink */
2346 p = prefix_roota(dest, "/dev/ptmx");
2347 if (symlink("pts/ptmx", p) < 0)
2348 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2349 r = userns_lchown(p, 0, 0);
2350 if (r < 0)
2351 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2352
2353 /* And fix /dev/pts/ptmx ownership */
2354 p = prefix_roota(dest, "/dev/pts/ptmx");
2355 r = userns_lchown(p, 0, 0);
2356 if (r < 0)
2357 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2358
2359 return 0;
2360 }
2361
2362 static int setup_stdio_as_dev_console(void) {
2363 _cleanup_close_ int terminal = -1;
2364 int r;
2365
2366 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2367 * explicitly, if we are configured to. */
2368 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2369 if (terminal < 0)
2370 return log_error_errno(terminal, "Failed to open console: %m");
2371
2372 /* Make sure we can continue logging to the original stderr, even if
2373 * stderr points elsewhere now */
2374 r = log_dup_console();
2375 if (r < 0)
2376 return log_error_errno(r, "Failed to duplicate stderr: %m");
2377
2378 /* invalidates 'terminal' on success and failure */
2379 r = rearrange_stdio(terminal, terminal, terminal);
2380 TAKE_FD(terminal);
2381 if (r < 0)
2382 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2383
2384 return 0;
2385 }
2386
2387 static int setup_dev_console(const char *console) {
2388 _cleanup_free_ char *p = NULL;
2389 int r;
2390
2391 /* Create /dev/console symlink */
2392 r = path_make_relative("/dev", console, &p);
2393 if (r < 0)
2394 return log_error_errno(r, "Failed to create relative path: %m");
2395
2396 if (symlink(p, "/dev/console") < 0)
2397 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2398
2399 return 0;
2400 }
2401
2402 static int setup_keyring(void) {
2403 key_serial_t keyring;
2404
2405 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2406 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2407 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2408 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2409 * into the container. */
2410
2411 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2412 if (keyring == -1) {
2413 if (errno == ENOSYS)
2414 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2415 else if (ERRNO_IS_PRIVILEGE(errno))
2416 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2417 else
2418 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2419 }
2420
2421 return 0;
2422 }
2423
2424 static int setup_credentials(const char *root) {
2425 const char *q;
2426 int r;
2427
2428 if (arg_n_credentials <= 0)
2429 return 0;
2430
2431 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2432 if (r < 0)
2433 return log_error_errno(r, "Failed to create /run/host: %m");
2434
2435 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2436 if (r < 0)
2437 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2438
2439 q = prefix_roota(root, "/run/host/credentials");
2440 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2441 if (r < 0)
2442 return r;
2443
2444 for (size_t i = 0; i < arg_n_credentials; i++) {
2445 _cleanup_free_ char *j = NULL;
2446 _cleanup_close_ int fd = -1;
2447
2448 j = path_join(q, arg_credentials[i].id);
2449 if (!j)
2450 return log_oom();
2451
2452 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2453 if (fd < 0)
2454 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2455
2456 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2459
2460 if (fchmod(fd, 0400) < 0)
2461 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2462
2463 if (arg_userns_mode != USER_NAMESPACE_NO) {
2464 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2465 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2466 }
2467 }
2468
2469 if (chmod(q, 0500) < 0)
2470 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2471
2472 r = userns_lchown(q, 0, 0);
2473 if (r < 0)
2474 return r;
2475
2476 /* Make both mount and superblock read-only now */
2477 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2478 if (r < 0)
2479 return r;
2480
2481 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2482 }
2483
2484 static int setup_kmsg(int kmsg_socket) {
2485 _cleanup_(unlink_and_freep) char *from = NULL;
2486 _cleanup_free_ char *fifo = NULL;
2487 _cleanup_close_ int fd = -1;
2488 _cleanup_umask_ mode_t u;
2489 int r;
2490
2491 assert(kmsg_socket >= 0);
2492
2493 u = umask(0000);
2494
2495 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2496 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2497 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2498 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2499
2500 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2501 if (r < 0)
2502 return log_error_errno(r, "Failed to generate kmsg path: %m");
2503
2504 if (mkfifo(fifo, 0600) < 0)
2505 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2506
2507 from = TAKE_PTR(fifo);
2508
2509 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2510 if (r < 0)
2511 return r;
2512
2513 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2514 if (fd < 0)
2515 return log_error_errno(errno, "Failed to open fifo: %m");
2516
2517 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2518 r = send_one_fd(kmsg_socket, fd, 0);
2519 if (r < 0)
2520 return log_error_errno(r, "Failed to send FIFO fd: %m");
2521
2522 return 0;
2523 }
2524
2525 struct ExposeArgs {
2526 union in_addr_union address4;
2527 union in_addr_union address6;
2528 struct FirewallContext *fw_ctx;
2529 };
2530
2531 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2532 struct ExposeArgs *args = userdata;
2533
2534 assert(rtnl);
2535 assert(m);
2536 assert(args);
2537
2538 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2539 expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2540 return 0;
2541 }
2542
2543 static int setup_hostname(void) {
2544 int r;
2545
2546 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2547 return 0;
2548
2549 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to set hostname: %m");
2552
2553 return 0;
2554 }
2555
2556 static int setup_journal(const char *directory) {
2557 _cleanup_free_ char *d = NULL;
2558 char id[SD_ID128_STRING_MAX];
2559 const char *dirname, *p, *q;
2560 sd_id128_t this_id;
2561 bool try;
2562 int r;
2563
2564 /* Don't link journals in ephemeral mode */
2565 if (arg_ephemeral)
2566 return 0;
2567
2568 if (arg_link_journal == LINK_NO)
2569 return 0;
2570
2571 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2572
2573 r = sd_id128_get_machine(&this_id);
2574 if (r < 0)
2575 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2576
2577 if (sd_id128_equal(arg_uuid, this_id)) {
2578 log_full(try ? LOG_WARNING : LOG_ERR,
2579 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2580 if (try)
2581 return 0;
2582 return -EEXIST;
2583 }
2584
2585 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2586 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2587 if (r < 0) {
2588 bool ignore = r == -EROFS && try;
2589 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2590 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2591 return ignore ? 0 : r;
2592 }
2593 }
2594
2595 (void) sd_id128_to_string(arg_uuid, id);
2596
2597 p = strjoina("/var/log/journal/", id);
2598 q = prefix_roota(directory, p);
2599
2600 if (path_is_mount_point(p, NULL, 0) > 0) {
2601 if (try)
2602 return 0;
2603
2604 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2605 "%s: already a mount point, refusing to use for journal", p);
2606 }
2607
2608 if (path_is_mount_point(q, NULL, 0) > 0) {
2609 if (try)
2610 return 0;
2611
2612 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2613 "%s: already a mount point, refusing to use for journal", q);
2614 }
2615
2616 r = readlink_and_make_absolute(p, &d);
2617 if (r >= 0) {
2618 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2619 path_equal(d, q)) {
2620
2621 r = userns_mkdir(directory, p, 0755, 0, 0);
2622 if (r < 0)
2623 log_warning_errno(r, "Failed to create directory %s: %m", q);
2624 return 0;
2625 }
2626
2627 if (unlink(p) < 0)
2628 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2629 } else if (r == -EINVAL) {
2630
2631 if (arg_link_journal == LINK_GUEST &&
2632 rmdir(p) < 0) {
2633
2634 if (errno == ENOTDIR) {
2635 log_error("%s already exists and is neither a symlink nor a directory", p);
2636 return r;
2637 } else
2638 return log_error_errno(errno, "Failed to remove %s: %m", p);
2639 }
2640 } else if (r != -ENOENT)
2641 return log_error_errno(r, "readlink(%s) failed: %m", p);
2642
2643 if (arg_link_journal == LINK_GUEST) {
2644
2645 if (symlink(q, p) < 0) {
2646 if (try) {
2647 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2648 return 0;
2649 } else
2650 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2651 }
2652
2653 r = userns_mkdir(directory, p, 0755, 0, 0);
2654 if (r < 0)
2655 log_warning_errno(r, "Failed to create directory %s: %m", q);
2656 return 0;
2657 }
2658
2659 if (arg_link_journal == LINK_HOST) {
2660 /* don't create parents here — if the host doesn't have
2661 * permanent journal set up, don't force it here */
2662
2663 r = mkdir_errno_wrapper(p, 0755);
2664 if (r < 0 && r != -EEXIST) {
2665 if (try) {
2666 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2667 return 0;
2668 } else
2669 return log_error_errno(r, "Failed to create %s: %m", p);
2670 }
2671
2672 } else if (access(p, F_OK) < 0)
2673 return 0;
2674
2675 if (dir_is_empty(q) == 0)
2676 log_warning("%s is not empty, proceeding anyway.", q);
2677
2678 r = userns_mkdir(directory, p, 0755, 0, 0);
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to create %s: %m", q);
2681
2682 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2683 if (r < 0)
2684 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2685
2686 return 0;
2687 }
2688
2689 static int drop_capabilities(uid_t uid) {
2690 CapabilityQuintet q;
2691
2692 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2693 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2694 * arg_caps_retain. */
2695
2696 if (capability_quintet_is_set(&arg_full_capabilities)) {
2697 q = arg_full_capabilities;
2698
2699 if (q.bounding == UINT64_MAX)
2700 q.bounding = uid == 0 ? arg_caps_retain : 0;
2701
2702 if (q.effective == UINT64_MAX)
2703 q.effective = uid == 0 ? q.bounding : 0;
2704
2705 if (q.inheritable == UINT64_MAX)
2706 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2707
2708 if (q.permitted == UINT64_MAX)
2709 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2710
2711 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2712 q.ambient = arg_caps_ambient;
2713
2714 if (capability_quintet_mangle(&q))
2715 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2716
2717 } else {
2718 q = (CapabilityQuintet) {
2719 .bounding = arg_caps_retain,
2720 .effective = uid == 0 ? arg_caps_retain : 0,
2721 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2722 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2723 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2724 };
2725
2726 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2727 * in order to maintain the same behavior as systemd < 242. */
2728 if (capability_quintet_mangle(&q))
2729 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2730 "Some capabilities will not be set because they are not in the current bounding set.");
2731
2732 }
2733
2734 return capability_quintet_enforce(&q);
2735 }
2736
2737 static int reset_audit_loginuid(void) {
2738 _cleanup_free_ char *p = NULL;
2739 int r;
2740
2741 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2742 return 0;
2743
2744 r = read_one_line_file("/proc/self/loginuid", &p);
2745 if (r == -ENOENT)
2746 return 0;
2747 if (r < 0)
2748 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2749
2750 /* Already reset? */
2751 if (streq(p, "4294967295"))
2752 return 0;
2753
2754 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2755 if (r < 0) {
2756 log_error_errno(r,
2757 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2758 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2759 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2760 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2761 "using systemd-nspawn. Sleeping for 5s... (%m)");
2762
2763 sleep(5);
2764 }
2765
2766 return 0;
2767 }
2768
2769 static int setup_propagate(const char *root) {
2770 const char *p, *q;
2771 int r;
2772
2773 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2774 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2775 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2776 (void) mkdir_p(p, 0600);
2777
2778 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2779 if (r < 0)
2780 return log_error_errno(r, "Failed to create /run/host: %m");
2781
2782 r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
2783 if (r < 0)
2784 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
2785
2786 q = prefix_roota(root, "/run/host/incoming");
2787 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2788 if (r < 0)
2789 return r;
2790
2791 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2792 if (r < 0)
2793 return r;
2794
2795 /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
2796 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2797 }
2798
2799 static int setup_machine_id(const char *directory) {
2800 const char *etc_machine_id;
2801 sd_id128_t id;
2802 int r;
2803
2804 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2805 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2806 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2807 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2808 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2809 * container behaves nicely). */
2810
2811 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2812
2813 r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
2814 if (r < 0) {
2815 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2816 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2817
2818 if (sd_id128_is_null(arg_uuid)) {
2819 r = sd_id128_randomize(&arg_uuid);
2820 if (r < 0)
2821 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2822 }
2823 } else {
2824 if (sd_id128_is_null(id))
2825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2826 "Machine ID in container image is zero, refusing.");
2827
2828 arg_uuid = id;
2829 }
2830
2831 return 0;
2832 }
2833
2834 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2835 int r;
2836
2837 assert(directory);
2838
2839 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2840 return 0;
2841
2842 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2843 if (r == -EOPNOTSUPP)
2844 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2845 if (r == -EBADE)
2846 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2849 if (r == 0)
2850 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2851 else
2852 log_debug("Patched directory tree to match UID/GID range.");
2853
2854 return r;
2855 }
2856
2857 /*
2858 * Return values:
2859 * < 0 : wait_for_terminate() failed to get the state of the
2860 * container, the container was terminated by a signal, or
2861 * failed for an unknown reason. No change is made to the
2862 * container argument.
2863 * > 0 : The program executed in the container terminated with an
2864 * error. The exit code of the program executed in the
2865 * container is returned. The container argument has been set
2866 * to CONTAINER_TERMINATED.
2867 * 0 : The container is being rebooted, has been shut down or exited
2868 * successfully. The container argument has been set to either
2869 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2870 *
2871 * That is, success is indicated by a return value of zero, and an
2872 * error is indicated by a non-zero value.
2873 */
2874 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2875 siginfo_t status;
2876 int r;
2877
2878 r = wait_for_terminate(pid, &status);
2879 if (r < 0)
2880 return log_warning_errno(r, "Failed to wait for container: %m");
2881
2882 switch (status.si_code) {
2883
2884 case CLD_EXITED:
2885 if (status.si_status == 0)
2886 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2887 else
2888 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2889
2890 *container = CONTAINER_TERMINATED;
2891 return status.si_status;
2892
2893 case CLD_KILLED:
2894 if (status.si_status == SIGINT) {
2895 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2896 *container = CONTAINER_TERMINATED;
2897 return 0;
2898
2899 } else if (status.si_status == SIGHUP) {
2900 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2901 *container = CONTAINER_REBOOTED;
2902 return 0;
2903 }
2904
2905 _fallthrough_;
2906 case CLD_DUMPED:
2907 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2908 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2909
2910 default:
2911 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2912 "Container %s failed due to unknown reason.", arg_machine);
2913 }
2914 }
2915
2916 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2917 pid_t pid;
2918
2919 pid = PTR_TO_PID(userdata);
2920 if (pid > 0) {
2921 if (kill(pid, arg_kill_signal) >= 0) {
2922 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2923 sd_event_source_set_userdata(s, NULL);
2924 return 0;
2925 }
2926 }
2927
2928 sd_event_exit(sd_event_source_get_event(s), 0);
2929 return 0;
2930 }
2931
2932 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2933 pid_t pid;
2934
2935 assert(s);
2936 assert(ssi);
2937
2938 pid = PTR_TO_PID(userdata);
2939
2940 for (;;) {
2941 siginfo_t si = {};
2942
2943 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2944 return log_error_errno(errno, "Failed to waitid(): %m");
2945 if (si.si_pid == 0) /* No pending children. */
2946 break;
2947 if (si.si_pid == pid) {
2948 /* The main process we care for has exited. Return from
2949 * signal handler but leave the zombie. */
2950 sd_event_exit(sd_event_source_get_event(s), 0);
2951 break;
2952 }
2953
2954 /* Reap all other children. */
2955 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2956 }
2957
2958 return 0;
2959 }
2960
2961 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2962 pid_t pid;
2963
2964 assert(m);
2965
2966 pid = PTR_TO_PID(userdata);
2967
2968 if (arg_kill_signal > 0) {
2969 log_info("Container termination requested. Attempting to halt container.");
2970 (void) kill(pid, arg_kill_signal);
2971 } else {
2972 log_info("Container termination requested. Exiting.");
2973 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2974 }
2975
2976 return 0;
2977 }
2978
2979 static int determine_names(void) {
2980 int r;
2981
2982 if (arg_template && !arg_directory && arg_machine) {
2983
2984 /* If --template= was specified then we should not
2985 * search for a machine, but instead create a new one
2986 * in /var/lib/machine. */
2987
2988 arg_directory = path_join("/var/lib/machines", arg_machine);
2989 if (!arg_directory)
2990 return log_oom();
2991 }
2992
2993 if (!arg_image && !arg_directory) {
2994 if (arg_machine) {
2995 _cleanup_(image_unrefp) Image *i = NULL;
2996
2997 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
2998 if (r == -ENOENT)
2999 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3000 if (r < 0)
3001 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3002
3003 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3004 r = free_and_strdup(&arg_image, i->path);
3005 else
3006 r = free_and_strdup(&arg_directory, i->path);
3007 if (r < 0)
3008 return log_oom();
3009
3010 if (!arg_ephemeral)
3011 arg_read_only = arg_read_only || i->read_only;
3012 } else {
3013 r = safe_getcwd(&arg_directory);
3014 if (r < 0)
3015 return log_error_errno(r, "Failed to determine current directory: %m");
3016 }
3017
3018 if (!arg_directory && !arg_image)
3019 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3020 }
3021
3022 if (!arg_machine) {
3023 if (arg_directory && path_equal(arg_directory, "/"))
3024 arg_machine = gethostname_malloc();
3025 else if (arg_image) {
3026 char *e;
3027
3028 arg_machine = strdup(basename(arg_image));
3029
3030 /* Truncate suffix if there is one */
3031 e = endswith(arg_machine, ".raw");
3032 if (e)
3033 *e = 0;
3034 } else
3035 arg_machine = strdup(basename(arg_directory));
3036 if (!arg_machine)
3037 return log_oom();
3038
3039 hostname_cleanup(arg_machine);
3040 if (!hostname_is_valid(arg_machine, 0))
3041 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3042
3043 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3044 * instances at once without manually having to specify -M each time. */
3045 if (arg_ephemeral)
3046 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3047 return log_oom();
3048 }
3049
3050 return 0;
3051 }
3052
3053 static int chase_symlinks_and_update(char **p, unsigned flags) {
3054 char *chased;
3055 int r;
3056
3057 assert(p);
3058
3059 if (!*p)
3060 return 0;
3061
3062 r = chase_symlinks(*p, NULL, flags, &chased, NULL);
3063 if (r < 0)
3064 return log_error_errno(r, "Failed to resolve path %s: %m", *p);
3065
3066 return free_and_replace(*p, chased);
3067 }
3068
3069 static int determine_uid_shift(const char *directory) {
3070
3071 if (arg_userns_mode == USER_NAMESPACE_NO) {
3072 arg_uid_shift = 0;
3073 return 0;
3074 }
3075
3076 if (arg_uid_shift == UID_INVALID) {
3077 struct stat st;
3078
3079 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3080
3081 if (stat(directory, &st) < 0)
3082 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3083
3084 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3085
3086 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3087 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3088 "UID and GID base of %s don't match.", directory);
3089
3090 arg_uid_range = UINT32_C(0x10000);
3091
3092 if (arg_uid_shift != 0) {
3093 /* If the image is shifted already, then we'll fall back to classic chowning, for
3094 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3095
3096 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3097 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3098 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3099 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3100 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3101 "UID base of %s is not zero, UID mapping not supported.", directory);
3102 }
3103 }
3104
3105 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3106 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3107
3108 return 0;
3109 }
3110
3111 static unsigned long effective_clone_ns_flags(void) {
3112 unsigned long flags = arg_clone_ns_flags;
3113
3114 if (arg_private_network)
3115 flags |= CLONE_NEWNET;
3116 if (arg_use_cgns)
3117 flags |= CLONE_NEWCGROUP;
3118 if (arg_userns_mode != USER_NAMESPACE_NO)
3119 flags |= CLONE_NEWUSER;
3120
3121 return flags;
3122 }
3123
3124 static int patch_sysctl(void) {
3125
3126 /* This table is inspired by runc's sysctl() function */
3127 static const struct {
3128 const char *key;
3129 bool prefix;
3130 unsigned long clone_flags;
3131 } safe_sysctl[] = {
3132 { "kernel.hostname", false, CLONE_NEWUTS },
3133 { "kernel.domainname", false, CLONE_NEWUTS },
3134 { "kernel.msgmax", false, CLONE_NEWIPC },
3135 { "kernel.msgmnb", false, CLONE_NEWIPC },
3136 { "kernel.msgmni", false, CLONE_NEWIPC },
3137 { "kernel.sem", false, CLONE_NEWIPC },
3138 { "kernel.shmall", false, CLONE_NEWIPC },
3139 { "kernel.shmmax", false, CLONE_NEWIPC },
3140 { "kernel.shmmni", false, CLONE_NEWIPC },
3141 { "fs.mqueue.", true, CLONE_NEWIPC },
3142 { "net.", true, CLONE_NEWNET },
3143 };
3144
3145 unsigned long flags;
3146 char **k, **v;
3147 int r;
3148
3149 flags = effective_clone_ns_flags();
3150
3151 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3152 bool good = false;
3153 size_t i;
3154
3155 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3156
3157 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3158 continue;
3159
3160 if (safe_sysctl[i].prefix)
3161 good = startswith(*k, safe_sysctl[i].key);
3162 else
3163 good = streq(*k, safe_sysctl[i].key);
3164
3165 if (good)
3166 break;
3167 }
3168
3169 if (!good)
3170 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3171
3172 r = sysctl_write(*k, *v);
3173 if (r < 0)
3174 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3175 }
3176
3177 return 0;
3178 }
3179
/* Body of the "inner" child process: finishes the set-up from inside the container's namespaces and finally
 * exec()s the payload.
 *
 * barrier           - used to synchronize the individual set-up steps with the parent (see #1..#5 below)
 * directory         - only checked for being non-NULL in this function
 * secondary         - if true and no explicit personality is configured, PER_LINUX32 is selected
 * kmsg_socket       - handed to setup_kmsg() and closed afterwards
 * rtnl_socket       - handed to expose_port_send_rtnl() if port exposure is configured, then closed
 * master_pty_socket - socket the master side of the freshly allocated pty is sent over
 * fds               - file descriptors announced to the payload via $LISTEN_FDS; O_CLOEXEC is turned off for
 *                     them, and fdset_close_others() is called with this set right before exec
 * os_release_pairs  - extra environment assignments merged into the payload's environment
 *
 * Returns a negative errno-style value if any step fails, including when none of the exec*() calls
 * succeeds. */
3180 static int inner_child(
3181 Barrier *barrier,
3182 const char *directory,
3183 bool secondary,
3184 int kmsg_socket,
3185 int rtnl_socket,
3186 int master_pty_socket,
3187 FDSet *fds,
3188 char **os_release_pairs) {
3189
3190 _cleanup_free_ char *home = NULL;
3191 char as_uuid[ID128_UUID_STRING_MAX];
/* envp[0] is pre-filled with $PATH below, hence the next free slot is 1. The remaining slots are filled on
 * demand; the final NULL keeps the array terminated even if all of them are used. */
3192 size_t n_env = 1;
3193 const char *envp[] = {
3194 "PATH=" DEFAULT_PATH_COMPAT,
3195 NULL, /* container */
3196 NULL, /* TERM */
3197 NULL, /* HOME */
3198 NULL, /* USER */
3199 NULL, /* LOGNAME */
3200 NULL, /* container_uuid */
3201 NULL, /* LISTEN_FDS */
3202 NULL, /* LISTEN_PID */
3203 NULL, /* NOTIFY_SOCKET */
3204 NULL, /* CREDENTIALS_DIRECTORY */
3205 NULL
3206 };
3207 const char *exec_target;
3208 _cleanup_strv_free_ char **env_use = NULL;
3209 int r, which_failed;
3210
3211 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3212 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3213 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3214 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3215 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3216 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3217 * namespace.
3218 *
3219 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3220 * unshare(). See below. */
3221
3222 assert(barrier);
3223 assert(directory);
3224 assert(kmsg_socket >= 0);
3225
3226 log_debug("Inner child is initializing.");
3227
3228 if (arg_userns_mode != USER_NAMESPACE_NO) {
3229 /* Tell the parent, that it now can write the UID map. */
3230 (void) barrier_place(barrier); /* #1 */
3231
3232 /* Wait until the parent wrote the UID map */
3233 if (!barrier_place_and_sync(barrier)) /* #2 */
3234 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3235
3236 /* Become the new root user inside our namespace */
3237 r = reset_uid_gid();
3238 if (r < 0)
3239 return log_error_errno(r, "Couldn't become new root: %m");
3240
3241 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3242 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3243 * propagation, but simply create new peer groups for all our mounts). */
3244 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3245 if (r < 0)
3246 return r;
3247 }
3248
3249 r = mount_all(NULL,
3250 arg_mount_settings | MOUNT_IN_USERNS,
3251 arg_uid_shift,
3252 arg_selinux_apifs_context);
3253 if (r < 0)
3254 return r;
3255
/* Only create a fresh network namespace if we were not told to use an existing one by path */
3256 if (!arg_network_namespace_path && arg_private_network) {
3257 r = unshare(CLONE_NEWNET);
3258 if (r < 0)
3259 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3260
3261 /* Tell the parent that it can setup network interfaces. */
3262 (void) barrier_place(barrier); /* #3 */
3263 }
3264
3265 r = mount_sysfs(NULL, arg_mount_settings);
3266 if (r < 0)
3267 return r;
3268
3269 /* Wait until we are cgroup-ified, so that we
3270 * can mount the right cgroup path writable */
3271 if (!barrier_place_and_sync(barrier)) /* #4 */
3272 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3273 "Parent died too early");
3274
3275 if (arg_use_cgns) {
3276 r = unshare(CLONE_NEWCGROUP);
3277 if (r < 0)
3278 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3279 r = mount_cgroups(
3280 "",
3281 arg_unified_cgroup_hierarchy,
3282 arg_userns_mode != USER_NAMESPACE_NO,
3283 arg_uid_shift,
3284 arg_uid_range,
3285 arg_selinux_apifs_context,
3286 true);
3287 } else
3288 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
/* The check below covers the result of whichever of the two branches above was taken */
3289 if (r < 0)
3290 return r;
3291
3292 r = setup_boot_id();
3293 if (r < 0)
3294 return r;
3295
3296 r = setup_kmsg(kmsg_socket);
3297 if (r < 0)
3298 return r;
3299 kmsg_socket = safe_close(kmsg_socket);
3300
3301 r = mount_custom(
3302 "/",
3303 arg_custom_mounts,
3304 arg_n_custom_mounts,
3305 0,
3306 arg_selinux_apifs_context,
3307 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3308 if (r < 0)
3309 return r;
3310
3311 if (setsid() < 0)
3312 return log_error_errno(errno, "setsid() failed: %m");
3313
3314 if (arg_private_network)
3315 (void) loopback_setup();
3316
3317 if (arg_expose_ports) {
3318 r = expose_port_send_rtnl(rtnl_socket);
3319 if (r < 0)
3320 return r;
3321 rtnl_socket = safe_close(rtnl_socket);
3322 }
3323
3324 if (arg_console_mode != CONSOLE_PIPE) {
3325 _cleanup_close_ int master = -1;
3326 _cleanup_free_ char *console = NULL;
3327
3328 /* Allocate a pty and make it available as /dev/console. */
3329 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3330 if (master < 0)
3331 return log_error_errno(master, "Failed to allocate a pty: %m");
3332
3333 r = setup_dev_console(console);
3334 if (r < 0)
3335 return log_error_errno(r, "Failed to set up /dev/console: %m");
3336
/* Hand the master side over the socket; our own copy is closed when leaving this scope */
3337 r = send_one_fd(master_pty_socket, master, 0);
3338 if (r < 0)
3339 return log_error_errno(r, "Failed to send master fd: %m");
3340 master_pty_socket = safe_close(master_pty_socket);
3341
3342 r = setup_stdio_as_dev_console();
3343 if (r < 0)
3344 return r;
3345 }
3346
3347 r = patch_sysctl();
3348 if (r < 0)
3349 return r;
3350
3351 if (arg_oom_score_adjust_set) {
3352 r = set_oom_score_adjust(arg_oom_score_adjust);
3353 if (r < 0)
3354 return log_error_errno(r, "Failed to adjust OOM score: %m");
3355 }
3356
3357 if (arg_cpu_set.set)
3358 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3359 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3360
3361 (void) setup_hostname();
3362
/* An explicitly configured personality takes precedence over the one implied by 'secondary' */
3363 if (arg_personality != PERSONALITY_INVALID) {
3364 r = safe_personality(arg_personality);
3365 if (r < 0)
3366 return log_error_errno(r, "personality() failed: %m");
3367 } else if (secondary) {
3368 r = safe_personality(PER_LINUX32);
3369 if (r < 0)
3370 return log_error_errno(r, "personality() failed: %m");
3371 }
3372
3373 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3374 if (r < 0)
3375 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3376
/* If a ready-made filter was supplied in arg_seccomp it is loaded (when seccomp is available), and only
 * errors matching ERRNO_IS_SECCOMP_FATAL() abort. Otherwise, and always when built without seccomp
 * support, setup_seccomp() is called instead. */
3377 #if HAVE_SECCOMP
3378 if (arg_seccomp) {
3379
3380 if (is_seccomp_available()) {
3381
3382 r = seccomp_load(arg_seccomp);
3383 if (ERRNO_IS_SECCOMP_FATAL(r))
3384 return log_error_errno(r, "Failed to install seccomp filter: %m");
3385 if (r < 0)
3386 log_debug_errno(r, "Failed to install seccomp filter: %m");
3387 }
3388 } else
3389 #endif
3390 {
3391 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3392 if (r < 0)
3393 return r;
3394 }
3395
3396 #if HAVE_SELINUX
3397 if (arg_selinux_context)
3398 if (setexeccon(arg_selinux_context) < 0)
3399 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3400 #endif
3401
3402 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3403 * if we need to later on. */
3404 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3405 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3406
/* Numeric UID/GID settings take precedence over a user name; only the latter path fills in 'home' */
3407 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3408 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3409 else
3410 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3411 if (r < 0)
3412 return r;
3413
3414 r = drop_capabilities(getuid());
3415 if (r < 0)
3416 return log_error_errno(r, "Dropping capabilities failed: %m");
3417
3418 if (arg_no_new_privileges)
3419 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3420 return log_error_errno(errno, "Failed to disable new privileges: %m");
3421
3422 /* LXC sets container=lxc, so follow the scheme here */
3423 envp[n_env++] = strjoina("container=", arg_container_service_name);
3424
/* $TERM is taken over from our own environment, if set; the slot is only consumed in that case */
3425 envp[n_env] = strv_find_prefix(environ, "TERM=");
3426 if (envp[n_env])
3427 n_env++;
3428
3429 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3430 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3431 return log_oom();
3432
3433 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3434 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3435 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3436 return log_oom();
3437
3438 assert(!sd_id128_is_null(arg_uuid));
3439
3440 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3441 return log_oom();
3442
3443 if (fdset_size(fds) > 0) {
/* The passed fds must survive the exec below, hence drop O_CLOEXEC from them */
3444 r = fdset_cloexec(fds, false);
3445 if (r < 0)
3446 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3447
3448 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3449 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3450 return log_oom();
3451 }
3452 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3453 return log_oom();
3454
3455 if (arg_n_credentials > 0) {
3456 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3457 if (!envp[n_env])
3458 return log_oom();
3459 n_env++;
3460 }
3461
/* Combine our own block with the os-release pairs and the user-configured assignments */
3462 env_use = strv_env_merge(3, envp, os_release_pairs, arg_setenv);
3463 if (!env_use)
3464 return log_oom();
3465
3466 /* Let the parent know that we are ready and
3467 * wait until the parent is ready with the
3468 * setup, too... */
3469 if (!barrier_place_and_sync(barrier)) /* #5 */
3470 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3471
3472 if (arg_chdir)
3473 if (chdir(arg_chdir) < 0)
3474 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3475
3476 if (arg_start_mode == START_PID2) {
3477 r = stub_pid1(arg_uuid);
3478 if (r < 0)
3479 return r;
3480 }
3481
3482 if (arg_console_mode != CONSOLE_PIPE) {
3483 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3484 * are configured for that. Acquire it as controlling tty. */
3485 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3486 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3487 }
3488
3489 log_debug("Inner child completed, invoking payload.");
3490
3491 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3492 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3493 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3494 log_close();
3495 log_set_open_when_needed(true);
3496
3497 (void) fdset_close_others(fds);
3498
3499 if (arg_start_mode == START_BOOT) {
3500 char **a;
3501 size_t m;
3502
3503 /* Automatically search for the init system */
3504
/* Build an argv with one free slot in front for the binary path, followed by the user's parameters */
3505 m = strv_length(arg_parameters);
3506 a = newa(char*, m + 2);
3507 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3508 a[1 + m] = NULL;
3509
/* execve() only returns on failure, hence simply try the candidates one after the other */
3510 a[0] = (char*) "/usr/lib/systemd/systemd";
3511 execve(a[0], a, env_use);
3512
3513 a[0] = (char*) "/lib/systemd/systemd";
3514 execve(a[0], a, env_use);
3515
3516 a[0] = (char*) "/sbin/init";
3517 execve(a[0], a, env_use);
3518
3519 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3520 } else if (!strv_isempty(arg_parameters)) {
3521 const char *dollar_path;
3522
3523 exec_target = arg_parameters[0];
3524
3525 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3526 * binary. */
3527 dollar_path = strv_env_get(env_use, "PATH");
3528 if (dollar_path) {
3529 if (setenv("PATH", dollar_path, 1) < 0)
3530 return log_error_errno(errno, "Failed to update $PATH: %m");
3531 }
3532
3533 execvpe(arg_parameters[0], arg_parameters, env_use);
3534 } else {
3535 if (!arg_chdir)
3536 /* If we cannot change the directory, we'll end up in /, that is expected. */
3537 (void) chdir(home ?: "/root");
3538
/* No command given: start a shell, with argv[0] set to "-bash" or "-sh" */
3539 execle("/bin/bash", "-bash", NULL, env_use);
3540 execle("/bin/sh", "-sh", NULL, env_use);
3541
3542 exec_target = "/bin/bash, /bin/sh";
3543 }
3544
/* We only get here if none of the exec*() calls above succeeded */
3545 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3546 }
3547
3548 static int setup_notify_child(void) {
3549 _cleanup_close_ int fd = -1;
3550 union sockaddr_union sa = {
3551 .un.sun_family = AF_UNIX,
3552 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3553 };
3554 int r;
3555
3556 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3557 if (fd < 0)
3558 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3559
3560 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3561 (void) sockaddr_un_unlink(&sa.un);
3562
3563 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3564 if (r < 0)
3565 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3566
3567 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3568 if (r < 0)
3569 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3570
3571 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3572 if (r < 0)
3573 return log_error_errno(r, "SO_PASSCRED failed: %m");
3574
3575 return TAKE_FD(fd);
3576 }
3577
3578 static int outer_child(
3579 Barrier *barrier,
3580 const char *directory,
3581 DissectedImage *dissected_image,
3582 bool secondary,
3583 int pid_socket,
3584 int uuid_socket,
3585 int notify_socket,
3586 int kmsg_socket,
3587 int rtnl_socket,
3588 int uid_shift_socket,
3589 int master_pty_socket,
3590 int unified_cgroup_hierarchy_socket,
3591 FDSet *fds,
3592 int netns_fd) {
3593
3594 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3595 _cleanup_strv_free_ char **os_release_pairs = NULL;
3596 _cleanup_close_ int fd = -1;
3597 bool idmap = false;
3598 const char *p;
3599 pid_t pid;
3600 ssize_t l;
3601 int r;
3602
3603 /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It already has
3604 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3605 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3606 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3607
3608 assert(barrier);
3609 assert(directory);
3610 assert(pid_socket >= 0);
3611 assert(uuid_socket >= 0);
3612 assert(notify_socket >= 0);
3613 assert(master_pty_socket >= 0);
3614 assert(kmsg_socket >= 0);
3615
3616 log_debug("Outer child is initializing.");
3617
3618 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3619 if (r < 0)
3620 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3621
3622 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3623 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3624
3625 r = reset_audit_loginuid();
3626 if (r < 0)
3627 return r;
3628
3629 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3630 * mounts to the real root. */
3631 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3632 if (r < 0)
3633 return r;
3634
3635 if (dissected_image) {
3636 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3637 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3638 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3639 * makes sure ESP partitions and userns are compatible. */
3640
3641 r = dissected_image_mount_and_warn(
3642 dissected_image,
3643 directory,
3644 arg_uid_shift,
3645 arg_uid_range,
3646 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3647 DISSECT_IMAGE_DISCARD_ON_LOOP|
3648 DISSECT_IMAGE_USR_NO_ROOT|
3649 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3650 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3651 if (r < 0)
3652 return r;
3653 }
3654
3655 r = determine_uid_shift(directory);
3656 if (r < 0)
3657 return r;
3658
3659 if (arg_userns_mode != USER_NAMESPACE_NO) {
3660 /* Let the parent know which UID shift we read from the image */
3661 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3662 if (l < 0)
3663 return log_error_errno(errno, "Failed to send UID shift: %m");
3664 if (l != sizeof(arg_uid_shift))
3665 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3666 "Short write while sending UID shift.");
3667
3668 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3669 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3670 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3671 * not it will pick a different one, and send it back to us. */
3672
3673 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3674 if (l < 0)
3675 return log_error_errno(errno, "Failed to recv UID shift: %m");
3676 if (l != sizeof(arg_uid_shift))
3677 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3678 "Short read while receiving UID shift.");
3679 }
3680
3681 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3682 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3683 }
3684
3685 if (path_equal(directory, "/")) {
3686 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3687 * place, so that we can make changes to its mount structure (for example, to implement
3688 * --volatile=) without this interfering with our ability to access files such as
3689 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3690 * (instead of a temporary directory, since we are living in our own mount namespace here
3691 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3692 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3693
3694 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3695 if (r < 0)
3696 return r;
3697
3698 directory = "/run/systemd/nspawn-root";
3699 }
3700
3701 if (arg_userns_mode != USER_NAMESPACE_NO &&
3702 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3703 arg_uid_shift != 0) {
3704 r = make_mount_point(directory);
3705 if (r < 0)
3706 return r;
3707
3708 r = remount_idmap(directory, arg_uid_shift, arg_uid_range);
3709 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3710 /* This might fail because the kernel or file system doesn't support idmapping. We
3711 * can't really distinguish this nicely, nor do we have any guarantees about the
3712 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3713 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3714 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3715 "ID mapped mounts are apparently not available, sorry.");
3716
3717 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3718 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3719 } else if (r < 0)
3720 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3721 else {
3722 log_debug("ID mapped mounts available, making use of them.");
3723 idmap = true;
3724 }
3725 }
3726
3727 r = setup_pivot_root(
3728 directory,
3729 arg_pivot_root_new,
3730 arg_pivot_root_old);
3731 if (r < 0)
3732 return r;
3733
3734 r = setup_volatile_mode(
3735 directory,
3736 arg_volatile_mode,
3737 arg_uid_shift,
3738 arg_selinux_apifs_context);
3739 if (r < 0)
3740 return r;
3741
3742 r = bind_user_prepare(
3743 directory,
3744 arg_bind_user,
3745 arg_uid_shift,
3746 arg_uid_range,
3747 &arg_custom_mounts, &arg_n_custom_mounts,
3748 &bind_user_context);
3749 if (r < 0)
3750 return r;
3751
3752 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3753 /* Send the user maps we determined to the parent, so that it installs it in our user namespace UID map table */
3754
3755 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3756 uid_t map[] = {
3757 bind_user_context->data[i].payload_user->uid,
3758 bind_user_context->data[i].host_user->uid,
3759 (uid_t) bind_user_context->data[i].payload_group->gid,
3760 (uid_t) bind_user_context->data[i].host_group->gid,
3761 };
3762
3763 l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
3764 if (l < 0)
3765 return log_error_errno(errno, "Failed to send user UID map: %m");
3766 if (l != sizeof(map))
3767 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3768 "Short write while sending user UID map.");
3769 }
3770 }
3771
3772 r = mount_custom(
3773 directory,
3774 arg_custom_mounts,
3775 arg_n_custom_mounts,
3776 arg_uid_shift,
3777 arg_selinux_apifs_context,
3778 MOUNT_ROOT_ONLY);
3779 if (r < 0)
3780 return r;
3781
3782 /* Make sure we always have a mount that we can move to root later on. */
3783 r = make_mount_point(directory);
3784 if (r < 0)
3785 return r;
3786
3787 if (dissected_image) {
3788 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3789 r = dissected_image_mount(
3790 dissected_image,
3791 directory,
3792 arg_uid_shift,
3793 arg_uid_range,
3794 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3795 DISSECT_IMAGE_DISCARD_ON_LOOP|
3796 DISSECT_IMAGE_USR_NO_ROOT|
3797 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3798 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3799 if (r == -EUCLEAN)
3800 return log_error_errno(r, "File system check for image failed: %m");
3801 if (r < 0)
3802 return log_error_errno(r, "Failed to mount image file system: %m");
3803 }
3804
3805 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3806 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3807
3808 r = detect_unified_cgroup_hierarchy_from_image(directory);
3809 if (r < 0)
3810 return r;
3811
3812 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3813 if (l < 0)
3814 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3815 if (l != sizeof(arg_unified_cgroup_hierarchy))
3816 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3817 "Short write while sending cgroup mode.");
3818
3819 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3820 }
3821
3822 /* Mark everything as shared so our mounts get propagated down. This is
3823 * required to make new bind mounts available in systemd services
3824 * inside the container that create a new mount namespace.
3825 * See https://github.com/systemd/systemd/issues/3860
3826 * Further submounts (such as /dev) done after this will inherit the
3827 * shared propagation mode.
3828 *
3829 * IMPORTANT: Do not overmount the root directory anymore from now on to
3830 * enable moving the root directory mount to root later on.
3831 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3832 */
3833 r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3834 if (r < 0)
3835 return r;
3836
3837 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3838 if (r < 0)
3839 return r;
3840
3841 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3842 if (r < 0)
3843 return r;
3844
3845 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3846 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3847 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3848 if (r < 0)
3849 return log_error_errno(r, "Failed to make tree read-only: %m");
3850 }
3851
3852 r = mount_all(directory,
3853 arg_mount_settings,
3854 arg_uid_shift,
3855 arg_selinux_apifs_context);
3856 if (r < 0)
3857 return r;
3858
3859 r = copy_devnodes(directory);
3860 if (r < 0)
3861 return r;
3862
3863 r = make_extra_nodes(directory);
3864 if (r < 0)
3865 return r;
3866
3867 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3868
3869 p = prefix_roota(directory, "/run/host");
3870 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3871
3872 r = setup_pts(directory);
3873 if (r < 0)
3874 return r;
3875
3876 r = setup_propagate(directory);
3877 if (r < 0)
3878 return r;
3879
3880 r = setup_keyring();
3881 if (r < 0)
3882 return r;
3883
3884 r = setup_credentials(directory);
3885 if (r < 0)
3886 return r;
3887
3888 r = bind_user_setup(bind_user_context, directory);
3889 if (r < 0)
3890 return r;
3891
3892 r = mount_custom(
3893 directory,
3894 arg_custom_mounts,
3895 arg_n_custom_mounts,
3896 arg_uid_shift,
3897 arg_selinux_apifs_context,
3898 MOUNT_NON_ROOT_ONLY);
3899 if (r < 0)
3900 return r;
3901
3902 r = setup_timezone(directory);
3903 if (r < 0)
3904 return r;
3905
3906 r = setup_resolv_conf(directory);
3907 if (r < 0)
3908 return r;
3909
3910 r = setup_machine_id(directory);
3911 if (r < 0)
3912 return r;
3913
3914 r = setup_journal(directory);
3915 if (r < 0)
3916 return r;
3917
3918 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3919 p = prefix_roota(directory, "/run/host/container-manager");
3920 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3921
3922 /* The same stuff as the $container_uuid env var */
3923 p = prefix_roota(directory, "/run/host/container-uuid");
3924 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3925
3926 if (!arg_use_cgns) {
3927 r = mount_cgroups(
3928 directory,
3929 arg_unified_cgroup_hierarchy,
3930 arg_userns_mode != USER_NAMESPACE_NO,
3931 arg_uid_shift,
3932 arg_uid_range,
3933 arg_selinux_apifs_context,
3934 false);
3935 if (r < 0)
3936 return r;
3937 }
3938
3939 r = mount_move_root(directory);
3940 if (r < 0)
3941 return log_error_errno(r, "Failed to move root directory: %m");
3942
3943 fd = setup_notify_child();
3944 if (fd < 0)
3945 return fd;
3946
3947 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3948 arg_clone_ns_flags |
3949 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3950 if (pid < 0)
3951 return log_error_errno(errno, "Failed to fork inner child: %m");
3952 if (pid == 0) {
3953 pid_socket = safe_close(pid_socket);
3954 uuid_socket = safe_close(uuid_socket);
3955 notify_socket = safe_close(notify_socket);
3956 uid_shift_socket = safe_close(uid_shift_socket);
3957
3958 /* The inner child has all namespaces that are requested, so that we all are owned by the
3959 * user if user namespaces are turned on. */
3960
3961 if (arg_network_namespace_path) {
3962 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3963 if (r < 0)
3964 return log_error_errno(r, "Failed to join network namespace: %m");
3965 }
3966
3967 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
3968 if (r < 0)
3969 _exit(EXIT_FAILURE);
3970
3971 _exit(EXIT_SUCCESS);
3972 }
3973
3974 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3975 if (l < 0)
3976 return log_error_errno(errno, "Failed to send PID: %m");
3977 if (l != sizeof(pid))
3978 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3979 "Short write while sending PID.");
3980
3981 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3982 if (l < 0)
3983 return log_error_errno(errno, "Failed to send machine ID: %m");
3984 if (l != sizeof(arg_uuid))
3985 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3986 "Short write while sending machine ID.");
3987
3988 l = send_one_fd(notify_socket, fd, 0);
3989 if (l < 0)
3990 return log_error_errno(l, "Failed to send notify fd: %m");
3991
3992 pid_socket = safe_close(pid_socket);
3993 uuid_socket = safe_close(uuid_socket);
3994 notify_socket = safe_close(notify_socket);
3995 master_pty_socket = safe_close(master_pty_socket);
3996 kmsg_socket = safe_close(kmsg_socket);
3997 rtnl_socket = safe_close(rtnl_socket);
3998 netns_fd = safe_close(netns_fd);
3999
4000 return 0;
4001 }
4002
4003 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4004 bool tried_hashed = false;
4005 unsigned n_tries = 100;
4006 uid_t candidate;
4007 int r;
4008
4009 assert(shift);
4010 assert(ret_lock_file);
4011 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4012 assert(arg_uid_range == 0x10000U);
4013
4014 candidate = *shift;
4015
4016 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4017
4018 for (;;) {
4019 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4020 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4021
4022 if (--n_tries <= 0)
4023 return -EBUSY;
4024
4025 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4026 goto next;
4027 if ((candidate & UINT32_C(0xFFFF)) != 0)
4028 goto next;
4029
4030 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4031 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4032 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4033 goto next;
4034 if (r < 0)
4035 return r;
4036
4037 /* Make some superficial checks whether the range is currently known in the user database */
4038 if (getpwuid(candidate))
4039 goto next;
4040 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4041 goto next;
4042 if (getgrgid(candidate))
4043 goto next;
4044 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4045 goto next;
4046
4047 *ret_lock_file = lf;
4048 lf = (struct LockFile) LOCK_FILE_INIT;
4049 *shift = candidate;
4050 return 0;
4051
4052 next:
4053 if (arg_machine && !tried_hashed) {
4054 /* Try to hash the base from the container name */
4055
4056 static const uint8_t hash_key[] = {
4057 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4058 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4059 };
4060
4061 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4062
4063 tried_hashed = true;
4064 } else
4065 random_bytes(&candidate, sizeof(candidate));
4066
4067 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4068 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4069 }
4070 }
4071
4072 static int add_one_uid_map(
4073 char **p,
4074 uid_t container_uid,
4075 uid_t host_uid,
4076 uid_t range) {
4077
4078 return strextendf(p,
4079 UID_FMT " " UID_FMT " " UID_FMT "\n",
4080 container_uid, host_uid, range);
4081 }
4082
4083 static int make_uid_map_string(
4084 const uid_t bind_user_uid[],
4085 size_t n_bind_user_uid,
4086 size_t offset,
4087 char **ret) {
4088
4089 _cleanup_free_ char *s = NULL;
4090 uid_t previous_uid = 0;
4091 int r;
4092
4093 assert(n_bind_user_uid == 0 || bind_user_uid);
4094 assert(offset == 0 || offset == 2); /* used to switch between UID and GID map */
4095 assert(ret);
4096
4097 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4098 * quadruplet, consisting of host and container UID + GID. */
4099
4100 for (size_t i = 0; i < n_bind_user_uid; i++) {
4101 uid_t payload_uid = bind_user_uid[i*2+offset],
4102 host_uid = bind_user_uid[i*2+offset+1];
4103
4104 assert(previous_uid <= payload_uid);
4105 assert(payload_uid < arg_uid_range);
4106
4107 /* Add a range to close the gap to previous entry */
4108 if (payload_uid > previous_uid) {
4109 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4110 if (r < 0)
4111 return r;
4112 }
4113
4114 /* Map this specific user */
4115 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4116 if (r < 0)
4117 return r;
4118
4119 previous_uid = payload_uid + 1;
4120 }
4121
4122 /* And add a range to close the gap to finish the range */
4123 if (arg_uid_range > previous_uid) {
4124 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4125 if (r < 0)
4126 return r;
4127 }
4128
4129 assert(s);
4130
4131 *ret = TAKE_PTR(s);
4132 return 0;
4133 }
4134
4135 static int setup_uid_map(
4136 pid_t pid,
4137 const uid_t bind_user_uid[],
4138 size_t n_bind_user_uid) {
4139
4140 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4141 _cleanup_free_ char *s = NULL;
4142 int r;
4143
4144 assert(pid > 1);
4145
4146 /* Build the UID map string */
4147 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4148 return log_oom();
4149
4150 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4151 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4152 if (r < 0)
4153 return log_error_errno(r, "Failed to write UID map: %m");
4154
4155 /* And now build the GID map string */
4156 s = mfree(s);
4157 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4158 return log_oom();
4159
4160 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4161 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4162 if (r < 0)
4163 return log_error_errno(r, "Failed to write GID map: %m");
4164
4165 return 0;
4166 }
4167
4168 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4169 char buf[NOTIFY_BUFFER_MAX+1];
4170 char *p = NULL;
4171 struct iovec iovec = {
4172 .iov_base = buf,
4173 .iov_len = sizeof(buf)-1,
4174 };
4175 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4176 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4177 struct msghdr msghdr = {
4178 .msg_iov = &iovec,
4179 .msg_iovlen = 1,
4180 .msg_control = &control,
4181 .msg_controllen = sizeof(control),
4182 };
4183 struct ucred *ucred;
4184 ssize_t n;
4185 pid_t inner_child_pid;
4186 _cleanup_strv_free_ char **tags = NULL;
4187
4188 assert(userdata);
4189
4190 inner_child_pid = PTR_TO_PID(userdata);
4191
4192 if (revents != EPOLLIN) {
4193 log_warning("Got unexpected poll event for notify fd.");
4194 return 0;
4195 }
4196
4197 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4198 if (IN_SET(n, -EAGAIN, -EINTR))
4199 return 0;
4200 if (n == -EXFULL) {
4201 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4202 return 0;
4203 }
4204 if (n < 0)
4205 return log_warning_errno(n, "Couldn't read notification socket: %m");
4206
4207 cmsg_close_all(&msghdr);
4208
4209 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4210 if (!ucred || ucred->pid != inner_child_pid) {
4211 log_debug("Received notify message without valid credentials. Ignoring.");
4212 return 0;
4213 }
4214
4215 if ((size_t) n >= sizeof(buf)) {
4216 log_warning("Received notify message exceeded maximum size. Ignoring.");
4217 return 0;
4218 }
4219
4220 buf[n] = 0;
4221 tags = strv_split(buf, "\n\r");
4222 if (!tags)
4223 return log_oom();
4224
4225 if (strv_find(tags, "READY=1"))
4226 (void) sd_notifyf(false, "READY=1\n");
4227
4228 p = strv_find_startswith(tags, "STATUS=");
4229 if (p)
4230 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4231
4232 return 0;
4233 }
4234
4235 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4236 int r;
4237
4238 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4239 if (r < 0)
4240 return log_error_errno(r, "Failed to allocate notify event source: %m");
4241
4242 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4243
4244 return 0;
4245 }
4246
4247 static int merge_settings(Settings *settings, const char *path) {
4248 int rl;
4249
4250 assert(settings);
4251 assert(path);
4252
4253 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4254 * that this steals the fields of the Settings* structure, and hence modifies it. */
4255
4256 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4257 settings->start_mode >= 0) {
4258 arg_start_mode = settings->start_mode;
4259 strv_free_and_replace(arg_parameters, settings->parameters);
4260 }
4261
4262 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
4263 arg_ephemeral = settings->ephemeral;
4264
4265 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4266 settings->root) {
4267
4268 if (!arg_settings_trusted)
4269 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4270 else
4271 free_and_replace(arg_directory, settings->root);
4272 }
4273
4274 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4275 settings->pivot_root_new) {
4276 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4277 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4278 }
4279
4280 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4281 settings->working_directory)
4282 free_and_replace(arg_chdir, settings->working_directory);
4283
4284 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4285 settings->environment)
4286 strv_free_and_replace(arg_setenv, settings->environment);
4287
4288 if ((arg_settings_mask & SETTING_USER) == 0) {
4289
4290 if (settings->user)
4291 free_and_replace(arg_user, settings->user);
4292
4293 if (uid_is_valid(settings->uid))
4294 arg_uid = settings->uid;
4295 if (gid_is_valid(settings->gid))
4296 arg_gid = settings->gid;
4297 if (settings->n_supplementary_gids > 0) {
4298 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4299 arg_n_supplementary_gids = settings->n_supplementary_gids;
4300 }
4301 }
4302
4303 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4304 uint64_t plus, minus;
4305 uint64_t network_minus = 0;
4306 uint64_t ambient;
4307
4308 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4309 * Settings structure */
4310
4311 plus = settings->capability;
4312 minus = settings->drop_capability;
4313
4314 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
4315 if (settings_private_network(settings))
4316 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4317 else
4318 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4319 }
4320
4321 if (!arg_settings_trusted && plus != 0) {
4322 if (settings->capability != 0)
4323 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4324 } else {
4325 arg_caps_retain &= ~network_minus;
4326 arg_caps_retain |= plus;
4327 }
4328
4329 arg_caps_retain &= ~minus;
4330
4331 /* Copy the full capabilities over too */
4332 if (capability_quintet_is_set(&settings->full_capabilities)) {
4333 if (!arg_settings_trusted)
4334 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4335 else
4336 arg_full_capabilities = settings->full_capabilities;
4337 }
4338
4339 ambient = settings->ambient_capability;
4340 if (!arg_settings_trusted && ambient != 0)
4341 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4342 else
4343 arg_caps_ambient |= ambient;
4344 }
4345
4346 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4347 settings->kill_signal > 0)
4348 arg_kill_signal = settings->kill_signal;
4349
4350 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4351 settings->personality != PERSONALITY_INVALID)
4352 arg_personality = settings->personality;
4353
4354 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4355 !sd_id128_is_null(settings->machine_id)) {
4356
4357 if (!arg_settings_trusted)
4358 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4359 else
4360 arg_uuid = settings->machine_id;
4361 }
4362
4363 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4364 settings->read_only >= 0)
4365 arg_read_only = settings->read_only;
4366
4367 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4368 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4369 arg_volatile_mode = settings->volatile_mode;
4370
4371 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4372 settings->n_custom_mounts > 0) {
4373
4374 if (!arg_settings_trusted)
4375 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4376 else {
4377 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4378 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4379 arg_n_custom_mounts = settings->n_custom_mounts;
4380 settings->n_custom_mounts = 0;
4381 }
4382 }
4383
4384 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4385 (settings->private_network >= 0 ||
4386 settings->network_veth >= 0 ||
4387 settings->network_bridge ||
4388 settings->network_zone ||
4389 settings->network_interfaces ||
4390 settings->network_macvlan ||
4391 settings->network_ipvlan ||
4392 settings->network_veth_extra ||
4393 settings->network_namespace_path)) {
4394
4395 if (!arg_settings_trusted)
4396 log_warning("Ignoring network settings, file %s is not trusted.", path);
4397 else {
4398 arg_network_veth = settings_network_veth(settings);
4399 arg_private_network = settings_private_network(settings);
4400
4401 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4402 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4403 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4404 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4405
4406 free_and_replace(arg_network_bridge, settings->network_bridge);
4407 free_and_replace(arg_network_zone, settings->network_zone);
4408
4409 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4410 }
4411 }
4412
4413 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4414 settings->expose_ports) {
4415
4416 if (!arg_settings_trusted)
4417 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4418 else {
4419 expose_port_free_all(arg_expose_ports);
4420 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4421 }
4422 }
4423
4424 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4425 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4426
4427 if (!arg_settings_trusted)
4428 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4429 else {
4430 arg_userns_mode = settings->userns_mode;
4431 arg_uid_shift = settings->uid_shift;
4432 arg_uid_range = settings->uid_range;
4433 arg_userns_ownership = settings->userns_ownership;
4434 }
4435 }
4436
4437 if ((arg_settings_mask & SETTING_BIND_USER) == 0)
4438 strv_free_and_replace(arg_bind_user, settings->bind_user);
4439
4440 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
4441 arg_notify_ready = settings->notify_ready;
4442
4443 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4444
4445 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4446 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4447 else {
4448 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4449 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4450 }
4451
4452 #if HAVE_SECCOMP
4453 if (!arg_settings_trusted && settings->seccomp)
4454 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4455 else {
4456 seccomp_release(arg_seccomp);
4457 arg_seccomp = TAKE_PTR(settings->seccomp);
4458 }
4459 #endif
4460 }
4461
4462 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
4463 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4464 continue;
4465
4466 if (!settings->rlimit[rl])
4467 continue;
4468
4469 if (!arg_settings_trusted) {
4470 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4471 continue;
4472 }
4473
4474 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4475 }
4476
4477 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4478 settings->hostname)
4479 free_and_replace(arg_hostname, settings->hostname);
4480
4481 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4482 settings->no_new_privileges >= 0)
4483 arg_no_new_privileges = settings->no_new_privileges;
4484
4485 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4486 settings->oom_score_adjust_set) {
4487
4488 if (!arg_settings_trusted)
4489 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4490 else {
4491 arg_oom_score_adjust = settings->oom_score_adjust;
4492 arg_oom_score_adjust_set = true;
4493 }
4494 }
4495
4496 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4497 settings->cpu_set.set) {
4498
4499 if (!arg_settings_trusted)
4500 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4501 else {
4502 cpu_set_reset(&arg_cpu_set);
4503 arg_cpu_set = settings->cpu_set;
4504 settings->cpu_set = (CPUSet) {};
4505 }
4506 }
4507
4508 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4509 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4510 arg_resolv_conf = settings->resolv_conf;
4511
4512 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4513 settings->link_journal != _LINK_JOURNAL_INVALID) {
4514
4515 if (!arg_settings_trusted)
4516 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4517 else {
4518 arg_link_journal = settings->link_journal;
4519 arg_link_journal_try = settings->link_journal_try;
4520 }
4521 }
4522
4523 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4524 settings->timezone != _TIMEZONE_MODE_INVALID)
4525 arg_timezone = settings->timezone;
4526
4527 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4528 settings->slice) {
4529
4530 if (!arg_settings_trusted)
4531 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4532 else
4533 free_and_replace(arg_slice, settings->slice);
4534 }
4535
4536 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4537 settings->use_cgns >= 0) {
4538
4539 if (!arg_settings_trusted)
4540 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4541 else
4542 arg_use_cgns = settings->use_cgns;
4543 }
4544
4545 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4546 settings->clone_ns_flags != ULONG_MAX) {
4547
4548 if (!arg_settings_trusted)
4549 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4550 else
4551 arg_clone_ns_flags = settings->clone_ns_flags;
4552 }
4553
4554 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4555 settings->console_mode >= 0) {
4556
4557 if (!arg_settings_trusted)
4558 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4559 else
4560 arg_console_mode = settings->console_mode;
4561 }
4562
4563 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4564 * don't consult arg_settings_mask for them. */
4565
4566 sd_bus_message_unref(arg_property_message);
4567 arg_property_message = TAKE_PTR(settings->properties);
4568
4569 arg_console_width = settings->console_width;
4570 arg_console_height = settings->console_height;
4571
4572 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4573 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4574 arg_n_extra_nodes = settings->n_extra_nodes;
4575
4576 return 0;
4577 }
4578
4579 static int load_settings(void) {
4580 _cleanup_(settings_freep) Settings *settings = NULL;
4581 _cleanup_fclose_ FILE *f = NULL;
4582 _cleanup_free_ char *p = NULL;
4583 const char *fn, *i;
4584 int r;
4585
4586 if (arg_oci_bundle)
4587 return 0;
4588
4589 /* If all settings are masked, there's no point in looking for
4590 * the settings file */
4591 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4592 return 0;
4593
4594 fn = strjoina(arg_machine, ".nspawn");
4595
4596 /* We first look in the admin's directories in /etc and /run */
4597 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4598 _cleanup_free_ char *j = NULL;
4599
4600 j = path_join(i, fn);
4601 if (!j)
4602 return log_oom();
4603
4604 f = fopen(j, "re");
4605 if (f) {
4606 p = TAKE_PTR(j);
4607
4608 /* By default, we trust configuration from /etc and /run */
4609 if (arg_settings_trusted < 0)
4610 arg_settings_trusted = true;
4611
4612 break;
4613 }
4614
4615 if (errno != ENOENT)
4616 return log_error_errno(errno, "Failed to open %s: %m", j);
4617 }
4618
4619 if (!f) {
4620 /* After that, let's look for a file next to the
4621 * actual image we shall boot. */
4622
4623 if (arg_image) {
4624 p = file_in_same_dir(arg_image, fn);
4625 if (!p)
4626 return log_oom();
4627 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4628 p = file_in_same_dir(arg_directory, fn);
4629 if (!p)
4630 return log_oom();
4631 }
4632
4633 if (p) {
4634 f = fopen(p, "re");
4635 if (!f && errno != ENOENT)
4636 return log_error_errno(errno, "Failed to open %s: %m", p);
4637
4638 /* By default, we do not trust configuration from /var/lib/machines */
4639 if (arg_settings_trusted < 0)
4640 arg_settings_trusted = false;
4641 }
4642 }
4643
4644 if (!f)
4645 return 0;
4646
4647 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4648
4649 r = settings_load(f, p, &settings);
4650 if (r < 0)
4651 return r;
4652
4653 return merge_settings(settings, p);
4654 }
4655
4656 static int load_oci_bundle(void) {
4657 _cleanup_(settings_freep) Settings *settings = NULL;
4658 int r;
4659
4660 if (!arg_oci_bundle)
4661 return 0;
4662
4663 /* By default let's trust OCI bundles */
4664 if (arg_settings_trusted < 0)
4665 arg_settings_trusted = true;
4666
4667 r = oci_load(NULL, arg_oci_bundle, &settings);
4668 if (r < 0)
4669 return r;
4670
4671 return merge_settings(settings, arg_oci_bundle);
4672 }
4673
4674 static int run_container(
4675 DissectedImage *dissected_image,
4676 bool secondary,
4677 FDSet *fds,
4678 char veth_name[IFNAMSIZ], bool *veth_created,
4679 struct ExposeArgs *expose_args,
4680 int *master, pid_t *pid, int *ret) {
4681
4682 static const struct sigaction sa = {
4683 .sa_handler = nop_signal_handler,
4684 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4685 };
4686
4687 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4688 _cleanup_close_ int etc_passwd_lock = -1;
4689 _cleanup_close_pair_ int
4690 kmsg_socket_pair[2] = { -1, -1 },
4691 rtnl_socket_pair[2] = { -1, -1 },
4692 pid_socket_pair[2] = { -1, -1 },
4693 uuid_socket_pair[2] = { -1, -1 },
4694 notify_socket_pair[2] = { -1, -1 },
4695 uid_shift_socket_pair[2] = { -1, -1 },
4696 master_pty_socket_pair[2] = { -1, -1 },
4697 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4698
4699 _cleanup_close_ int notify_socket = -1;
4700 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4701 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4702 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4703 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4704 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4705 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4706 _cleanup_free_ uid_t *bind_user_uid = NULL;
4707 size_t n_bind_user_uid = 0;
4708 ContainerStatus container_status = 0;
4709 int ifi = 0, r;
4710 ssize_t l;
4711 sigset_t mask_chld;
4712 _cleanup_close_ int child_netns_fd = -1;
4713
4714 assert_se(sigemptyset(&mask_chld) == 0);
4715 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4716
4717 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4718 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4719 * check with getpwuid() if the specific user already exists. Note that /etc might be
4720 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4721 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4722 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4723 * really ours. */
4724
4725 etc_passwd_lock = take_etc_passwd_lock(NULL);
4726 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4727 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4728 }
4729
4730 r = barrier_create(&barrier);
4731 if (r < 0)
4732 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4733
4734 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4735 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4736
4737 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4738 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4739
4740 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4741 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4742
4743 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4744 return log_error_errno(errno, "Failed to create id socket pair: %m");
4745
4746 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4747 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4748
4749 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4750 return log_error_errno(errno, "Failed to create console socket pair: %m");
4751
4752 if (arg_userns_mode != USER_NAMESPACE_NO)
4753 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4754 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4755
4756 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4757 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4758 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4759
4760 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4761 * parent's blocking calls and give it a chance to call wait() and terminate. */
4762 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4763 if (r < 0)
4764 return log_error_errno(errno, "Failed to change the signal mask: %m");
4765
4766 r = sigaction(SIGCHLD, &sa, NULL);
4767 if (r < 0)
4768 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4769
4770 if (arg_network_namespace_path) {
4771 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4772 if (child_netns_fd < 0)
4773 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4774
4775 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4776 if (r == -EUCLEAN)
4777 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4778 else if (r < 0)
4779 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4780 else if (r == 0)
4781 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4782 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4783 }
4784
4785 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4786 if (*pid < 0)
4787 return log_error_errno(errno, "clone() failed%s: %m",
4788 errno == EINVAL ?
4789 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4790
4791 if (*pid == 0) {
4792 /* The outer child only has a file system namespace. */
4793 barrier_set_role(&barrier, BARRIER_CHILD);
4794
4795 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4796 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4797 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4798 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4799 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4800 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4801 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4802 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4803
4804 (void) reset_all_signal_handlers();
4805 (void) reset_signal_mask();
4806
4807 r = outer_child(&barrier,
4808 arg_directory,
4809 dissected_image,
4810 secondary,
4811 pid_socket_pair[1],
4812 uuid_socket_pair[1],
4813 notify_socket_pair[1],
4814 kmsg_socket_pair[1],
4815 rtnl_socket_pair[1],
4816 uid_shift_socket_pair[1],
4817 master_pty_socket_pair[1],
4818 unified_cgroup_hierarchy_socket_pair[1],
4819 fds,
4820 child_netns_fd);
4821 if (r < 0)
4822 _exit(EXIT_FAILURE);
4823
4824 _exit(EXIT_SUCCESS);
4825 }
4826
4827 barrier_set_role(&barrier, BARRIER_PARENT);
4828
4829 fdset_close(fds);
4830
4831 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4832 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4833 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4834 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4835 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4836 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4837 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4838 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4839
4840 if (arg_userns_mode != USER_NAMESPACE_NO) {
4841 /* The child just let us know the UID shift it might have read from the image. */
4842 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4843 if (l < 0)
4844 return log_error_errno(errno, "Failed to read UID shift: %m");
4845 if (l != sizeof arg_uid_shift)
4846 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4847
4848 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4849 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4850 * image, but if that's already in use, pick a new one, and report back to the child,
4851 * which one we now picked. */
4852
4853 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4854 if (r < 0)
4855 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4856
4857 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4858 if (l < 0)
4859 return log_error_errno(errno, "Failed to send UID shift: %m");
4860 if (l != sizeof arg_uid_shift)
4861 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4862 }
4863
4864 n_bind_user_uid = strv_length(arg_bind_user);
4865 if (n_bind_user_uid > 0) {
4866 /* Right after the UID shift, we'll receive the list of UID mappings for the
4867 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4868
4869 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4870 if (!bind_user_uid)
4871 return log_oom();
4872
4873 for (size_t i = 0; i < n_bind_user_uid; i++) {
4874 l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4875 if (l < 0)
4876 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4877 if (l != sizeof(uid_t)*4)
4878 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4879 SYNTHETIC_ERRNO(EIO),
4880 "Short read while reading bind user UID pairs.");
4881 }
4882 }
4883 }
4884
4885 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4886 /* The child let us know the support cgroup mode it might have read from the image. */
4887 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4888 if (l < 0)
4889 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4890 if (l != sizeof(arg_unified_cgroup_hierarchy))
4891 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4892 l, l == 0 ? " The child is most likely dead." : "");
4893 }
4894
4895 /* Wait for the outer child. */
4896 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4897 if (r < 0)
4898 return r;
4899 if (r != EXIT_SUCCESS)
4900 return -EIO;
4901
4902 /* And now retrieve the PID of the inner child. */
4903 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4904 if (l < 0)
4905 return log_error_errno(errno, "Failed to read inner child PID: %m");
4906 if (l != sizeof *pid)
4907 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4908
4909 /* We also retrieve container UUID in case it was generated by outer child */
4910 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4911 if (l < 0)
4912 return log_error_errno(errno, "Failed to read container machine ID: %m");
4913 if (l != sizeof(arg_uuid))
4914 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4915
4916 /* We also retrieve the socket used for notifications generated by outer child */
4917 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4918 if (notify_socket < 0)
4919 return log_error_errno(notify_socket,
4920 "Failed to receive notification socket from the outer child: %m");
4921
4922 log_debug("Init process invoked as PID "PID_FMT, *pid);
4923
4924 if (arg_userns_mode != USER_NAMESPACE_NO) {
4925 if (!barrier_place_and_sync(&barrier)) /* #1 */
4926 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4927
4928 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4929 if (r < 0)
4930 return r;
4931
4932 (void) barrier_place(&barrier); /* #2 */
4933 }
4934
4935 if (arg_private_network) {
4936 if (!arg_network_namespace_path) {
4937 /* Wait until the child has unshared its network namespace. */
4938 if (!barrier_place_and_sync(&barrier)) /* #3 */
4939 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4940 }
4941
4942 if (child_netns_fd < 0) {
4943 /* Make sure we have an open file descriptor to the child's network
4944 * namespace so it stays alive even if the child exits. */
4945 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4946 if (r < 0)
4947 return log_error_errno(r, "Failed to open child network namespace: %m");
4948 }
4949
4950 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4951 if (r < 0)
4952 return r;
4953
4954 if (arg_network_veth) {
4955 r = setup_veth(arg_machine, *pid, veth_name,
4956 arg_network_bridge || arg_network_zone);
4957 if (r < 0)
4958 return r;
4959 else if (r > 0)
4960 ifi = r;
4961
4962 if (arg_network_bridge) {
4963 /* Add the interface to a bridge */
4964 r = setup_bridge(veth_name, arg_network_bridge, false);
4965 if (r < 0)
4966 return r;
4967 if (r > 0)
4968 ifi = r;
4969 } else if (arg_network_zone) {
4970 /* Add the interface to a bridge, possibly creating it */
4971 r = setup_bridge(veth_name, arg_network_zone, true);
4972 if (r < 0)
4973 return r;
4974 if (r > 0)
4975 ifi = r;
4976 }
4977 }
4978
4979 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4980 if (r < 0)
4981 return r;
4982
4983 /* We created the primary and extra veth links now; let's remember this, so that we know to
4984 remove them later on. Note that we don't bother with removing veth links that were created
4985 here when their setup failed half-way, because in that case the kernel should be able to
4986 remove them on its own, since they cannot be referenced by anything yet. */
4987 *veth_created = true;
4988
4989 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4990 if (r < 0)
4991 return r;
4992
4993 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4994 if (r < 0)
4995 return r;
4996 }
4997
4998 if (arg_register || !arg_keep_unit) {
4999 r = sd_bus_default_system(&bus);
5000 if (r < 0)
5001 return log_error_errno(r, "Failed to open system bus: %m");
5002
5003 r = sd_bus_set_close_on_exit(bus, false);
5004 if (r < 0)
5005 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5006 }
5007
5008 if (!arg_keep_unit) {
5009 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5010 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5011 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5012
5013 r = sd_bus_match_signal_async(
5014 bus,
5015 NULL,
5016 "org.freedesktop.systemd1",
5017 NULL,
5018 "org.freedesktop.systemd1.Scope",
5019 "RequestStop",
5020 on_request_stop, NULL, PID_TO_PTR(*pid));
5021 if (r < 0)
5022 return log_error_errno(r, "Failed to request RequestStop match: %m");
5023 }
5024
5025 if (arg_register) {
5026 r = register_machine(
5027 bus,
5028 arg_machine,
5029 *pid,
5030 arg_directory,
5031 arg_uuid,
5032 ifi,
5033 arg_slice,
5034 arg_custom_mounts, arg_n_custom_mounts,
5035 arg_kill_signal,
5036 arg_property,
5037 arg_property_message,
5038 arg_keep_unit,
5039 arg_container_service_name);
5040 if (r < 0)
5041 return r;
5042
5043 } else if (!arg_keep_unit) {
5044 r = allocate_scope(
5045 bus,
5046 arg_machine,
5047 *pid,
5048 arg_slice,
5049 arg_custom_mounts, arg_n_custom_mounts,
5050 arg_kill_signal,
5051 arg_property,
5052 arg_property_message);
5053 if (r < 0)
5054 return r;
5055
5056 } else if (arg_slice || arg_property)
5057 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5058
5059 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5060 if (r < 0)
5061 return r;
5062
5063 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5064 if (r < 0)
5065 return r;
5066
5067 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5068 if (r < 0)
5069 return r;
5070
5071 /* Notify the child that the parent is ready with all
5072 * its setup (including cgroup-ification), and that
5073 * the child can now hand over control to the code to
5074 * run inside the container. */
5075 (void) barrier_place(&barrier); /* #4 */
5076
5077 /* Block SIGCHLD here, before notifying child.
5078 * process_pty() will handle it with the other signals. */
5079 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5080
5081 /* Reset signal to default */
5082 r = default_signals(SIGCHLD);
5083 if (r < 0)
5084 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5085
5086 r = sd_event_new(&event);
5087 if (r < 0)
5088 return log_error_errno(r, "Failed to get default event source: %m");
5089
5090 (void) sd_event_set_watchdog(event, true);
5091
5092 if (bus) {
5093 r = sd_bus_attach_event(bus, event, 0);
5094 if (r < 0)
5095 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5096 }
5097
5098 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5099 if (r < 0)
5100 return r;
5101
5102 /* Let the child know that we are ready and wait that the child is completely ready now. */
5103 if (!barrier_place_and_sync(&barrier)) /* #5 */
5104 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5105
5106 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5107 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5108 etc_passwd_lock = safe_close(etc_passwd_lock);
5109
5110 (void) sd_notifyf(false,
5111 "STATUS=Container running.\n"
5112 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5113 if (!arg_notify_ready)
5114 (void) sd_notify(false, "READY=1\n");
5115
5116 if (arg_kill_signal > 0) {
5117 /* Try to kill the init system on SIGINT or SIGTERM */
5118 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5119 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5120 } else {
5121 /* Immediately exit */
5122 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5123 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5124 }
5125
5126 /* Exit when the child exits */
5127 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5128
5129 if (arg_expose_ports) {
5130 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
5131 if (r < 0)
5132 return r;
5133
5134 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5135 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5136 }
5137
5138 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5139
5140 if (arg_console_mode != CONSOLE_PIPE) {
5141 _cleanup_close_ int fd = -1;
5142 PTYForwardFlags flags = 0;
5143
5144 /* Retrieve the master pty allocated by inner child */
5145 fd = receive_one_fd(master_pty_socket_pair[0], 0);
5146 if (fd < 0)
5147 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5148
5149 switch (arg_console_mode) {
5150
5151 case CONSOLE_READ_ONLY:
5152 flags |= PTY_FORWARD_READ_ONLY;
5153
5154 _fallthrough_;
5155
5156 case CONSOLE_INTERACTIVE:
5157 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5158
5159 r = pty_forward_new(event, fd, flags, &forward);
5160 if (r < 0)
5161 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5162
5163 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5164 (void) pty_forward_set_width_height(forward,
5165 arg_console_width,
5166 arg_console_height);
5167 break;
5168
5169 default:
5170 assert(arg_console_mode == CONSOLE_PASSIVE);
5171 }
5172
5173 *master = TAKE_FD(fd);
5174 }
5175
5176 r = sd_event_loop(event);
5177 if (r < 0)
5178 return log_error_errno(r, "Failed to run event loop: %m");
5179
5180 if (forward) {
5181 char last_char = 0;
5182
5183 (void) pty_forward_get_last_char(forward, &last_char);
5184 forward = pty_forward_free(forward);
5185
5186 if (!arg_quiet && last_char != '\n')
5187 putc('\n', stdout);
5188 }
5189
5190 /* Kill if it is not dead yet anyway */
5191 if (!arg_register && !arg_keep_unit && bus)
5192 terminate_scope(bus, arg_machine);
5193
5194 /* Normally redundant, but better safe than sorry */
5195 (void) kill(*pid, SIGKILL);
5196
5197 if (arg_private_network) {
5198 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5199 * to avoid having to move the parent to the child network namespace. */
5200 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5201 if (r < 0)
5202 return r;
5203
5204 if (r == 0) {
5205 _cleanup_close_ int parent_netns_fd = -1;
5206
5207 r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5208 if (r < 0) {
5209 log_error_errno(r, "Failed to open parent network namespace: %m");
5210 _exit(EXIT_FAILURE);
5211 }
5212
5213 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5214 if (r < 0) {
5215 log_error_errno(r, "Failed to enter child network namespace: %m");
5216 _exit(EXIT_FAILURE);
5217 }
5218
5219 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5220 if (r < 0)
5221 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5222
5223 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5224 }
5225 }
5226
5227 r = wait_for_container(*pid, &container_status);
5228 *pid = 0;
5229
5230 /* Tell machined that we are gone. */
5231 if (bus)
5232 (void) unregister_machine(bus, arg_machine);
5233
5234 if (r < 0)
5235 /* We failed to wait for the container, or the container exited abnormally. */
5236 return r;
5237 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5238 /* r > 0 → The container exited with a non-zero status.
5239 * As a special case, we need to replace 133 with a different value,
5240 * because 133 is special-cased in the service file to reboot the container.
5241 * otherwise → The container exited with zero status and a reboot was not requested.
5242 */
5243 if (r == EXIT_FORCE_RESTART)
5244 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5245 *ret = r;
5246 return 0; /* finito */
5247 }
5248
5249 /* CONTAINER_REBOOTED, loop again */
5250
5251 if (arg_keep_unit) {
5252 /* Special handling if we are running as a service: instead of simply
5253 * restarting the machine we want to restart the entire service, so let's
5254 * inform systemd about this with the special exit code 133. The service
5255 * file uses RestartForceExitStatus=133 so that this results in a full
5256 * nspawn restart. This is necessary since we might have cgroup parameters
5257 * set we want to have flushed out. */
5258 *ret = EXIT_FORCE_RESTART;
5259 return 0; /* finito */
5260 }
5261
5262 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5263 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5264
5265 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5266 *veth_created = false;
5267 return 1; /* loop again */
5268 }
5269
5270 static int initialize_rlimits(void) {
5271 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
5272 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5273 * container execution environments. */
5274
5275 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5276 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5277 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5278 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5279 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5280 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5281 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5282 [RLIMIT_MEMLOCK] = { 65536, 65536 },
5283 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5284 [RLIMIT_NICE] = { 0, 0 },
5285 [RLIMIT_NOFILE] = { 1024, 4096 },
5286 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5287 [RLIMIT_RTPRIO] = { 0, 0 },
5288 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5289 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5290
5291 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5292 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5293 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5294 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5295 * that PID 1 changes a number of other resource limits during early initialization which is why we
5296 * don't read the other limits from PID 1 but prefer the static table above. */
5297 };
5298
5299 int rl;
5300
5301 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5302 /* Let's only fill in what the user hasn't explicitly configured anyway */
5303 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5304 const struct rlimit *v;
5305 struct rlimit buffer;
5306
5307 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5308 /* For these two let's read the limits off PID 1. See above for an explanation. */
5309
5310 if (prlimit(1, rl, NULL, &buffer) < 0)
5311 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5312
5313 v = &buffer;
5314 } else
5315 v = kernel_defaults + rl;
5316
5317 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5318 if (!arg_rlimit[rl])
5319 return log_oom();
5320 }
5321
5322 if (DEBUG_LOGGING) {
5323 _cleanup_free_ char *k = NULL;
5324
5325 (void) rlimit_format(arg_rlimit[rl], &k);
5326 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5327 }
5328 }
5329
5330 return 0;
5331 }
5332
5333 static int cant_be_in_netns(void) {
5334 union sockaddr_union sa = {
5335 .un = {
5336 .sun_family = AF_UNIX,
5337 .sun_path = "/run/udev/control",
5338 },
5339 };
5340 char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5341 _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5342 _cleanup_close_ int fd = -1;
5343 struct ucred ucred;
5344 int r;
5345
5346 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5347 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5348 * nice message. */
5349
5350 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5351 return 0;
5352
5353 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5354 if (fd < 0)
5355 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5356
5357 if (connect(fd, &sa.un, SOCKADDR_UN_LEN(sa.un)) < 0) {
5358
5359 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5360 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5361 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5362
5363 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5364 }
5365
5366 r = getpeercred(fd, &ucred);
5367 if (r < 0)
5368 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5369
5370 xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5371 r = readlink_malloc(udev_path, &udev_ns);
5372 if (r < 0)
5373 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5374
5375 r = readlink_malloc("/proc/self/ns/net", &our_ns);
5376 if (r < 0)
5377 return log_error_errno(r, "Failed to read our own network namespace: %m");
5378
5379 if (!streq(our_ns, udev_ns))
5380 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5381 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5382 return 0;
5383 }
5384
5385 static int run(int argc, char *argv[]) {
5386 bool secondary = false, remove_directory = false, remove_image = false,
5387 veth_created = false, remove_tmprootdir = false;
5388 _cleanup_close_ int master = -1;
5389 _cleanup_fdset_free_ FDSet *fds = NULL;
5390 int r, n_fd_passed, ret = EXIT_SUCCESS;
5391 char veth_name[IFNAMSIZ] = "";
5392 struct ExposeArgs expose_args = {};
5393 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5394 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5395 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5396 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5397 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5398 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5399 pid_t pid = 0;
5400
5401 log_parse_environment();
5402 log_open();
5403
5404 r = parse_argv(argc, argv);
5405 if (r <= 0)
5406 goto finish;
5407
5408 if (geteuid() != 0) {
5409 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5410 argc >= 2 ? "Need to be root." :
5411 "Need to be root (and some arguments are usually required).\nHint: try --help");
5412 goto finish;
5413 }
5414
5415 r = cant_be_in_netns();
5416 if (r < 0)
5417 goto finish;
5418
5419 r = initialize_rlimits();
5420 if (r < 0)
5421 goto finish;
5422
5423 r = load_oci_bundle();
5424 if (r < 0)
5425 goto finish;
5426
5427 r = determine_names();
5428 if (r < 0)
5429 goto finish;
5430
5431 r = load_settings();
5432 if (r < 0)
5433 goto finish;
5434
5435 r = cg_unified();
5436 if (r < 0) {
5437 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5438 goto finish;
5439 }
5440
5441 r = verify_arguments();
5442 if (r < 0)
5443 goto finish;
5444
5445 /* Reapply environment settings. */
5446 (void) detect_unified_cgroup_hierarchy_from_environment();
5447
5448 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5449 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5450 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5451 (void) ignore_signals(SIGPIPE);
5452
5453 n_fd_passed = sd_listen_fds(false);
5454 if (n_fd_passed > 0) {
5455 r = fdset_new_listen_fds(&fds, false);
5456 if (r < 0) {
5457 log_error_errno(r, "Failed to collect file descriptors: %m");
5458 goto finish;
5459 }
5460 }
5461
5462 /* The "default" umask. This is appropriate for most file and directory
5463 * operations performed by nspawn, and is the umask that will be used for
5464 * the child. Functions like copy_devnodes() change the umask temporarily. */
5465 umask(0022);
5466
5467 if (arg_directory) {
5468 assert(!arg_image);
5469
5470 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5471 * /var from the host will propagate into container dynamically (because bad things happen if
5472 * two systems write to the same /var). Let's allow it for the special cases where /var is
5473 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5474 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5475 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5476 r = -EINVAL;
5477 goto finish;
5478 }
5479
5480 if (arg_ephemeral) {
5481 _cleanup_free_ char *np = NULL;
5482
5483 r = chase_symlinks_and_update(&arg_directory, 0);
5484 if (r < 0)
5485 goto finish;
5486
5487 /* If the specified path is a mount point we generate the new snapshot immediately
5488 * inside it under a random name. However if the specified is not a mount point we
5489 * create the new snapshot in the parent directory, just next to it. */
5490 r = path_is_mount_point(arg_directory, NULL, 0);
5491 if (r < 0) {
5492 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5493 goto finish;
5494 }
5495 if (r > 0)
5496 r = tempfn_random_child(arg_directory, "machine.", &np);
5497 else
5498 r = tempfn_random(arg_directory, "machine.", &np);
5499 if (r < 0) {
5500 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5501 goto finish;
5502 }
5503
5504 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5505 * only owned by us and no one else. */
5506 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5507 if (r < 0) {
5508 log_error_errno(r, "Failed to lock %s: %m", np);
5509 goto finish;
5510 }
5511
5512 {
5513 BLOCK_SIGNALS(SIGINT);
5514 r = btrfs_subvol_snapshot(arg_directory, np,
5515 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5516 BTRFS_SNAPSHOT_FALLBACK_COPY |
5517 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5518 BTRFS_SNAPSHOT_RECURSIVE |
5519 BTRFS_SNAPSHOT_QUOTA |
5520 BTRFS_SNAPSHOT_SIGINT);
5521 }
5522 if (r == -EINTR) {
5523 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5524 goto finish;
5525 }
5526 if (r < 0) {
5527 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5528 goto finish;
5529 }
5530
5531 free_and_replace(arg_directory, np);
5532 remove_directory = true;
5533 } else {
5534 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5535 if (r < 0)
5536 goto finish;
5537
5538 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5539 if (r == -EBUSY) {
5540 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5541 goto finish;
5542 }
5543 if (r < 0) {
5544 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5545 goto finish;
5546 }
5547
5548 if (arg_template) {
5549 r = chase_symlinks_and_update(&arg_template, 0);
5550 if (r < 0)
5551 goto finish;
5552
5553 {
5554 BLOCK_SIGNALS(SIGINT);
5555 r = btrfs_subvol_snapshot(arg_template, arg_directory,
5556 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5557 BTRFS_SNAPSHOT_FALLBACK_COPY |
5558 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5559 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5560 BTRFS_SNAPSHOT_RECURSIVE |
5561 BTRFS_SNAPSHOT_QUOTA |
5562 BTRFS_SNAPSHOT_SIGINT);
5563 }
5564 if (r == -EEXIST)
5565 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5566 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5567 else if (r == -EINTR) {
5568 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5569 goto finish;
5570 } else if (r < 0) {
5571 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5572 goto finish;
5573 } else
5574 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5575 "Populated %s from template %s.", arg_directory, arg_template);
5576 }
5577 }
5578
5579 if (arg_start_mode == START_BOOT) {
5580 const char *p;
5581
5582 if (arg_pivot_root_new)
5583 p = prefix_roota(arg_directory, arg_pivot_root_new);
5584 else
5585 p = arg_directory;
5586
5587 if (path_is_os_tree(p) <= 0) {
5588 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5589 r = -EINVAL;
5590 goto finish;
5591 }
5592 } else {
5593 const char *p, *q;
5594
5595 if (arg_pivot_root_new)
5596 p = prefix_roota(arg_directory, arg_pivot_root_new);
5597 else
5598 p = arg_directory;
5599
5600 q = strjoina(p, "/usr/");
5601
5602 if (laccess(q, F_OK) < 0) {
5603 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
5604 r = -EINVAL;
5605 goto finish;
5606 }
5607 }
5608
5609 } else {
5610 DissectImageFlags dissect_image_flags =
5611 DISSECT_IMAGE_GENERIC_ROOT |
5612 DISSECT_IMAGE_REQUIRE_ROOT |
5613 DISSECT_IMAGE_RELAX_VAR_CHECK |
5614 DISSECT_IMAGE_USR_NO_ROOT;
5615 assert(arg_image);
5616 assert(!arg_template);
5617
5618 r = chase_symlinks_and_update(&arg_image, 0);
5619 if (r < 0)
5620 goto finish;
5621
5622 if (arg_ephemeral) {
5623 _cleanup_free_ char *np = NULL;
5624
5625 r = tempfn_random(arg_image, "machine.", &np);
5626 if (r < 0) {
5627 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5628 goto finish;
5629 }
5630
5631 /* Always take an exclusive lock on our own ephemeral copy. */
5632 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5633 if (r < 0) {
5634 r = log_error_errno(r, "Failed to create image lock: %m");
5635 goto finish;
5636 }
5637
5638 {
5639 BLOCK_SIGNALS(SIGINT);
5640 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5641 }
5642 if (r == -EINTR) {
5643 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5644 goto finish;
5645 }
5646 if (r < 0) {
5647 r = log_error_errno(r, "Failed to copy image file: %m");
5648 goto finish;
5649 }
5650
5651 free_and_replace(arg_image, np);
5652 remove_image = true;
5653 } else {
5654 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5655 if (r == -EBUSY) {
5656 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5657 goto finish;
5658 }
5659 if (r < 0) {
5660 r = log_error_errno(r, "Failed to create image lock: %m");
5661 goto finish;
5662 }
5663
5664 r = verity_settings_load(
5665 &arg_verity_settings,
5666 arg_image, NULL, NULL);
5667 if (r < 0) {
5668 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5669 goto finish;
5670 }
5671
5672 if (arg_verity_settings.data_path)
5673 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5674 }
5675
5676 if (!mkdtemp(tmprootdir)) {
5677 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5678 goto finish;
5679 }
5680
5681 remove_tmprootdir = true;
5682
5683 arg_directory = strdup(tmprootdir);
5684 if (!arg_directory) {
5685 r = log_oom();
5686 goto finish;
5687 }
5688
5689 r = loop_device_make_by_path(
5690 arg_image,
5691 arg_read_only ? O_RDONLY : O_RDWR,
5692 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5693 &loop);
5694 if (r < 0) {
5695 log_error_errno(r, "Failed to set up loopback block device: %m");
5696 goto finish;
5697 }
5698
5699 r = dissect_image_and_warn(
5700 loop->fd,
5701 arg_image,
5702 &arg_verity_settings,
5703 NULL,
5704 loop->uevent_seqnum_not_before,
5705 loop->timestamp_not_before,
5706 dissect_image_flags,
5707 &dissected_image);
5708 if (r == -ENOPKG) {
5709 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
5710 log_notice("Note that the disk image needs to\n"
5711 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5712 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5713 " c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
5714 " d) or contain a file system without a partition table\n"
5715 "in order to be bootable with systemd-nspawn.");
5716 goto finish;
5717 }
5718 if (r < 0)
5719 goto finish;
5720
5721 if (!arg_verity_settings.root_hash && dissected_image->can_verity)
5722 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5723
5724 r = dissected_image_decrypt_interactively(
5725 dissected_image,
5726 NULL,
5727 &arg_verity_settings,
5728 0,
5729 &decrypted_image);
5730 if (r < 0)
5731 goto finish;
5732
5733 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5734 if (remove_image && unlink(arg_image) >= 0)
5735 remove_image = false;
5736 }
5737
5738 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5739 if (r < 0)
5740 goto finish;
5741
5742 if (arg_console_mode < 0)
5743 arg_console_mode =
5744 isatty(STDIN_FILENO) > 0 &&
5745 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5746
5747 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5748 arg_quiet = true;
5749
5750 if (!arg_quiet)
5751 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5752 arg_machine, arg_image ?: arg_directory);
5753
5754 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5755
5756 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5757 r = log_error_errno(errno, "Failed to become subreaper: %m");
5758 goto finish;
5759 }
5760
5761 if (arg_expose_ports) {
5762 r = fw_ctx_new(&fw_ctx);
5763 if (r < 0) {
5764 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5765 goto finish;
5766 }
5767 expose_args.fw_ctx = fw_ctx;
5768 }
5769 for (;;) {
5770 r = run_container(dissected_image,
5771 secondary,
5772 fds,
5773 veth_name, &veth_created,
5774 &expose_args, &master,
5775 &pid, &ret);
5776 if (r <= 0)
5777 break;
5778 }
5779
5780 finish:
5781 (void) sd_notify(false,
5782 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5783 "STOPPING=1\nSTATUS=Terminating...");
5784
5785 if (pid > 0)
5786 (void) kill(pid, SIGKILL);
5787
5788 /* Try to flush whatever is still queued in the pty */
5789 if (master >= 0) {
5790 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5791 master = safe_close(master);
5792 }
5793
5794 if (pid > 0)
5795 (void) wait_for_terminate(pid, NULL);
5796
5797 pager_close();
5798
5799 if (remove_directory && arg_directory) {
5800 int k;
5801
5802 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5803 if (k < 0)
5804 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5805 }
5806
5807 if (remove_image && arg_image) {
5808 if (unlink(arg_image) < 0)
5809 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5810 }
5811
5812 if (remove_tmprootdir) {
5813 if (rmdir(tmprootdir) < 0)
5814 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5815 }
5816
5817 if (arg_machine) {
5818 const char *p;
5819
5820 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5821 (void) rm_rf(p, REMOVE_ROOT);
5822 }
5823
5824 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5825 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5826
5827 if (veth_created)
5828 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5829 (void) remove_bridge(arg_network_zone);
5830
5831 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5832 expose_port_free_all(arg_expose_ports);
5833 rlimit_free_all(arg_rlimit);
5834 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5835 credential_free_all(arg_credentials, arg_n_credentials);
5836
5837 if (r < 0)
5838 return r;
5839
5840 return ret;
5841 }
5842
5843 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); /* run() returns r when r < 0 and ret otherwise; presumably this macro (main-func.h is included above) generates main() around run() and treats positive results as failure exit codes — confirm against main-func.h */