]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #30616 from dtardon/docbook-valid-2
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "ether-addr-util.h"
28 #include "barrier.h"
29 #include "base-filesystem.h"
30 #include "blkid-util.h"
31 #include "btrfs-util.h"
32 #include "build.h"
33 #include "bus-error.h"
34 #include "bus-locator.h"
35 #include "bus-util.h"
36 #include "cap-list.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
39 #include "chase.h"
40 #include "common-signal.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "creds-util.h"
44 #include "dev-setup.h"
45 #include "discover-image.h"
46 #include "dissect-image.h"
47 #include "env-util.h"
48 #include "escape.h"
49 #include "fd-util.h"
50 #include "fdset.h"
51 #include "fileio.h"
52 #include "format-util.h"
53 #include "fs-util.h"
54 #include "gpt.h"
55 #include "hexdecoct.h"
56 #include "hostname-setup.h"
57 #include "hostname-util.h"
58 #include "id128-util.h"
59 #include "io-util.h"
60 #include "log.h"
61 #include "loop-util.h"
62 #include "loopback-setup.h"
63 #include "machine-credential.h"
64 #include "macro.h"
65 #include "main-func.h"
66 #include "missing_sched.h"
67 #include "mkdir.h"
68 #include "mount-util.h"
69 #include "mountpoint-util.h"
70 #include "namespace-util.h"
71 #include "netlink-util.h"
72 #include "nspawn-bind-user.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-def.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-oci.h"
79 #include "nspawn-patch-uid.h"
80 #include "nspawn-register.h"
81 #include "nspawn-seccomp.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "nspawn-util.h"
86 #include "nspawn.h"
87 #include "nulstr-util.h"
88 #include "os-util.h"
89 #include "pager.h"
90 #include "parse-argument.h"
91 #include "parse-util.h"
92 #include "pretty-print.h"
93 #include "process-util.h"
94 #include "ptyfwd.h"
95 #include "random-util.h"
96 #include "raw-clone.h"
97 #include "resolve-util.h"
98 #include "rlimit-util.h"
99 #include "rm-rf.h"
100 #include "seccomp-util.h"
101 #include "selinux-util.h"
102 #include "signal-util.h"
103 #include "socket-util.h"
104 #include "stat-util.h"
105 #include "stdio-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "sysctl-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-name.h"
114 #include "user-util.h"
115
116 /* Path of the notify socket inside the container, which code running there can use to talk to nspawn via the sd_notify(3) protocol */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): presumably the in-container directory through which mounts are propagated into a running
 * container — the users of this macro are outside this chunk, confirm there. */
118 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
119
/* NOTE(review): numeric exit status; judging by the name it requests a restart of the container, but the code
 * producing/consuming it is outside this chunk — confirm. */
120 #define EXIT_FORCE_RESTART 133
121
/* Ways in which a container run can end, as distinguished by nspawn: the container either terminated or
 * rebooted. NOTE(review): where these values are produced and acted upon is outside this chunk. */
122 typedef enum ContainerStatus {
123 CONTAINER_TERMINATED,
124 CONTAINER_REBOOTED,
125 } ContainerStatus;
126
/* Runtime configuration of nspawn. Every arg_* variable below holds one setting; the initializers are the
 * defaults in effect before parse_environment() and parse_argv() adjust them. */
127 static char *arg_directory = NULL;
128 static char *arg_template = NULL;
129 static char *arg_chdir = NULL;
130 static char *arg_pivot_root_new = NULL;
131 static char *arg_pivot_root_old = NULL;
132 static char *arg_user = NULL;
133 static uid_t arg_uid = UID_INVALID;
134 static gid_t arg_gid = GID_INVALID;
135 static gid_t* arg_supplementary_gids = NULL;
136 static size_t arg_n_supplementary_gids = 0;
137 static sd_id128_t arg_uuid = {};
138 static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 static char *arg_hostname = NULL; /* The name the payload sees by default */
140 static const char *arg_selinux_context = NULL;
141 static const char *arg_selinux_apifs_context = NULL;
142 static char *arg_slice = NULL;
143 static bool arg_private_network = false;
144 static bool arg_read_only = false;
145 static StartMode arg_start_mode = START_PID1;
146 static bool arg_ephemeral = false;
147 static LinkJournal arg_link_journal = LINK_AUTO;
148 static bool arg_link_journal_try = false;
/* Default set of capabilities the container retains, one bit per capability number. help() documents
 * --capability= and --drop-capability= as adding to respectively removing from this default. */
149 static uint64_t arg_caps_retain =
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
161 (1ULL << CAP_MKNOD) |
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
165 (1ULL << CAP_SETFCAP) |
166 (1ULL << CAP_SETGID) |
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
170 (1ULL << CAP_SYS_BOOT) |
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
174 (1ULL << CAP_SYS_RESOURCE) |
175 (1ULL << CAP_SYS_TTY_CONFIG);
176 static uint64_t arg_caps_ambient = 0;
177 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
178 static CustomMount *arg_custom_mounts = NULL;
179 static size_t arg_n_custom_mounts = 0;
180 static char **arg_setenv = NULL;
181 static bool arg_quiet = false;
182 static bool arg_register = true;
183 static bool arg_keep_unit = false;
184 static char **arg_network_interfaces = NULL;
185 static char **arg_network_macvlan = NULL;
186 static char **arg_network_ipvlan = NULL;
187 static bool arg_network_veth = false;
188 static char **arg_network_veth_extra = NULL;
189 static char *arg_network_bridge = NULL;
190 static char *arg_network_zone = NULL;
191 static char *arg_network_namespace_path = NULL;
/* Filled in from $SYSTEMD_NSPAWN_NETWORK_MAC by parse_environment().
 * NOTE(review): unlike its neighbours this variable is not declared static — confirm whether external
 * linkage is intended. */
192 struct ether_addr arg_network_provided_mac = {};
193 static PagerFlags arg_pager_flags = 0;
194 static unsigned long arg_personality = PERSONALITY_INVALID;
195 static char *arg_image = NULL;
196 static char *arg_oci_bundle = NULL;
197 static VolatileMode arg_volatile_mode = VOLATILE_NO;
198 static ExposePort *arg_expose_ports = NULL;
199 static char **arg_property = NULL;
200 static sd_bus_message *arg_property_message = NULL;
201 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
202 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
203 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
204 static int arg_kill_signal = 0;
205 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bitmask of SETTING_* flags; the option and environment parsers set a bit whenever the corresponding
 * setting has been configured explicitly. */
206 static SettingsMask arg_settings_mask = 0;
207 static int arg_settings_trusted = -1;
208 static char **arg_parameters = NULL;
/* May be overridden through $SYSTEMD_NSPAWN_CONTAINER_SERVICE, see parse_environment(). */
209 static const char *arg_container_service_name = "systemd-nspawn";
210 static bool arg_notify_ready = false;
211 static bool arg_use_cgns = true;
/* Namespaces requested by default; individual bits are cleared or set from the $SYSTEMD_NSPAWN_SHARE_*
 * environment variables, see parse_share_ns_env(). */
212 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Default mount behaviour; adjusted from the environment by parse_mount_settings_env(). */
213 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
214 static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
215 static char **arg_syscall_allow_list = NULL;
216 static char **arg_syscall_deny_list = NULL;
217 #if HAVE_SECCOMP
218 static scmp_filter_ctx arg_seccomp = NULL;
219 #endif
220 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
221 static bool arg_no_new_privileges = false;
222 static int arg_oom_score_adjust = 0;
223 static bool arg_oom_score_adjust_set = false;
224 static CPUSet arg_cpu_set = {};
225 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
226 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
227 static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
228 static DeviceNode* arg_extra_nodes = NULL;
229 static size_t arg_n_extra_nodes = 0;
230 static char **arg_sysctl = NULL;
/* Set by handle_arg_console(). */
231 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
232 static MachineCredentialContext arg_credentials = {};
233 static char **arg_bind_user = NULL;
234 static bool arg_suppress_sync = false;
235 static char *arg_settings_filename = NULL;
236 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
237 static ImagePolicy *arg_image_policy = NULL;
238
/* Pair each option global that owns a resource with the function that releases it.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * functions are invoked on the variables when the program exits — confirm against its definition. */
239 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
240 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
248 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
253 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
256 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
257 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
258 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
259 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
260 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
261 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
262 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
263 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
264 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
/* arg_seccomp only exists when built with seccomp support, so its destructor is conditional too. */
265 #if HAVE_SECCOMP
266 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
267 #endif
268 STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
269 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
270 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
271 STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
272 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
273 STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
274
275 static int handle_arg_console(const char *arg) {
276 if (streq(arg, "help")) {
277 puts("autopipe\n"
278 "interactive\n"
279 "passive\n"
280 "pipe\n"
281 "read-only");
282 return 0;
283 }
284
285 if (streq(arg, "interactive"))
286 arg_console_mode = CONSOLE_INTERACTIVE;
287 else if (streq(arg, "read-only"))
288 arg_console_mode = CONSOLE_READ_ONLY;
289 else if (streq(arg, "passive"))
290 arg_console_mode = CONSOLE_PASSIVE;
291 else if (streq(arg, "pipe")) {
292 if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
293 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
297
298 arg_console_mode = CONSOLE_PIPE;
299 } else if (streq(arg, "autopipe")) {
300 if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
301 arg_console_mode = CONSOLE_INTERACTIVE;
302 else
303 arg_console_mode = CONSOLE_PIPE;
304 } else
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
306
307 arg_settings_mask |= SETTING_CONSOLE_MODE;
308 return 1;
309 }
310
311 static int help(void) {
312 _cleanup_free_ char *link = NULL;
313 int r;
314
315 pager_open(arg_pager_flags);
316
317 r = terminal_urlify_man("systemd-nspawn", "1", &link);
318 if (r < 0)
319 return log_oom();
320
321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
323 " -h --help Show this help\n"
324 " --version Print version string\n"
325 " -q --quiet Do not show status information\n"
326 " --no-pager Do not pipe output into a pager\n"
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
328 "%3$sImage:%4$s\n"
329 " -D --directory=PATH Root directory for the container\n"
330 " --template=PATH Initialize root directory from template directory,\n"
331 " if missing\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
334 " -i --image=PATH Root file system disk image (or device node) for\n"
335 " the container\n"
336 " --image-policy=POLICY Specify disk image dissection policy\n"
337 " --oci-bundle=PATH OCI bundle directory\n"
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
340 " --root-hash=HASH Specify verity root hash for root disk image\n"
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
344 " 'base64:'\n"
345 " --verity-data=PATH Specify hash device for verity\n"
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
350 " -b --boot Boot up full system (i.e. invoke init)\n"
351 " --chdir=PATH Set working directory in the container\n"
352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
358 "%3$sSystem Identity:%4$s\n"
359 " -M --machine=NAME Set the machine name for the container\n"
360 " --hostname=NAME Override the hostname for the container\n"
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
363 " -S --slice=SLICE Place the container in the specified slice\n"
364 " --property=NAME=VALUE Set scope unit property\n"
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
373 " Similar, but with user configured UID/GID range\n"
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
379 "%3$sNetworking:%4$s\n"
380 " --private-network Disable network in container\n"
381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
382 " Assign an existing network interface to the\n"
383 " container\n"
384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
388 " Create an ipvlan network interface based on an\n"
389 " existing network interface to the container\n"
390 " -n --network-veth Add a virtual Ethernet connection between host\n"
391 " and container\n"
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
395 " --network-bridge=INTERFACE\n"
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " an automatically managed bridge interface\n"
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
406 " --capability=CAP In addition to the default, retain specified\n"
407 " capability\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
427 "%3$sIntegration:%4$s\n"
428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
433 "%3$sMounts:%4$s\n"
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
436 " the container\n"
437 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
438 " Similar, but creates a read-only bind mount\n"
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
440 " it\n"
441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
444 " the container\n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
448 "%3$sInput/Output:%4$s\n"
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name,
460 link,
461 ansi_underline(),
462 ansi_normal(),
463 ansi_highlight(),
464 ansi_normal());
465
466 return 0;
467 }
468
469 static int custom_mount_check_all(void) {
470 size_t i;
471
472 for (i = 0; i < arg_n_custom_mounts; i++) {
473 CustomMount *m = &arg_custom_mounts[i];
474
475 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
476 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
478 "--private-users-ownership=own may not be combined with custom root mounts.");
479 if (arg_uid_shift == UID_INVALID)
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
482 }
483 }
484
485 return 0;
486 }
487
488 static int detect_unified_cgroup_hierarchy_from_environment(void) {
489 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
490 int r;
491
492 /* Allow the user to control whether the unified hierarchy is used */
493
494 e = getenv(var);
495 if (!e) {
496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
497 var = "UNIFIED_CGROUP_HIERARCHY";
498 e = getenv(var);
499 }
500
501 if (!isempty(e)) {
502 r = parse_boolean(e);
503 if (r < 0)
504 return log_error_errno(r, "Failed to parse $%s: %m", var);
505 if (r > 0)
506 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
507 else
508 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
509 }
510
511 return 0;
512 }
513
514 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
515 int r;
516
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
519 r = cg_all_unified();
520 if (r < 0)
521 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
522 if (r > 0) {
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
525 r = systemd_installation_has_version(directory, "230");
526 if (r < 0)
527 return log_error_errno(r, "Failed to determine systemd version in container: %m");
528 if (r > 0)
529 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
530 else
531 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
533 /* Mixed cgroup hierarchy support was added in 233 */
534 r = systemd_installation_has_version(directory, "233");
535 if (r < 0)
536 return log_error_errno(r, "Failed to determine systemd version in container: %m");
537 if (r > 0)
538 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
539 else
540 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
541 } else
542 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
543
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
546 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
547
548 return 0;
549 }
550
551 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
552 uint64_t mask = 0;
553 int r;
554
555 for (;;) {
556 _cleanup_free_ char *t = NULL;
557
558 r = extract_first_word(&spec, &t, ",", 0);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse capability %s.", t);
561 if (r == 0)
562 break;
563
564 if (streq(t, "help")) {
565 for (int i = 0; i < capability_list_length(); i++) {
566 const char *name;
567
568 name = capability_to_name(i);
569 if (name)
570 puts(name);
571 }
572
573 return 0; /* quit */
574 }
575
576 if (streq(t, "all"))
577 mask = UINT64_MAX;
578 else {
579 r = capability_from_name(t);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse capability %s.", t);
582
583 mask |= 1ULL << r;
584 }
585 }
586
587 *ret_mask = mask;
588 return 1; /* continue */
589 }
590
591 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
592 int r;
593
594 r = getenv_bool(name);
595 if (r == -ENXIO)
596 return 0;
597 if (r < 0)
598 return log_error_errno(r, "Failed to parse $%s: %m", name);
599
600 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
601 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
602 return 0;
603 }
604
605 static int parse_mount_settings_env(void) {
606 const char *e;
607 int r;
608
609 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
610 if (r < 0 && r != -ENXIO)
611 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
612 if (r >= 0)
613 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
614
615 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
616 if (streq_ptr(e, "network"))
617 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
618
619 else if (e) {
620 r = parse_boolean(e);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
623
624 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
626 }
627
628 return 0;
629 }
630
631 static int parse_environment(void) {
632 const char *e;
633 int r;
634
635 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
636 if (r < 0)
637 return r;
638 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
639 if (r < 0)
640 return r;
641 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
642 if (r < 0)
643 return r;
644 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
645 if (r < 0)
646 return r;
647
648 r = parse_mount_settings_env();
649 if (r < 0)
650 return r;
651
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
654 if (!cg_ns_supported())
655 arg_use_cgns = false;
656 else {
657 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
658 if (r < 0) {
659 if (r != -ENXIO)
660 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
661
662 arg_use_cgns = true;
663 } else {
664 arg_use_cgns = r > 0;
665 arg_settings_mask |= SETTING_USE_CGNS;
666 }
667 }
668
669 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
670 if (e)
671 arg_container_service_name = e;
672
673 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
674 if (e) {
675 r = parse_ether_addr(e, &arg_network_provided_mac);
676 if (r < 0)
677 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
678 }
679
680 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
681 if (r >= 0)
682 arg_suppress_sync = r;
683 else if (r != -ENXIO)
684 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
685
686 return detect_unified_cgroup_hierarchy_from_environment();
687 }
688
689 static int parse_argv(int argc, char *argv[]) {
690 enum {
691 ARG_VERSION = 0x100,
692 ARG_PRIVATE_NETWORK,
693 ARG_UUID,
694 ARG_READ_ONLY,
695 ARG_CAPABILITY,
696 ARG_AMBIENT_CAPABILITY,
697 ARG_DROP_CAPABILITY,
698 ARG_LINK_JOURNAL,
699 ARG_BIND,
700 ARG_BIND_RO,
701 ARG_TMPFS,
702 ARG_OVERLAY,
703 ARG_OVERLAY_RO,
704 ARG_INACCESSIBLE,
705 ARG_SHARE_SYSTEM,
706 ARG_REGISTER,
707 ARG_KEEP_UNIT,
708 ARG_NETWORK_INTERFACE,
709 ARG_NETWORK_MACVLAN,
710 ARG_NETWORK_IPVLAN,
711 ARG_NETWORK_BRIDGE,
712 ARG_NETWORK_ZONE,
713 ARG_NETWORK_VETH_EXTRA,
714 ARG_NETWORK_NAMESPACE_PATH,
715 ARG_PERSONALITY,
716 ARG_VOLATILE,
717 ARG_TEMPLATE,
718 ARG_PROPERTY,
719 ARG_PRIVATE_USERS,
720 ARG_KILL_SIGNAL,
721 ARG_SETTINGS,
722 ARG_CHDIR,
723 ARG_PIVOT_ROOT,
724 ARG_PRIVATE_USERS_CHOWN,
725 ARG_PRIVATE_USERS_OWNERSHIP,
726 ARG_NOTIFY_READY,
727 ARG_ROOT_HASH,
728 ARG_ROOT_HASH_SIG,
729 ARG_VERITY_DATA,
730 ARG_SYSTEM_CALL_FILTER,
731 ARG_RLIMIT,
732 ARG_HOSTNAME,
733 ARG_NO_NEW_PRIVILEGES,
734 ARG_OOM_SCORE_ADJUST,
735 ARG_CPU_AFFINITY,
736 ARG_RESOLV_CONF,
737 ARG_TIMEZONE,
738 ARG_CONSOLE,
739 ARG_PIPE,
740 ARG_OCI_BUNDLE,
741 ARG_NO_PAGER,
742 ARG_SET_CREDENTIAL,
743 ARG_LOAD_CREDENTIAL,
744 ARG_BIND_USER,
745 ARG_SUPPRESS_SYNC,
746 ARG_IMAGE_POLICY,
747 };
748
749 static const struct option options[] = {
750 { "help", no_argument, NULL, 'h' },
751 { "version", no_argument, NULL, ARG_VERSION },
752 { "directory", required_argument, NULL, 'D' },
753 { "template", required_argument, NULL, ARG_TEMPLATE },
754 { "ephemeral", no_argument, NULL, 'x' },
755 { "user", required_argument, NULL, 'u' },
756 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
757 { "as-pid2", no_argument, NULL, 'a' },
758 { "boot", no_argument, NULL, 'b' },
759 { "uuid", required_argument, NULL, ARG_UUID },
760 { "read-only", no_argument, NULL, ARG_READ_ONLY },
761 { "capability", required_argument, NULL, ARG_CAPABILITY },
762 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
763 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
764 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
765 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
766 { "bind", required_argument, NULL, ARG_BIND },
767 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
768 { "tmpfs", required_argument, NULL, ARG_TMPFS },
769 { "overlay", required_argument, NULL, ARG_OVERLAY },
770 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
771 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
772 { "machine", required_argument, NULL, 'M' },
773 { "hostname", required_argument, NULL, ARG_HOSTNAME },
774 { "slice", required_argument, NULL, 'S' },
775 { "setenv", required_argument, NULL, 'E' },
776 { "selinux-context", required_argument, NULL, 'Z' },
777 { "selinux-apifs-context", required_argument, NULL, 'L' },
778 { "quiet", no_argument, NULL, 'q' },
779 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
780 { "register", required_argument, NULL, ARG_REGISTER },
781 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
782 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
783 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
784 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
785 { "network-veth", no_argument, NULL, 'n' },
786 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
787 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
788 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
789 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
790 { "personality", required_argument, NULL, ARG_PERSONALITY },
791 { "image", required_argument, NULL, 'i' },
792 { "volatile", optional_argument, NULL, ARG_VOLATILE },
793 { "port", required_argument, NULL, 'p' },
794 { "property", required_argument, NULL, ARG_PROPERTY },
795 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
796 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
797 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
798 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
799 { "settings", required_argument, NULL, ARG_SETTINGS },
800 { "chdir", required_argument, NULL, ARG_CHDIR },
801 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
802 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
803 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
804 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
805 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
806 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
807 { "rlimit", required_argument, NULL, ARG_RLIMIT },
808 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
809 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
810 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
811 { "timezone", required_argument, NULL, ARG_TIMEZONE },
812 { "console", required_argument, NULL, ARG_CONSOLE },
813 { "pipe", no_argument, NULL, ARG_PIPE },
814 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
815 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
816 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
817 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
818 { "bind-user", required_argument, NULL, ARG_BIND_USER },
819 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
820 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
821 {}
822 };
823
824 int c, r;
825 uint64_t plus = 0, minus = 0;
826 bool mask_all_settings = false, mask_no_settings = false;
827
828 assert(argc >= 0);
829 assert(argv);
830
831 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
832 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
833 optind = 0;
834 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
835 switch (c) {
836
837 case 'h':
838 return help();
839
840 case ARG_VERSION:
841 return version();
842
843 case 'D':
844 r = parse_path_argument(optarg, false, &arg_directory);
845 if (r < 0)
846 return r;
847
848 arg_settings_mask |= SETTING_DIRECTORY;
849 break;
850
851 case ARG_TEMPLATE:
852 r = parse_path_argument(optarg, false, &arg_template);
853 if (r < 0)
854 return r;
855
856 arg_settings_mask |= SETTING_DIRECTORY;
857 break;
858
859 case 'i':
860 r = parse_path_argument(optarg, false, &arg_image);
861 if (r < 0)
862 return r;
863
864 arg_settings_mask |= SETTING_DIRECTORY;
865 break;
866
867 case ARG_OCI_BUNDLE:
868 r = parse_path_argument(optarg, false, &arg_oci_bundle);
869 if (r < 0)
870 return r;
871
872 break;
873
874 case 'x':
875 arg_ephemeral = true;
876 arg_settings_mask |= SETTING_EPHEMERAL;
877 break;
878
879 case 'u':
880 r = free_and_strdup(&arg_user, optarg);
881 if (r < 0)
882 return log_oom();
883
884 arg_settings_mask |= SETTING_USER;
885 break;
886
887 case ARG_NETWORK_ZONE: {
888 _cleanup_free_ char *j = NULL;
889
890 j = strjoin("vz-", optarg);
891 if (!j)
892 return log_oom();
893
894 if (!ifname_valid(j))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
896 "Network zone name not valid: %s", j);
897
898 free_and_replace(arg_network_zone, j);
899
900 arg_network_veth = true;
901 arg_private_network = true;
902 arg_settings_mask |= SETTING_NETWORK;
903 break;
904 }
905
906 case ARG_NETWORK_BRIDGE:
907
908 if (!ifname_valid(optarg))
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
910 "Bridge interface name not valid: %s", optarg);
911
912 r = free_and_strdup(&arg_network_bridge, optarg);
913 if (r < 0)
914 return log_oom();
915
916 _fallthrough_;
917 case 'n':
918 arg_network_veth = true;
919 arg_private_network = true;
920 arg_settings_mask |= SETTING_NETWORK;
921 break;
922
923 case ARG_NETWORK_VETH_EXTRA:
924 r = veth_extra_parse(&arg_network_veth_extra, optarg);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
927
928 arg_private_network = true;
929 arg_settings_mask |= SETTING_NETWORK;
930 break;
931
932 case ARG_NETWORK_INTERFACE:
933 r = interface_pair_parse(&arg_network_interfaces, optarg);
934 if (r < 0)
935 return r;
936
937 arg_private_network = true;
938 arg_settings_mask |= SETTING_NETWORK;
939 break;
940
941 case ARG_NETWORK_MACVLAN:
942 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
943 if (r < 0)
944 return r;
945
946 arg_private_network = true;
947 arg_settings_mask |= SETTING_NETWORK;
948 break;
949
950 case ARG_NETWORK_IPVLAN:
951 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
952 if (r < 0)
953 return r;
954
955 _fallthrough_;
956 case ARG_PRIVATE_NETWORK:
957 arg_private_network = true;
958 arg_settings_mask |= SETTING_NETWORK;
959 break;
960
961 case ARG_NETWORK_NAMESPACE_PATH:
962 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
963 if (r < 0)
964 return r;
965
966 arg_settings_mask |= SETTING_NETWORK;
967 break;
968
969 case 'b':
970 if (arg_start_mode == START_PID2)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
972 "--boot and --as-pid2 may not be combined.");
973
974 arg_start_mode = START_BOOT;
975 arg_settings_mask |= SETTING_START_MODE;
976 break;
977
978 case 'a':
979 if (arg_start_mode == START_BOOT)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
981 "--boot and --as-pid2 may not be combined.");
982
983 arg_start_mode = START_PID2;
984 arg_settings_mask |= SETTING_START_MODE;
985 break;
986
987 case ARG_UUID:
988 r = id128_from_string_nonzero(optarg, &arg_uuid);
989 if (r == -ENXIO)
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
991 "Machine UUID may not be all zeroes.");
992 if (r < 0)
993 return log_error_errno(r, "Invalid UUID: %s", optarg);
994
995 arg_settings_mask |= SETTING_MACHINE_ID;
996 break;
997
998 case 'S': {
999 _cleanup_free_ char *mangled = NULL;
1000
1001 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1002 if (r < 0)
1003 return log_oom();
1004
1005 free_and_replace(arg_slice, mangled);
1006 arg_settings_mask |= SETTING_SLICE;
1007 break;
1008 }
1009
1010 case 'M':
1011 if (isempty(optarg))
1012 arg_machine = mfree(arg_machine);
1013 else {
1014 if (!hostname_is_valid(optarg, 0))
1015 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1016 "Invalid machine name: %s", optarg);
1017
1018 r = free_and_strdup(&arg_machine, optarg);
1019 if (r < 0)
1020 return log_oom();
1021 }
1022 break;
1023
1024 case ARG_HOSTNAME:
1025 if (isempty(optarg))
1026 arg_hostname = mfree(arg_hostname);
1027 else {
1028 if (!hostname_is_valid(optarg, 0))
1029 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1030 "Invalid hostname: %s", optarg);
1031
1032 r = free_and_strdup(&arg_hostname, optarg);
1033 if (r < 0)
1034 return log_oom();
1035 }
1036
1037 arg_settings_mask |= SETTING_HOSTNAME;
1038 break;
1039
1040 case 'Z':
1041 arg_selinux_context = optarg;
1042 break;
1043
1044 case 'L':
1045 arg_selinux_apifs_context = optarg;
1046 break;
1047
1048 case ARG_READ_ONLY:
1049 arg_read_only = true;
1050 arg_settings_mask |= SETTING_READ_ONLY;
1051 break;
1052
1053 case ARG_AMBIENT_CAPABILITY: {
1054 uint64_t m;
1055 r = parse_capability_spec(optarg, &m);
1056 if (r <= 0)
1057 return r;
1058 arg_caps_ambient |= m;
1059 arg_settings_mask |= SETTING_CAPABILITY;
1060 break;
1061 }
1062 case ARG_CAPABILITY:
1063 case ARG_DROP_CAPABILITY: {
1064 uint64_t m;
1065 r = parse_capability_spec(optarg, &m);
1066 if (r <= 0)
1067 return r;
1068
1069 if (c == ARG_CAPABILITY)
1070 plus |= m;
1071 else
1072 minus |= m;
1073 arg_settings_mask |= SETTING_CAPABILITY;
1074 break;
1075 }
1076 case ARG_NO_NEW_PRIVILEGES:
1077 r = parse_boolean(optarg);
1078 if (r < 0)
1079 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1080
1081 arg_no_new_privileges = r;
1082 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1083 break;
1084
1085 case 'j':
1086 arg_link_journal = LINK_GUEST;
1087 arg_link_journal_try = true;
1088 arg_settings_mask |= SETTING_LINK_JOURNAL;
1089 break;
1090
1091 case ARG_LINK_JOURNAL:
1092 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1095
1096 arg_settings_mask |= SETTING_LINK_JOURNAL;
1097 break;
1098
1099 case ARG_BIND:
1100 case ARG_BIND_RO:
1101 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1104
1105 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1106 break;
1107
1108 case ARG_TMPFS:
1109 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1110 if (r < 0)
1111 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1112
1113 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1114 break;
1115
1116 case ARG_OVERLAY:
1117 case ARG_OVERLAY_RO:
1118 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1119 if (r == -EADDRNOTAVAIL)
1120 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1123
1124 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1125 break;
1126
1127 case ARG_INACCESSIBLE:
1128 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1131
1132 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1133 break;
1134
1135 case 'E':
1136 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1137 if (r < 0)
1138 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1139
1140 arg_settings_mask |= SETTING_ENVIRONMENT;
1141 break;
1142
1143 case 'q':
1144 arg_quiet = true;
1145 break;
1146
1147 case ARG_SHARE_SYSTEM:
1148 /* We don't officially support this anymore, except for compat reasons. People should use the
1149 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1150 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1151 arg_clone_ns_flags = 0;
1152 break;
1153
1154 case ARG_REGISTER:
1155 r = parse_boolean(optarg);
1156 if (r < 0) {
1157 log_error("Failed to parse --register= argument: %s", optarg);
1158 return r;
1159 }
1160
1161 arg_register = r;
1162 break;
1163
1164 case ARG_KEEP_UNIT:
1165 arg_keep_unit = true;
1166 break;
1167
1168 case ARG_PERSONALITY:
1169
1170 arg_personality = personality_from_string(optarg);
1171 if (arg_personality == PERSONALITY_INVALID)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1173 "Unknown or unsupported personality '%s'.", optarg);
1174
1175 arg_settings_mask |= SETTING_PERSONALITY;
1176 break;
1177
1178 case ARG_VOLATILE:
1179
1180 if (!optarg)
1181 arg_volatile_mode = VOLATILE_YES;
1182 else if (streq(optarg, "help")) {
1183 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1184 return 0;
1185 } else {
1186 VolatileMode m;
1187
1188 m = volatile_mode_from_string(optarg);
1189 if (m < 0)
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1191 "Failed to parse --volatile= argument: %s", optarg);
1192 else
1193 arg_volatile_mode = m;
1194 }
1195
1196 arg_settings_mask |= SETTING_VOLATILE_MODE;
1197 break;
1198
1199 case 'p':
1200 r = expose_port_parse(&arg_expose_ports, optarg);
1201 if (r == -EEXIST)
1202 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1203 if (r < 0)
1204 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1205
1206 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1207 break;
1208
1209 case ARG_PROPERTY:
1210 if (strv_extend(&arg_property, optarg) < 0)
1211 return log_oom();
1212
1213 break;
1214
1215 case ARG_PRIVATE_USERS: {
1216 int boolean;
1217
1218 if (!optarg)
1219 boolean = true;
1220 else if (!in_charset(optarg, DIGITS))
1221 /* do *not* parse numbers as booleans */
1222 boolean = parse_boolean(optarg);
1223 else
1224 boolean = -1;
1225
1226 if (boolean == 0) {
1227 /* no: User namespacing off */
1228 arg_userns_mode = USER_NAMESPACE_NO;
1229 arg_uid_shift = UID_INVALID;
1230 arg_uid_range = UINT32_C(0x10000);
1231 } else if (boolean > 0) {
1232 /* yes: User namespacing on, UID range is read from root dir */
1233 arg_userns_mode = USER_NAMESPACE_FIXED;
1234 arg_uid_shift = UID_INVALID;
1235 arg_uid_range = UINT32_C(0x10000);
1236 } else if (streq(optarg, "pick")) {
1237 /* pick: User namespacing on, UID range is picked randomly */
1238 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1239 * implied by USER_NAMESPACE_PICK
1240 * further down. */
1241 arg_uid_shift = UID_INVALID;
1242 arg_uid_range = UINT32_C(0x10000);
1243
1244 } else if (streq(optarg, "identity")) {
1245 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1246 * itself, i.e. we don't actually map anything, but do take benefit of
1247 * isolation of capability sets. */
1248 arg_userns_mode = USER_NAMESPACE_FIXED;
1249 arg_uid_shift = 0;
1250 arg_uid_range = UINT32_C(0x10000);
1251 } else {
1252 _cleanup_free_ char *buffer = NULL;
1253 const char *range, *shift;
1254
1255 /* anything else: User namespacing on, UID range is explicitly configured */
1256
1257 range = strchr(optarg, ':');
1258 if (range) {
1259 buffer = strndup(optarg, range - optarg);
1260 if (!buffer)
1261 return log_oom();
1262 shift = buffer;
1263
1264 range++;
1265 r = safe_atou32(range, &arg_uid_range);
1266 if (r < 0)
1267 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1268 } else
1269 shift = optarg;
1270
1271 r = parse_uid(shift, &arg_uid_shift);
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1274
1275 arg_userns_mode = USER_NAMESPACE_FIXED;
1276
1277 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1279 }
1280
1281 arg_settings_mask |= SETTING_USERNS;
1282 break;
1283 }
1284
1285 case 'U':
1286 if (userns_supported()) {
1287 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1288 * implied by USER_NAMESPACE_PICK
1289 * further down. */
1290 arg_uid_shift = UID_INVALID;
1291 arg_uid_range = UINT32_C(0x10000);
1292
1293 arg_settings_mask |= SETTING_USERNS;
1294 }
1295
1296 break;
1297
1298 case ARG_PRIVATE_USERS_CHOWN:
1299 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1300
1301 arg_settings_mask |= SETTING_USERNS;
1302 break;
1303
1304 case ARG_PRIVATE_USERS_OWNERSHIP:
1305 if (streq(optarg, "help")) {
1306 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1307 return 0;
1308 }
1309
1310 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1311 if (arg_userns_ownership < 0)
1312 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1313
1314 arg_settings_mask |= SETTING_USERNS;
1315 break;
1316
1317 case ARG_KILL_SIGNAL:
1318 if (streq(optarg, "help")) {
1319 DUMP_STRING_TABLE(signal, int, _NSIG);
1320 return 0;
1321 }
1322
1323 arg_kill_signal = signal_from_string(optarg);
1324 if (arg_kill_signal < 0)
1325 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1326
1327 arg_settings_mask |= SETTING_KILL_SIGNAL;
1328 break;
1329
1330 case ARG_SETTINGS:
1331
1332 /* no → do not read files
1333 * yes → read files, do not override cmdline, trust only subset
1334 * override → read files, override cmdline, trust only subset
1335 * trusted → read files, do not override cmdline, trust all
1336 */
1337
1338 r = parse_boolean(optarg);
1339 if (r < 0) {
1340 if (streq(optarg, "trusted")) {
1341 mask_all_settings = false;
1342 mask_no_settings = false;
1343 arg_settings_trusted = true;
1344
1345 } else if (streq(optarg, "override")) {
1346 mask_all_settings = false;
1347 mask_no_settings = true;
1348 arg_settings_trusted = -1;
1349 } else
1350 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1351 } else if (r > 0) {
1352 /* yes */
1353 mask_all_settings = false;
1354 mask_no_settings = false;
1355 arg_settings_trusted = -1;
1356 } else {
1357 /* no */
1358 mask_all_settings = true;
1359 mask_no_settings = false;
1360 arg_settings_trusted = false;
1361 }
1362
1363 break;
1364
1365 case ARG_CHDIR:
1366 if (!path_is_absolute(optarg))
1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1368 "Working directory %s is not an absolute path.", optarg);
1369
1370 r = free_and_strdup(&arg_chdir, optarg);
1371 if (r < 0)
1372 return log_oom();
1373
1374 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1375 break;
1376
1377 case ARG_PIVOT_ROOT:
1378 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1381
1382 arg_settings_mask |= SETTING_PIVOT_ROOT;
1383 break;
1384
1385 case ARG_NOTIFY_READY:
1386 r = parse_boolean(optarg);
1387 if (r < 0)
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1389 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1390 arg_notify_ready = r;
1391 arg_settings_mask |= SETTING_NOTIFY_READY;
1392 break;
1393
1394 case ARG_ROOT_HASH: {
1395 _cleanup_free_ void *k = NULL;
1396 size_t l;
1397
1398 r = unhexmem(optarg, strlen(optarg), &k, &l);
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1401 if (l < sizeof(sd_id128_t))
1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
1403
1404 free_and_replace(arg_verity_settings.root_hash, k);
1405 arg_verity_settings.root_hash_size = l;
1406 break;
1407 }
1408
1409 case ARG_ROOT_HASH_SIG: {
1410 char *value;
1411 size_t l;
1412 void *p;
1413
1414 if ((value = startswith(optarg, "base64:"))) {
1415 r = unbase64mem(value, strlen(value), &p, &l);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1418
1419 } else {
1420 r = read_full_file(optarg, (char**) &p, &l);
1421 if (r < 0)
1422 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1423 }
1424
1425 free_and_replace(arg_verity_settings.root_hash_sig, p);
1426 arg_verity_settings.root_hash_sig_size = l;
1427 break;
1428 }
1429
1430 case ARG_VERITY_DATA:
1431 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1432 if (r < 0)
1433 return r;
1434 break;
1435
1436 case ARG_SYSTEM_CALL_FILTER: {
1437 bool negative;
1438 const char *items;
1439
1440 negative = optarg[0] == '~';
1441 items = negative ? optarg + 1 : optarg;
1442
1443 for (;;) {
1444 _cleanup_free_ char *word = NULL;
1445
1446 r = extract_first_word(&items, &word, NULL, 0);
1447 if (r == 0)
1448 break;
1449 if (r == -ENOMEM)
1450 return log_oom();
1451 if (r < 0)
1452 return log_error_errno(r, "Failed to parse system call filter: %m");
1453
1454 if (negative)
1455 r = strv_extend(&arg_syscall_deny_list, word);
1456 else
1457 r = strv_extend(&arg_syscall_allow_list, word);
1458 if (r < 0)
1459 return log_oom();
1460 }
1461
1462 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1463 break;
1464 }
1465
1466 case ARG_RLIMIT: {
1467 const char *eq;
1468 _cleanup_free_ char *name = NULL;
1469 int rl;
1470
1471 if (streq(optarg, "help")) {
1472 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1473 return 0;
1474 }
1475
1476 eq = strchr(optarg, '=');
1477 if (!eq)
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1479 "--rlimit= expects an '=' assignment.");
1480
1481 name = strndup(optarg, eq - optarg);
1482 if (!name)
1483 return log_oom();
1484
1485 rl = rlimit_from_string_harder(name);
1486 if (rl < 0)
1487 return log_error_errno(rl, "Unknown resource limit: %s", name);
1488
1489 if (!arg_rlimit[rl]) {
1490 arg_rlimit[rl] = new0(struct rlimit, 1);
1491 if (!arg_rlimit[rl])
1492 return log_oom();
1493 }
1494
1495 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1498
1499 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1500 break;
1501 }
1502
1503 case ARG_OOM_SCORE_ADJUST:
1504 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1507
1508 arg_oom_score_adjust_set = true;
1509 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1510 break;
1511
1512 case ARG_CPU_AFFINITY: {
1513 CPUSet cpuset;
1514
1515 r = parse_cpu_set(optarg, &cpuset);
1516 if (r < 0)
1517 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1518
1519 cpu_set_reset(&arg_cpu_set);
1520 arg_cpu_set = cpuset;
1521 arg_settings_mask |= SETTING_CPU_AFFINITY;
1522 break;
1523 }
1524
1525 case ARG_RESOLV_CONF:
1526 if (streq(optarg, "help")) {
1527 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1528 return 0;
1529 }
1530
1531 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1532 if (arg_resolv_conf < 0)
1533 return log_error_errno(arg_resolv_conf,
1534 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1535
1536 arg_settings_mask |= SETTING_RESOLV_CONF;
1537 break;
1538
1539 case ARG_TIMEZONE:
1540 if (streq(optarg, "help")) {
1541 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1542 return 0;
1543 }
1544
1545 arg_timezone = timezone_mode_from_string(optarg);
1546 if (arg_timezone < 0)
1547 return log_error_errno(arg_timezone,
1548 "Failed to parse /etc/localtime mode: %s", optarg);
1549
1550 arg_settings_mask |= SETTING_TIMEZONE;
1551 break;
1552
1553 case ARG_CONSOLE:
1554 r = handle_arg_console(optarg);
1555 if (r <= 0)
1556 return r;
1557 break;
1558
1559 case 'P':
1560 case ARG_PIPE:
1561 r = handle_arg_console("pipe");
1562 if (r <= 0)
1563 return r;
1564 break;
1565
1566 case ARG_NO_PAGER:
1567 arg_pager_flags |= PAGER_DISABLE;
1568 break;
1569
1570 case ARG_SET_CREDENTIAL:
1571 r = machine_credential_set(&arg_credentials, optarg);
1572 if (r < 0)
1573 return r;
1574
1575 arg_settings_mask |= SETTING_CREDENTIALS;
1576 break;
1577
1578 case ARG_LOAD_CREDENTIAL:
1579 r = machine_credential_load(&arg_credentials, optarg);
1580 if (r < 0)
1581 return r;
1582
1583 arg_settings_mask |= SETTING_CREDENTIALS;
1584 break;
1585
1586 case ARG_BIND_USER:
1587 if (!valid_user_group_name(optarg, 0))
1588 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1589
1590 if (strv_extend(&arg_bind_user, optarg) < 0)
1591 return log_oom();
1592
1593 arg_settings_mask |= SETTING_BIND_USER;
1594 break;
1595
1596 case ARG_SUPPRESS_SYNC:
1597 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1598 if (r < 0)
1599 return r;
1600
1601 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1602 break;
1603
1604 case ARG_IMAGE_POLICY:
1605 r = parse_image_policy_argument(optarg, &arg_image_policy);
1606 if (r < 0)
1607 return r;
1608 break;
1609
1610 case '?':
1611 return -EINVAL;
1612
1613 default:
1614 assert_not_reached();
1615 }
1616
1617 if (argc > optind) {
1618 strv_free(arg_parameters);
1619 arg_parameters = strv_copy(argv + optind);
1620 if (!arg_parameters)
1621 return log_oom();
1622
1623 arg_settings_mask |= SETTING_START_MODE;
1624 }
1625
1626 if (arg_ephemeral && arg_template && !arg_directory)
1627 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1628 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1629 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1630 * --directory=". */
1631 arg_directory = TAKE_PTR(arg_template);
1632
1633 arg_caps_retain |= plus;
1634 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1635 arg_caps_retain &= ~minus;
1636
1637 /* Make sure to parse environment before we reset the settings mask below */
1638 r = parse_environment();
1639 if (r < 0)
1640 return r;
1641
1642 /* Load all settings from .nspawn files */
1643 if (mask_no_settings)
1644 arg_settings_mask = 0;
1645
1646 /* Don't load any settings from .nspawn files */
1647 if (mask_all_settings)
1648 arg_settings_mask = _SETTINGS_MASK_ALL;
1649
1650 return 1;
1651 }
1652
1653 static int verify_arguments(void) {
1654 int r;
1655
1656 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1657 /* If we are running the stub init in the container, we don't need to look at what the init
1658 * in the container supports, because we are not using it. Let's immediately pick the right
1659 * setting based on the host system configuration.
1660 *
1661 * We only do this, if the user didn't use an environment variable to override the detection.
1662 */
1663
1664 r = cg_all_unified();
1665 if (r < 0)
1666 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1667 if (r > 0)
1668 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1669 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1670 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1671 else
1672 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1673 }
1674
1675 if (arg_userns_mode != USER_NAMESPACE_NO)
1676 arg_mount_settings |= MOUNT_USE_USERNS;
1677
1678 if (arg_private_network)
1679 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1680
1681 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1682 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1683 arg_register = false;
1684 if (arg_start_mode != START_PID1)
1685 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1686 }
1687
1688 if (arg_userns_ownership < 0)
1689 arg_userns_ownership =
1690 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1691 USER_NAMESPACE_OWNERSHIP_OFF;
1692
1693 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1694 arg_kill_signal = SIGRTMIN+3;
1695
1696 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1697 arg_read_only = true;
1698
1699 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1700 arg_read_only = true;
1701
1702 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1703 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1704 * The latter is not technically a user session, but we don't need to labour the point. */
1705 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1706
1707 if (arg_directory && arg_image)
1708 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1709
1710 if (arg_template && arg_image)
1711 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1712
1713 if (arg_template && !(arg_directory || arg_machine))
1714 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1715
1716 if (arg_ephemeral && arg_template)
1717 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1718
1719 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1720 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1721
1722 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1723 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1724
1725 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1726 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1727 "--read-only and --private-users-ownership=chown may not be combined.");
1728
1729 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1730 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1731 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1732 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1734
1735 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1736 * we need to error out, to avoid conflicts between different network options. */
1737 if (arg_network_namespace_path &&
1738 (arg_network_interfaces || arg_network_macvlan ||
1739 arg_network_ipvlan || arg_network_veth_extra ||
1740 arg_network_bridge || arg_network_zone ||
1741 arg_network_veth))
1742 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1743
1744 if (arg_network_bridge && arg_network_zone)
1745 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1746 "--network-bridge= and --network-zone= may not be combined.");
1747
1748 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1749 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1750
1751 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1752 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1753
1754 if (arg_expose_ports && !arg_private_network)
1755 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1756
1757 if (arg_caps_ambient) {
1758 if (arg_caps_ambient == UINT64_MAX)
1759 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1760
1761 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1762 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1763
1764 if (arg_start_mode == START_BOOT)
1765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1766 }
1767
1768 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1769 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1770
1771 /* Drop duplicate --bind-user= entries */
1772 strv_uniq(arg_bind_user);
1773
1774 r = custom_mount_check_all();
1775 if (r < 0)
1776 return r;
1777
1778 return 0;
1779 }
1780
1781 static int verify_network_interfaces_initialized(void) {
1782 int r;
1783 r = test_network_interfaces_initialized(arg_network_interfaces);
1784 if (r < 0)
1785 return r;
1786
1787 r = test_network_interfaces_initialized(arg_network_macvlan);
1788 if (r < 0)
1789 return r;
1790
1791 r = test_network_interfaces_initialized(arg_network_ipvlan);
1792 if (r < 0)
1793 return r;
1794
1795 return 0;
1796 }
1797
1798 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1799 assert(p);
1800
1801 if (arg_userns_mode == USER_NAMESPACE_NO)
1802 return 0;
1803
1804 if (uid == UID_INVALID && gid == GID_INVALID)
1805 return 0;
1806
1807 if (uid != UID_INVALID) {
1808 uid += arg_uid_shift;
1809
1810 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1811 return -EOVERFLOW;
1812 }
1813
1814 if (gid != GID_INVALID) {
1815 gid += (gid_t) arg_uid_shift;
1816
1817 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1818 return -EOVERFLOW;
1819 }
1820
1821 return RET_NERRNO(lchown(p, uid, gid));
1822 }
1823
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands
 * it to userns_lchown() with the given uid/gid.
 *
 * Returns 0 without touching ownership if the directory already exists, a negative errno-style value if
 * mkdir() fails for another reason, and otherwise whatever userns_lchown() returns. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *target;
        int k;

        target = prefix_roota(root, path);

        k = RET_NERRNO(mkdir(target, mode));
        if (k >= 0)
                return userns_lchown(target, uid, gid);

        /* A pre-existing directory is not an error, and its ownership is left alone */
        return k == -EEXIST ? 0 : k;
}
1837
/* Matches 'path' against the two accepted zoneinfo prefixes (relative "../usr/share/zoneinfo/" and
 * absolute "/usr/share/zoneinfo/") and returns the result of PATH_STARTSWITH_SET() for them.
 * NOTE(review): presumably that is the part of 'path' following the matched prefix, or NULL if neither
 * prefix matches; confirm against the macro definition. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1844
1845 static bool etc_writable(void) {
1846 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1847 }
1848
1849 static int setup_timezone(const char *dest) {
1850 _cleanup_free_ char *p = NULL, *etc = NULL;
1851 const char *where, *check;
1852 TimezoneMode m;
1853 int r;
1854
1855 assert(dest);
1856
1857 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1858 r = readlink_malloc("/etc/localtime", &p);
1859 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1860 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1861 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1862 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1863 else if (r < 0) {
1864 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1865 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1866 * file.
1867 *
1868 * Example:
1869 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1870 */
1871 return 0;
1872 } else if (arg_timezone == TIMEZONE_AUTO)
1873 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1874 else
1875 m = arg_timezone;
1876 } else
1877 m = arg_timezone;
1878
1879 if (m == TIMEZONE_OFF)
1880 return 0;
1881
1882 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1883 if (r < 0) {
1884 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1885 return 0;
1886 }
1887
1888 where = strjoina(etc, "/localtime");
1889
1890 switch (m) {
1891
1892 case TIMEZONE_DELETE:
1893 if (unlink(where) < 0)
1894 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1895
1896 return 0;
1897
1898 case TIMEZONE_SYMLINK: {
1899 _cleanup_free_ char *q = NULL;
1900 const char *z, *what;
1901
1902 z = timezone_from_path(p);
1903 if (!z) {
1904 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1905 return 0;
1906 }
1907
1908 r = readlink_malloc(where, &q);
1909 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1910 return 0; /* Already pointing to the right place? Then do nothing .. */
1911
1912 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1913 r = chase(check, dest, 0, NULL, NULL);
1914 if (r < 0)
1915 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1916 else {
1917 if (unlink(where) < 0 && errno != ENOENT) {
1918 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1919 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1920 return 0;
1921 }
1922
1923 what = strjoina("../usr/share/zoneinfo/", z);
1924 if (symlink(what, where) < 0) {
1925 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1926 errno, "Failed to correct timezone of container, ignoring: %m");
1927 return 0;
1928 }
1929
1930 break;
1931 }
1932
1933 _fallthrough_;
1934 }
1935
1936 case TIMEZONE_BIND: {
1937 _cleanup_free_ char *resolved = NULL;
1938 int found;
1939
1940 found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1941 if (found < 0) {
1942 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1943 return 0;
1944 }
1945
1946 if (found == 0) /* missing? */
1947 (void) touch(resolved);
1948
1949 r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1950 if (r >= 0)
1951 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1952
1953 _fallthrough_;
1954 }
1955
1956 case TIMEZONE_COPY:
1957 /* If mounting failed, try to copy */
1958 r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
1959 if (r < 0) {
1960 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1961 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1962 return 0;
1963 }
1964
1965 break;
1966
1967 default:
1968 assert_not_reached();
1969 }
1970
1971 /* Fix permissions of the symlink or file copy we just created */
1972 r = userns_lchown(where, 0, 0);
1973 if (r < 0)
1974 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1975
1976 return 0;
1977 }
1978
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if so, 0 if it is missing (ENOENT), and a
 * negative error (logged at debug level) if the check itself failed for any other reason. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1991
1992 static int resolved_listening(void) {
1993 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1994 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1995 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1996 int r;
1997
1998 /* Check if resolved is listening */
1999
2000 r = sd_bus_open_system(&bus);
2001 if (r < 0)
2002 return log_debug_errno(r, "Failed to open system bus: %m");
2003
2004 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2005 if (r < 0)
2006 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2007 if (r == 0)
2008 return 0;
2009
2010 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
2011 if (r < 0)
2012 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2013
2014 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2015 }
2016
2017 static int setup_resolv_conf(const char *dest) {
2018 _cleanup_free_ char *etc = NULL;
2019 const char *where, *what;
2020 ResolvConfMode m;
2021 int r;
2022
2023 assert(dest);
2024
2025 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2026 if (arg_private_network)
2027 m = RESOLV_CONF_OFF;
2028 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2029 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2030 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2031 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2032 else
2033 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2034
2035 } else
2036 m = arg_resolv_conf;
2037
2038 if (m == RESOLV_CONF_OFF)
2039 return 0;
2040
2041 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2042 if (r < 0) {
2043 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2044 return 0;
2045 }
2046
2047 where = strjoina(etc, "/resolv.conf");
2048
2049 if (m == RESOLV_CONF_DELETE) {
2050 if (unlink(where) < 0)
2051 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2052
2053 return 0;
2054 }
2055
2056 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2057 what = PRIVATE_STATIC_RESOLV_CONF;
2058 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2059 what = PRIVATE_UPLINK_RESOLV_CONF;
2060 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2061 what = PRIVATE_STUB_RESOLV_CONF;
2062 else
2063 what = "/etc/resolv.conf";
2064
2065 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2066 _cleanup_free_ char *resolved = NULL;
2067 int found;
2068
2069 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
2070 if (found < 0) {
2071 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2072 return 0;
2073 }
2074
2075 if (found == 0) /* missing? */
2076 (void) touch(resolved);
2077
2078 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2079 if (r >= 0)
2080 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2081
2082 /* If that didn't work, let's copy the file */
2083 }
2084
2085 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2086 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
2087 else
2088 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
2089 if (r < 0) {
2090 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2091 * resolved or something similar runs inside and the symlink points there.
2092 *
2093 * If the disk image is read-only, there's also no point in complaining.
2094 */
2095 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2096 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2097 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2098 return 0;
2099 }
2100
2101 r = userns_lchown(where, 0, 0);
2102 if (r < 0)
2103 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2104
2105 return 0;
2106 }
2107
2108 static int setup_boot_id(void) {
2109 _cleanup_(unlink_and_freep) char *from = NULL;
2110 _cleanup_free_ char *path = NULL;
2111 sd_id128_t rnd = SD_ID128_NULL;
2112 const char *to;
2113 int r;
2114
2115 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2116
2117 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2118 if (r < 0)
2119 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2120
2121 r = sd_id128_randomize(&rnd);
2122 if (r < 0)
2123 return log_error_errno(r, "Failed to generate random boot id: %m");
2124
2125 r = id128_write(path, ID128_FORMAT_UUID, rnd);
2126 if (r < 0)
2127 return log_error_errno(r, "Failed to write boot id: %m");
2128
2129 from = TAKE_PTR(path);
2130 to = "/proc/sys/kernel/random/boot_id";
2131
2132 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2133 if (r < 0)
2134 return r;
2135
2136 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2137 }
2138
2139 static int copy_devnodes(const char *dest) {
2140 static const char devnodes[] =
2141 "null\0"
2142 "zero\0"
2143 "full\0"
2144 "random\0"
2145 "urandom\0"
2146 "tty\0"
2147 "net/tun\0";
2148
2149 int r = 0;
2150
2151 assert(dest);
2152
2153 BLOCK_WITH_UMASK(0000);
2154
2155 /* Create /dev/net, so that we can create /dev/net/tun in it */
2156 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2157 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2158
2159 NULSTR_FOREACH(d, devnodes) {
2160 _cleanup_free_ char *from = NULL, *to = NULL;
2161 struct stat st;
2162
2163 from = path_join("/dev/", d);
2164 if (!from)
2165 return log_oom();
2166
2167 to = path_join(dest, from);
2168 if (!to)
2169 return log_oom();
2170
2171 if (stat(from, &st) < 0) {
2172
2173 if (errno != ENOENT)
2174 return log_error_errno(errno, "Failed to stat %s: %m", from);
2175
2176 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2177 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2178 "%s is not a char or block device, cannot copy.", from);
2179 else {
2180 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2181
2182 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2183 /* Explicitly warn the user when /dev is already populated. */
2184 if (errno == EEXIST)
2185 log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
2186 if (errno != EPERM)
2187 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2188
2189 /* Some systems abusively restrict mknod but allow bind mounts. */
2190 r = touch(to);
2191 if (r < 0)
2192 return log_error_errno(r, "touch (%s) failed: %m", to);
2193 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2194 if (r < 0)
2195 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2196 }
2197
2198 r = userns_lchown(to, 0, 0);
2199 if (r < 0)
2200 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2201
2202 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2203 if (!dn)
2204 return log_oom();
2205
2206 r = userns_mkdir(dest, dn, 0755, 0, 0);
2207 if (r < 0)
2208 return log_error_errno(r, "Failed to create '%s': %m", dn);
2209
2210 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2211 return log_oom();
2212
2213 prefixed = path_join(dest, sl);
2214 if (!prefixed)
2215 return log_oom();
2216
2217 t = path_join("..", d);
2218 if (!t)
2219 return log_oom();
2220
2221 if (symlink(t, prefixed) < 0)
2222 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2223 }
2224 }
2225
2226 return r;
2227 }
2228
2229 static int make_extra_nodes(const char *dest) {
2230 size_t i;
2231 int r;
2232
2233 BLOCK_WITH_UMASK(0000);
2234
2235 for (i = 0; i < arg_n_extra_nodes; i++) {
2236 _cleanup_free_ char *path = NULL;
2237 DeviceNode *n = arg_extra_nodes + i;
2238
2239 path = path_join(dest, n->path);
2240 if (!path)
2241 return log_oom();
2242
2243 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2244 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2245
2246 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2247 if (r < 0)
2248 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2249 }
2250
2251 return 0;
2252 }
2253
2254 static int setup_pts(const char *dest) {
2255 _cleanup_free_ char *options = NULL;
2256 const char *p;
2257 int r;
2258
2259 #if HAVE_SELINUX
2260 if (arg_selinux_apifs_context)
2261 (void) asprintf(&options,
2262 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2263 arg_uid_shift + TTY_GID,
2264 arg_selinux_apifs_context);
2265 else
2266 #endif
2267 (void) asprintf(&options,
2268 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2269 arg_uid_shift + TTY_GID);
2270
2271 if (!options)
2272 return log_oom();
2273
2274 /* Mount /dev/pts itself */
2275 p = prefix_roota(dest, "/dev/pts");
2276 r = RET_NERRNO(mkdir(p, 0755));
2277 if (r < 0)
2278 return log_error_errno(r, "Failed to create /dev/pts: %m");
2279
2280 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2281 if (r < 0)
2282 return r;
2283 r = userns_lchown(p, 0, 0);
2284 if (r < 0)
2285 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2286
2287 /* Create /dev/ptmx symlink */
2288 p = prefix_roota(dest, "/dev/ptmx");
2289 if (symlink("pts/ptmx", p) < 0)
2290 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2291 r = userns_lchown(p, 0, 0);
2292 if (r < 0)
2293 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2294
2295 /* And fix /dev/pts/ptmx ownership */
2296 p = prefix_roota(dest, "/dev/pts/ptmx");
2297 r = userns_lchown(p, 0, 0);
2298 if (r < 0)
2299 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2300
2301 return 0;
2302 }
2303
2304 static int setup_stdio_as_dev_console(void) {
2305 _cleanup_close_ int terminal = -EBADF;
2306 int r;
2307
2308 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2309 * explicitly, if we are configured to. */
2310 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2311 if (terminal < 0)
2312 return log_error_errno(terminal, "Failed to open console: %m");
2313
2314 /* Make sure we can continue logging to the original stderr, even if
2315 * stderr points elsewhere now */
2316 r = log_dup_console();
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to duplicate stderr: %m");
2319
2320 /* invalidates 'terminal' on success and failure */
2321 r = rearrange_stdio(terminal, terminal, terminal);
2322 TAKE_FD(terminal);
2323 if (r < 0)
2324 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2325
2326 return 0;
2327 }
2328
2329 static int setup_dev_console(const char *console) {
2330 _cleanup_free_ char *p = NULL;
2331 int r;
2332
2333 /* Create /dev/console symlink */
2334 r = path_make_relative("/dev", console, &p);
2335 if (r < 0)
2336 return log_error_errno(r, "Failed to create relative path: %m");
2337
2338 if (symlink(p, "/dev/console") < 0)
2339 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2340
2341 return 0;
2342 }
2343
2344 static int setup_keyring(void) {
2345 key_serial_t keyring;
2346
2347 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2348 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2349 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2350 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2351 * into the container. */
2352
2353 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2354 if (keyring == -1) {
2355 if (errno == ENOSYS)
2356 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2357 else if (ERRNO_IS_PRIVILEGE(errno))
2358 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2359 else
2360 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2361 }
2362
2363 return 0;
2364 }
2365
2366 static int setup_credentials(const char *root) {
2367 const char *q;
2368 int r;
2369
2370 if (arg_credentials.n_credentials == 0)
2371 return 0;
2372
2373 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to create /run/host: %m");
2376
2377 r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2380
2381 q = prefix_roota(root, "/run/host/credentials");
2382 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2383 if (r < 0)
2384 return r;
2385
2386 FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) {
2387 _cleanup_free_ char *j = NULL;
2388 _cleanup_close_ int fd = -EBADF;
2389
2390 j = path_join(q, cred->id);
2391 if (!j)
2392 return log_oom();
2393
2394 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2395 if (fd < 0)
2396 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2397
2398 r = loop_write(fd, cred->data, cred->size);
2399 if (r < 0)
2400 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2401
2402 if (fchmod(fd, 0400) < 0)
2403 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2404
2405 if (arg_userns_mode != USER_NAMESPACE_NO) {
2406 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2407 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2408 }
2409 }
2410
2411 if (chmod(q, 0500) < 0)
2412 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2413
2414 r = userns_lchown(q, 0, 0);
2415 if (r < 0)
2416 return r;
2417
2418 /* Make both mount and superblock read-only now */
2419 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2420 if (r < 0)
2421 return r;
2422
2423 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2424 }
2425
2426 static int setup_kmsg(int fd_inner_socket) {
2427 _cleanup_(unlink_and_freep) char *from = NULL;
2428 _cleanup_free_ char *fifo = NULL;
2429 _cleanup_close_ int fd = -EBADF;
2430 int r;
2431
2432 assert(fd_inner_socket >= 0);
2433
2434 BLOCK_WITH_UMASK(0000);
2435
2436 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2437 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2438 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2439 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2440
2441 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2442 if (r < 0)
2443 return log_error_errno(r, "Failed to generate kmsg path: %m");
2444
2445 if (mkfifo(fifo, 0600) < 0)
2446 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2447
2448 from = TAKE_PTR(fifo);
2449
2450 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2451 if (r < 0)
2452 return r;
2453
2454 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2455 if (fd < 0)
2456 return log_error_errno(errno, "Failed to open fifo: %m");
2457
2458 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2459 r = send_one_fd(fd_inner_socket, fd, 0);
2460 if (r < 0)
2461 return log_error_errno(r, "Failed to send FIFO fd: %m");
2462
2463 return 0;
2464 }
2465
/* State handed as userdata to on_address_change(), which forwards it to expose_port_execute(). */
struct ExposeArgs {
        union in_addr_union address4;        /* passed along with AF_INET */
        union in_addr_union address6;        /* passed along with AF_INET6 */
        struct FirewallContext *fw_ctx;      /* passed by reference for both address families */
};
2471
2472 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2473 struct ExposeArgs *args = ASSERT_PTR(userdata);
2474
2475 assert(rtnl);
2476 assert(m);
2477
2478 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2479 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2480 return 0;
2481 }
2482
2483 static int setup_hostname(void) {
2484 int r;
2485
2486 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2487 return 0;
2488
2489 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2490 if (r < 0)
2491 return log_error_errno(r, "Failed to set hostname: %m");
2492
2493 return 0;
2494 }
2495
2496 static int setup_journal(const char *directory) {
2497 _cleanup_free_ char *d = NULL;
2498 const char *p, *q;
2499 sd_id128_t this_id;
2500 bool try;
2501 int r;
2502
2503 /* Don't link journals in ephemeral mode */
2504 if (arg_ephemeral)
2505 return 0;
2506
2507 if (arg_link_journal == LINK_NO)
2508 return 0;
2509
2510 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2511
2512 r = sd_id128_get_machine(&this_id);
2513 if (r < 0)
2514 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2515
2516 if (sd_id128_equal(arg_uuid, this_id)) {
2517 log_full(try ? LOG_WARNING : LOG_ERR,
2518 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2519 if (try)
2520 return 0;
2521 return -EEXIST;
2522 }
2523
2524 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2525 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2526 if (r < 0) {
2527 bool ignore = r == -EROFS && try;
2528 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2529 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2530 return ignore ? 0 : r;
2531 }
2532 }
2533
2534 p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2535 q = prefix_roota(directory, p);
2536
2537 if (path_is_mount_point(p, NULL, 0) > 0) {
2538 if (try)
2539 return 0;
2540
2541 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2542 "%s: already a mount point, refusing to use for journal", p);
2543 }
2544
2545 if (path_is_mount_point(q, NULL, 0) > 0) {
2546 if (try)
2547 return 0;
2548
2549 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2550 "%s: already a mount point, refusing to use for journal", q);
2551 }
2552
2553 r = readlink_and_make_absolute(p, &d);
2554 if (r >= 0) {
2555 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2556 path_equal(d, q)) {
2557
2558 r = userns_mkdir(directory, p, 0755, 0, 0);
2559 if (r < 0)
2560 log_warning_errno(r, "Failed to create directory %s: %m", q);
2561 return 0;
2562 }
2563
2564 if (unlink(p) < 0)
2565 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2566 } else if (r == -EINVAL) {
2567
2568 if (arg_link_journal == LINK_GUEST &&
2569 rmdir(p) < 0) {
2570
2571 if (errno == ENOTDIR) {
2572 log_error("%s already exists and is neither a symlink nor a directory", p);
2573 return r;
2574 } else
2575 return log_error_errno(errno, "Failed to remove %s: %m", p);
2576 }
2577 } else if (r != -ENOENT)
2578 return log_error_errno(r, "readlink(%s) failed: %m", p);
2579
2580 if (arg_link_journal == LINK_GUEST) {
2581
2582 if (symlink(q, p) < 0) {
2583 if (try) {
2584 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2585 return 0;
2586 } else
2587 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2588 }
2589
2590 r = userns_mkdir(directory, p, 0755, 0, 0);
2591 if (r < 0)
2592 log_warning_errno(r, "Failed to create directory %s: %m", q);
2593 return 0;
2594 }
2595
2596 if (arg_link_journal == LINK_HOST) {
2597 /* don't create parents here — if the host doesn't have
2598 * permanent journal set up, don't force it here */
2599
2600 r = RET_NERRNO(mkdir(p, 0755));
2601 if (r < 0 && r != -EEXIST) {
2602 if (try) {
2603 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2604 return 0;
2605 } else
2606 return log_error_errno(r, "Failed to create %s: %m", p);
2607 }
2608
2609 } else if (access(p, F_OK) < 0)
2610 return 0;
2611
2612 if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2613 log_warning("%s is not empty, proceeding anyway.", q);
2614
2615 r = userns_mkdir(directory, p, 0755, 0, 0);
2616 if (r < 0)
2617 return log_error_errno(r, "Failed to create %s: %m", q);
2618
2619 r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2620 if (r < 0)
2621 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2622
2623 return 0;
2624 }
2625
2626 static int drop_capabilities(uid_t uid) {
2627 CapabilityQuintet q;
2628
2629 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2630 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2631 * arg_caps_retain. */
2632
2633 if (capability_quintet_is_set(&arg_full_capabilities)) {
2634 q = arg_full_capabilities;
2635
2636 if (q.bounding == UINT64_MAX)
2637 q.bounding = uid == 0 ? arg_caps_retain : 0;
2638
2639 if (q.effective == UINT64_MAX)
2640 q.effective = uid == 0 ? q.bounding : 0;
2641
2642 if (q.inheritable == UINT64_MAX)
2643 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2644
2645 if (q.permitted == UINT64_MAX)
2646 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2647
2648 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2649 q.ambient = arg_caps_ambient;
2650
2651 if (capability_quintet_mangle(&q))
2652 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2653
2654 } else {
2655 q = (CapabilityQuintet) {
2656 .bounding = arg_caps_retain,
2657 .effective = uid == 0 ? arg_caps_retain : 0,
2658 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2659 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2660 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2661 };
2662
2663 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2664 * in order to maintain the same behavior as systemd < 242. */
2665 if (capability_quintet_mangle(&q))
2666 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2667 "Some capabilities will not be set because they are not in the current bounding set.");
2668
2669 }
2670
2671 return capability_quintet_enforce(&q);
2672 }
2673
2674 static int reset_audit_loginuid(void) {
2675 _cleanup_free_ char *p = NULL;
2676 int r;
2677
2678 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2679 return 0;
2680
2681 r = read_one_line_file("/proc/self/loginuid", &p);
2682 if (r == -ENOENT)
2683 return 0;
2684 if (r < 0)
2685 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2686
2687 /* Already reset? */
2688 if (streq(p, "4294967295"))
2689 return 0;
2690
2691 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2692 if (r < 0) {
2693 log_error_errno(r,
2694 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2695 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2696 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2697 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2698 "using systemd-nspawn. Sleeping for 5s... (%m)");
2699
2700 sleep(5);
2701 }
2702
2703 return 0;
2704 }
2705
2706 static int mount_tunnel_dig(const char *root) {
2707 const char *p, *q;
2708 int r;
2709
2710 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2711 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2712 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2713 (void) mkdir_p(p, 0600);
2714
2715 r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2716 if (r < 0)
2717 return log_error_errno(r, "Failed to create /run/host: %m");
2718
2719 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
2720 if (r < 0)
2721 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
2722
2723 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
2724 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2725 if (r < 0)
2726 return r;
2727
2728 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2729 if (r < 0)
2730 return r;
2731
2732 return 0;
2733 }
2734
2735 static int mount_tunnel_open(void) {
2736 int r;
2737
2738 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2739 if (r < 0)
2740 return r;
2741
2742 return 0;
2743 }
2744
2745 static int setup_machine_id(const char *directory) {
2746 int r;
2747
2748 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2749 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2750 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2751 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2752 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2753 * container behaves nicely). */
2754
2755 r = id128_get_machine(directory, &arg_uuid);
2756 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2757 /* If the file is missing, empty, or uninitialized, we don't mind */
2758 if (sd_id128_is_null(arg_uuid)) {
2759 r = sd_id128_randomize(&arg_uuid);
2760 if (r < 0)
2761 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2762 }
2763 } else if (r < 0)
2764 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2765
2766 return 0;
2767 }
2768
2769 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2770 int r;
2771
2772 assert(directory);
2773
2774 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2775 return 0;
2776
2777 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2778 if (r == -EOPNOTSUPP)
2779 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2780 if (r == -EBADE)
2781 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2782 if (r < 0)
2783 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2784 if (r == 0)
2785 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2786 else
2787 log_debug("Patched directory tree to match UID/GID range.");
2788
2789 return r;
2790 }
2791
2792 /*
2793 * Return values:
2794 * < 0 : wait_for_terminate() failed to get the state of the
2795 * container, the container was terminated by a signal, or
2796 * failed for an unknown reason. No change is made to the
2797 * container argument.
2798 * > 0 : The program executed in the container terminated with an
2799 * error. The exit code of the program executed in the
2800 * container is returned. The container argument has been set
2801 * to CONTAINER_TERMINATED.
2802 * 0 : The container is being rebooted, has been shut down or exited
2803 * successfully. The container argument has been set to either
2804 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2805 *
2806 * That is, success is indicated by a return value of zero, and an
2807 * error is indicated by a non-zero value.
2808 */
2809 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2810 siginfo_t status;
2811 int r;
2812
2813 r = wait_for_terminate(pid, &status);
2814 if (r < 0)
2815 return log_warning_errno(r, "Failed to wait for container: %m");
2816
2817 switch (status.si_code) {
2818
2819 case CLD_EXITED:
2820 if (status.si_status == 0)
2821 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2822 else
2823 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2824
2825 *container = CONTAINER_TERMINATED;
2826 return status.si_status;
2827
2828 case CLD_KILLED:
2829 if (status.si_status == SIGINT) {
2830 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2831 *container = CONTAINER_TERMINATED;
2832 return 0;
2833
2834 } else if (status.si_status == SIGHUP) {
2835 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2836 *container = CONTAINER_REBOOTED;
2837 return 0;
2838 }
2839
2840 _fallthrough_;
2841 case CLD_DUMPED:
2842 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2843 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2844
2845 default:
2846 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2847 "Container %s failed due to unknown reason.", arg_machine);
2848 }
2849 }
2850
2851 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2852 pid_t pid;
2853
2854 pid = PTR_TO_PID(userdata);
2855 if (pid > 0) {
2856 if (kill(pid, arg_kill_signal) >= 0) {
2857 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2858 sd_event_source_set_userdata(s, NULL);
2859 return 0;
2860 }
2861 }
2862
2863 sd_event_exit(sd_event_source_get_event(s), 0);
2864 return 0;
2865 }
2866
2867 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2868 pid_t pid;
2869
2870 assert(s);
2871 assert(ssi);
2872
2873 pid = PTR_TO_PID(userdata);
2874
2875 for (;;) {
2876 siginfo_t si = {};
2877
2878 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2879 return log_error_errno(errno, "Failed to waitid(): %m");
2880 if (si.si_pid == 0) /* No pending children. */
2881 break;
2882 if (si.si_pid == pid) {
2883 /* The main process we care for has exited. Return from
2884 * signal handler but leave the zombie. */
2885 sd_event_exit(sd_event_source_get_event(s), 0);
2886 break;
2887 }
2888
2889 /* Reap all other children. */
2890 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2891 }
2892
2893 return 0;
2894 }
2895
2896 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2897 pid_t pid;
2898
2899 assert(m);
2900
2901 pid = PTR_TO_PID(userdata);
2902
2903 if (arg_kill_signal > 0) {
2904 log_info("Container termination requested. Attempting to halt container.");
2905 (void) kill(pid, arg_kill_signal);
2906 } else {
2907 log_info("Container termination requested. Exiting.");
2908 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2909 }
2910
2911 return 0;
2912 }
2913
2914 static int determine_names(void) {
2915 int r;
2916
2917 if (arg_template && !arg_directory && arg_machine) {
2918
2919 /* If --template= was specified then we should not
2920 * search for a machine, but instead create a new one
2921 * in /var/lib/machine. */
2922
2923 arg_directory = path_join("/var/lib/machines", arg_machine);
2924 if (!arg_directory)
2925 return log_oom();
2926 }
2927
2928 if (!arg_image && !arg_directory) {
2929 if (arg_machine) {
2930 _cleanup_(image_unrefp) Image *i = NULL;
2931
2932 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
2933 if (r == -ENOENT)
2934 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2937
2938 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2939 r = free_and_strdup(&arg_image, i->path);
2940 else
2941 r = free_and_strdup(&arg_directory, i->path);
2942 if (r < 0)
2943 return log_oom();
2944
2945 if (!arg_ephemeral)
2946 arg_read_only = arg_read_only || i->read_only;
2947 } else {
2948 r = safe_getcwd(&arg_directory);
2949 if (r < 0)
2950 return log_error_errno(r, "Failed to determine current directory: %m");
2951 }
2952
2953 if (!arg_directory && !arg_image)
2954 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
2955 }
2956
2957 if (!arg_machine) {
2958 if (arg_directory && path_equal(arg_directory, "/"))
2959 arg_machine = gethostname_malloc();
2960 else if (arg_image) {
2961 char *e;
2962
2963 r = path_extract_filename(arg_image, &arg_machine);
2964 if (r < 0)
2965 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
2966
2967 /* Truncate suffix if there is one */
2968 e = endswith(arg_machine, ".raw");
2969 if (e)
2970 *e = 0;
2971 } else {
2972 r = path_extract_filename(arg_directory, &arg_machine);
2973 if (r < 0)
2974 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
2975 }
2976
2977 hostname_cleanup(arg_machine);
2978 if (!hostname_is_valid(arg_machine, 0))
2979 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
2980
2981 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
2982 * to match fixed config file names. */
2983 arg_settings_filename = strjoin(arg_machine, ".nspawn");
2984 if (!arg_settings_filename)
2985 return log_oom();
2986
2987 /* Add a random suffix when this is an ephemeral machine, so that we can run many
2988 * instances at once without manually having to specify -M each time. */
2989 if (arg_ephemeral)
2990 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
2991 return log_oom();
2992 } else {
2993 arg_settings_filename = strjoin(arg_machine, ".nspawn");
2994 if (!arg_settings_filename)
2995 return log_oom();
2996 }
2997
2998 return 0;
2999 }
3000
/* Resolves the path stored in *p via chase() with the given flags and replaces *p with the result. A
 * NULL *p is left alone. Returns 0 on success (or if there was nothing to do), or a negative errno-style
 * value after logging if resolution fails. */
static int chase_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase(*p, NULL, flags, &resolved, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                return free_and_replace(*p, resolved);
        }

        return 0;
}
3016
3017 static int determine_uid_shift(const char *directory) {
3018
3019 if (arg_userns_mode == USER_NAMESPACE_NO) {
3020 arg_uid_shift = 0;
3021 return 0;
3022 }
3023
3024 if (arg_uid_shift == UID_INVALID) {
3025 struct stat st;
3026
3027 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3028
3029 if (stat(directory, &st) < 0)
3030 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3031
3032 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3033
3034 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3035 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3036 "UID and GID base of %s don't match.", directory);
3037
3038 arg_uid_range = UINT32_C(0x10000);
3039
3040 if (arg_uid_shift != 0) {
3041 /* If the image is shifted already, then we'll fall back to classic chowning, for
3042 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3043
3044 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3045 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3046 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3047 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3048 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3049 "UID base of %s is not zero, UID mapping not supported.", directory);
3050 }
3051 }
3052
3053 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3054 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3055
3056 return 0;
3057 }
3058
3059 static unsigned long effective_clone_ns_flags(void) {
3060 unsigned long flags = arg_clone_ns_flags;
3061
3062 if (arg_private_network)
3063 flags |= CLONE_NEWNET;
3064 if (arg_use_cgns)
3065 flags |= CLONE_NEWCGROUP;
3066 if (arg_userns_mode != USER_NAMESPACE_NO)
3067 flags |= CLONE_NEWUSER;
3068
3069 return flags;
3070 }
3071
3072 static int patch_sysctl(void) {
3073
3074 /* This table is inspired by runc's sysctl() function */
3075 static const struct {
3076 const char *key;
3077 bool prefix;
3078 unsigned long clone_flags;
3079 } safe_sysctl[] = {
3080 { "kernel.hostname", false, CLONE_NEWUTS },
3081 { "kernel.domainname", false, CLONE_NEWUTS },
3082 { "kernel.msgmax", false, CLONE_NEWIPC },
3083 { "kernel.msgmnb", false, CLONE_NEWIPC },
3084 { "kernel.msgmni", false, CLONE_NEWIPC },
3085 { "kernel.sem", false, CLONE_NEWIPC },
3086 { "kernel.shmall", false, CLONE_NEWIPC },
3087 { "kernel.shmmax", false, CLONE_NEWIPC },
3088 { "kernel.shmmni", false, CLONE_NEWIPC },
3089 { "fs.mqueue.", true, CLONE_NEWIPC },
3090 { "net.", true, CLONE_NEWNET },
3091 };
3092
3093 unsigned long flags;
3094 int r;
3095
3096 flags = effective_clone_ns_flags();
3097
3098 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3099 bool good = false;
3100 size_t i;
3101
3102 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3103
3104 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3105 continue;
3106
3107 if (safe_sysctl[i].prefix)
3108 good = startswith(*k, safe_sysctl[i].key);
3109 else
3110 good = streq(*k, safe_sysctl[i].key);
3111
3112 if (good)
3113 break;
3114 }
3115
3116 if (!good)
3117 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3118
3119 r = sysctl_write(*k, *v);
3120 if (r < 0)
3121 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3122 }
3123
3124 return 0;
3125 }
3126
3127 static int inner_child(
3128 Barrier *barrier,
3129 int fd_inner_socket,
3130 FDSet *fds,
3131 char **os_release_pairs) {
3132
3133 _cleanup_free_ char *home = NULL;
3134 size_t n_env = 1;
3135 char *envp[] = {
3136 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3137 NULL, /* container */
3138 NULL, /* TERM */
3139 NULL, /* HOME */
3140 NULL, /* USER */
3141 NULL, /* LOGNAME */
3142 NULL, /* container_uuid */
3143 NULL, /* LISTEN_FDS */
3144 NULL, /* LISTEN_PID */
3145 NULL, /* NOTIFY_SOCKET */
3146 NULL, /* CREDENTIALS_DIRECTORY */
3147 NULL, /* LANG */
3148 NULL
3149 };
3150 const char *exec_target;
3151 _cleanup_strv_free_ char **env_use = NULL;
3152 int r, which_failed;
3153
3154 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3155 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3156 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3157 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3158 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3159 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3160 * namespace.
3161 *
3162 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3163 * unshare(). See below. */
3164
3165 assert(barrier);
3166 assert(fd_inner_socket >= 0);
3167
3168 log_debug("Inner child is initializing.");
3169
3170 if (arg_userns_mode != USER_NAMESPACE_NO) {
3171 /* Tell the parent, that it now can write the UID map. */
3172 (void) barrier_place(barrier); /* #1 */
3173
3174 /* Wait until the parent wrote the UID map */
3175 if (!barrier_place_and_sync(barrier)) /* #2 */
3176 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3177
3178 /* Become the new root user inside our namespace */
3179 r = reset_uid_gid();
3180 if (r < 0)
3181 return log_error_errno(r, "Couldn't become new root: %m");
3182
3183 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3184 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3185 * propagation, but simply create new peer groups for all our mounts). */
3186 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3187 if (r < 0)
3188 return r;
3189 }
3190
3191 r = mount_all(NULL,
3192 arg_mount_settings | MOUNT_IN_USERNS,
3193 arg_uid_shift,
3194 arg_selinux_apifs_context);
3195 if (r < 0)
3196 return r;
3197
3198 if (!arg_network_namespace_path && arg_private_network) {
3199 r = unshare(CLONE_NEWNET);
3200 if (r < 0)
3201 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3202
3203 /* Tell the parent that it can setup network interfaces. */
3204 (void) barrier_place(barrier); /* #3 */
3205 }
3206
3207 r = mount_sysfs(NULL, arg_mount_settings);
3208 if (r < 0)
3209 return r;
3210
3211 /* Wait until we are cgroup-ified, so that we
3212 * can mount the right cgroup path writable */
3213 if (!barrier_place_and_sync(barrier)) /* #4 */
3214 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3215 "Parent died too early");
3216
3217 if (arg_use_cgns) {
3218 r = unshare(CLONE_NEWCGROUP);
3219 if (r < 0)
3220 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3221 r = mount_cgroups(
3222 "",
3223 arg_unified_cgroup_hierarchy,
3224 arg_userns_mode != USER_NAMESPACE_NO,
3225 arg_uid_shift,
3226 arg_uid_range,
3227 arg_selinux_apifs_context,
3228 true);
3229 } else
3230 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3231 if (r < 0)
3232 return r;
3233
3234 r = setup_boot_id();
3235 if (r < 0)
3236 return r;
3237
3238 r = setup_kmsg(fd_inner_socket);
3239 if (r < 0)
3240 return r;
3241
3242 r = mount_custom(
3243 "/",
3244 arg_custom_mounts,
3245 arg_n_custom_mounts,
3246 0,
3247 0,
3248 arg_selinux_apifs_context,
3249 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3250 if (r < 0)
3251 return r;
3252
3253 if (setsid() < 0)
3254 return log_error_errno(errno, "setsid() failed: %m");
3255
3256 if (arg_private_network)
3257 (void) loopback_setup();
3258
3259 if (arg_expose_ports) {
3260 r = expose_port_send_rtnl(fd_inner_socket);
3261 if (r < 0)
3262 return r;
3263 }
3264
3265 if (arg_console_mode != CONSOLE_PIPE) {
3266 _cleanup_close_ int master = -EBADF;
3267 _cleanup_free_ char *console = NULL;
3268
3269 /* Allocate a pty and make it available as /dev/console. */
3270 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3271 if (master < 0)
3272 return log_error_errno(master, "Failed to allocate a pty: %m");
3273
3274 r = setup_dev_console(console);
3275 if (r < 0)
3276 return log_error_errno(r, "Failed to set up /dev/console: %m");
3277
3278 r = send_one_fd(fd_inner_socket, master, 0);
3279 if (r < 0)
3280 return log_error_errno(r, "Failed to send master fd: %m");
3281
3282 r = setup_stdio_as_dev_console();
3283 if (r < 0)
3284 return r;
3285 }
3286
3287 r = patch_sysctl();
3288 if (r < 0)
3289 return r;
3290
3291 if (arg_oom_score_adjust_set) {
3292 r = set_oom_score_adjust(arg_oom_score_adjust);
3293 if (r < 0)
3294 return log_error_errno(r, "Failed to adjust OOM score: %m");
3295 }
3296
3297 if (arg_cpu_set.set)
3298 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3299 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3300
3301 (void) setup_hostname();
3302
3303 if (arg_personality != PERSONALITY_INVALID) {
3304 r = safe_personality(arg_personality);
3305 if (r < 0)
3306 return log_error_errno(r, "personality() failed: %m");
3307 #ifdef ARCHITECTURE_SECONDARY
3308 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
3309 r = safe_personality(PER_LINUX32);
3310 if (r < 0)
3311 return log_error_errno(r, "personality() failed: %m");
3312 #endif
3313 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3314 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3315 "invocation with qemu userspace emulator (or equivalent) in effect.",
3316 architecture_to_string(arg_architecture));
3317
3318 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3319 if (r < 0)
3320 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3321
3322 #if HAVE_SECCOMP
3323 if (arg_seccomp) {
3324
3325 if (is_seccomp_available()) {
3326 r = seccomp_load(arg_seccomp);
3327 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3328 return log_error_errno(r, "Failed to install seccomp filter: %m");
3329 if (r < 0)
3330 log_debug_errno(r, "Failed to install seccomp filter: %m");
3331 }
3332 } else
3333 #endif
3334 {
3335 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3336 if (r < 0)
3337 return r;
3338 }
3339
3340 if (arg_suppress_sync) {
3341 #if HAVE_SECCOMP
3342 r = seccomp_suppress_sync();
3343 if (r < 0)
3344 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3345 #else
3346 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3347 #endif
3348 }
3349
3350 #if HAVE_SELINUX
3351 if (arg_selinux_context)
3352 if (setexeccon(arg_selinux_context) < 0)
3353 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3354 #endif
3355
3356 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3357 * if we need to later on. */
3358 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3359 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3360
3361 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3362 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3363 else
3364 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3365 if (r < 0)
3366 return r;
3367
3368 r = drop_capabilities(getuid());
3369 if (r < 0)
3370 return log_error_errno(r, "Dropping capabilities failed: %m");
3371
3372 if (arg_no_new_privileges)
3373 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3374 return log_error_errno(errno, "Failed to disable new privileges: %m");
3375
3376 /* LXC sets container=lxc, so follow the scheme here */
3377 envp[n_env++] = strjoina("container=", arg_container_service_name);
3378
3379 envp[n_env] = strv_find_prefix(environ, "TERM=");
3380 if (envp[n_env])
3381 n_env++;
3382
3383 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3384 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3385 return log_oom();
3386
3387 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3388 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3389 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
3390 return log_oom();
3391
3392 assert(!sd_id128_is_null(arg_uuid));
3393
3394 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3395 return log_oom();
3396
3397 if (!fdset_isempty(fds)) {
3398 r = fdset_cloexec(fds, false);
3399 if (r < 0)
3400 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3401
3402 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3403 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3404 return log_oom();
3405 }
3406 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3407 return log_oom();
3408
3409 if (arg_credentials.n_credentials > 0) {
3410 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3411 if (!envp[n_env])
3412 return log_oom();
3413 n_env++;
3414 }
3415
3416 if (arg_start_mode != START_BOOT) {
3417 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
3418 if (!envp[n_env])
3419 return log_oom();
3420 n_env++;
3421 }
3422
3423 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3424 if (!env_use)
3425 return log_oom();
3426
3427 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
3428 if (!barrier_place_and_sync(barrier)) /* #5 */
3429 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3430
3431 if (arg_chdir)
3432 if (chdir(arg_chdir) < 0)
3433 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3434
3435 if (arg_start_mode == START_PID2) {
3436 r = stub_pid1(arg_uuid);
3437 if (r < 0)
3438 return r;
3439 }
3440
3441 if (arg_console_mode != CONSOLE_PIPE) {
3442 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3443 * are configured for that. Acquire it as controlling tty. */
3444 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3445 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3446 }
3447
3448 log_debug("Inner child completed, invoking payload.");
3449
3450 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3451 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3452 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3453 log_close();
3454 log_set_open_when_needed(true);
3455 log_settle_target();
3456
3457 (void) fdset_close_others(fds);
3458
3459 if (arg_start_mode == START_BOOT) {
3460 char **a;
3461 size_t m;
3462
3463 /* Automatically search for the init system */
3464
3465 m = strv_length(arg_parameters);
3466 a = newa(char*, m + 2);
3467 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3468 a[1 + m] = NULL;
3469
3470 FOREACH_STRING(init,
3471 "/usr/lib/systemd/systemd",
3472 "/lib/systemd/systemd",
3473 "/sbin/init") {
3474 a[0] = (char*) init;
3475 execve(a[0], a, env_use);
3476 }
3477
3478 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3479 } else if (!strv_isempty(arg_parameters)) {
3480 const char *dollar_path;
3481
3482 exec_target = arg_parameters[0];
3483
3484 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3485 * binary. */
3486 dollar_path = strv_env_get(env_use, "PATH");
3487 if (dollar_path) {
3488 if (setenv("PATH", dollar_path, 1) < 0)
3489 return log_error_errno(errno, "Failed to update $PATH: %m");
3490 }
3491
3492 execvpe(arg_parameters[0], arg_parameters, env_use);
3493 } else {
3494 if (!arg_chdir)
3495 /* If we cannot change the directory, we'll end up in /, that is expected. */
3496 (void) chdir(home ?: "/root");
3497
3498 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3499 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3500 execle("/bin/bash", "-bash", NULL, env_use);
3501 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3502 execle("/bin/sh", "-sh", NULL, env_use);
3503
3504 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3505 }
3506
3507 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3508 }
3509
3510 static int setup_notify_child(void) {
3511 _cleanup_close_ int fd = -EBADF;
3512 static const union sockaddr_union sa = {
3513 .un.sun_family = AF_UNIX,
3514 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3515 };
3516 int r;
3517
3518 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3519 if (fd < 0)
3520 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3521
3522 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3523 (void) sockaddr_un_unlink(&sa.un);
3524
3525 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3526 if (r < 0)
3527 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3528
3529 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3530 if (r < 0)
3531 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3532
3533 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3534 if (r < 0)
3535 return log_error_errno(r, "SO_PASSCRED failed: %m");
3536
3537 return TAKE_FD(fd);
3538 }
3539
3540 static int outer_child(
3541 Barrier *barrier,
3542 const char *directory,
3543 DissectedImage *dissected_image,
3544 int fd_outer_socket,
3545 int fd_inner_socket,
3546 FDSet *fds,
3547 int netns_fd) {
3548
3549 _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3550 _cleanup_strv_free_ char **os_release_pairs = NULL;
3551 _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
3552 bool idmap = false;
3553 const char *p;
3554 pid_t pid;
3555 ssize_t l;
3556 int r;
3557
3558 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3559 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3560 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3561 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3562 * forked off it, and it exits. */
3563
3564 assert(barrier);
3565 assert(directory);
3566 assert(fd_outer_socket >= 0);
3567 assert(fd_inner_socket >= 0);
3568
3569 log_debug("Outer child is initializing.");
3570
3571 r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3572 if (r < 0)
3573 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3574
3575 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3576 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3577
3578 r = reset_audit_loginuid();
3579 if (r < 0)
3580 return r;
3581
3582 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3583 * mounts to the real root. */
3584 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3585 if (r < 0)
3586 return r;
3587
3588 if (dissected_image) {
3589 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3590 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3591 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3592 * right place right away. This makes sure ESP partitions and userns are compatible. */
3593
3594 r = dissected_image_mount_and_warn(
3595 dissected_image,
3596 directory,
3597 arg_uid_shift,
3598 arg_uid_range,
3599 /* userns_fd= */ -EBADF,
3600 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3601 DISSECT_IMAGE_DISCARD_ON_LOOP|
3602 DISSECT_IMAGE_USR_NO_ROOT|
3603 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3604 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3605 if (r < 0)
3606 return r;
3607 }
3608
3609 r = determine_uid_shift(directory);
3610 if (r < 0)
3611 return r;
3612
3613 if (arg_userns_mode != USER_NAMESPACE_NO) {
3614 r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
3615 if (r < 0)
3616 return log_error_errno(r, "Failed to pin outer mount namespace: %m");
3617
3618 l = send_one_fd(fd_outer_socket, mntns_fd, 0);
3619 if (l < 0)
3620 return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
3621 mntns_fd = safe_close(mntns_fd);
3622
3623 /* Let the parent know which UID shift we read from the image */
3624 l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3625 if (l < 0)
3626 return log_error_errno(errno, "Failed to send UID shift: %m");
3627 if (l != sizeof(arg_uid_shift))
3628 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3629 "Short write while sending UID shift.");
3630
3631 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3632 /* When we are supposed to pick the UID shift, the parent will check now whether the
3633 * UID shift we just read from the image is available. If yes, it will send the UID
3634 * shift back to us, if not it will pick a different one, and send it back to us. */
3635
3636 l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3637 if (l < 0)
3638 return log_error_errno(errno, "Failed to recv UID shift: %m");
3639 if (l != sizeof(arg_uid_shift))
3640 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3641 "Short read while receiving UID shift.");
3642 }
3643
3644 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3645 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3646 }
3647
3648 if (path_equal(directory, "/")) {
3649 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3650 * place, so that we can make changes to its mount structure (for example, to implement
3651 * --volatile=) without this interfering with our ability to access files such as
3652 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3653 * (instead of a temporary directory, since we are living in our own mount namespace here
3654 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3655 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3656
3657 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3658 if (r < 0)
3659 return r;
3660
3661 directory = "/run/systemd/nspawn-root";
3662 }
3663
3664 /* Make sure we always have a mount that we can move to root later on. */
3665 r = make_mount_point(directory);
3666 if (r < 0)
3667 return r;
3668
3669 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3670 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3671 * we'll live in our own little world from now on, and propagation from the host may only happen via
3672 * the mount tunnel dir, or not at all. */
3673 r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
3674 if (r < 0)
3675 return r;
3676
3677 r = setup_pivot_root(
3678 directory,
3679 arg_pivot_root_new,
3680 arg_pivot_root_old);
3681 if (r < 0)
3682 return r;
3683
3684 r = setup_volatile_mode(
3685 directory,
3686 arg_volatile_mode,
3687 arg_uid_shift,
3688 arg_selinux_apifs_context);
3689 if (r < 0)
3690 return r;
3691
3692 r = bind_user_prepare(
3693 directory,
3694 arg_bind_user,
3695 arg_uid_shift,
3696 arg_uid_range,
3697 &arg_custom_mounts, &arg_n_custom_mounts,
3698 &bind_user_context);
3699 if (r < 0)
3700 return r;
3701
3702 if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3703 /* Send the user maps we determined to the parent, so that it installs it in our user
3704 * namespace UID map table */
3705
3706 for (size_t i = 0; i < bind_user_context->n_data; i++) {
3707 uid_t map[] = {
3708 bind_user_context->data[i].payload_user->uid,
3709 bind_user_context->data[i].host_user->uid,
3710 (uid_t) bind_user_context->data[i].payload_group->gid,
3711 (uid_t) bind_user_context->data[i].host_group->gid,
3712 };
3713
3714 l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
3715 if (l < 0)
3716 return log_error_errno(errno, "Failed to send user UID map: %m");
3717 if (l != sizeof(map))
3718 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3719 "Short write while sending user UID map.");
3720 }
3721 }
3722
3723 r = mount_custom(
3724 directory,
3725 arg_custom_mounts,
3726 arg_n_custom_mounts,
3727 arg_uid_shift,
3728 arg_uid_range,
3729 arg_selinux_apifs_context,
3730 MOUNT_ROOT_ONLY);
3731 if (r < 0)
3732 return r;
3733
3734 if (arg_userns_mode != USER_NAMESPACE_NO &&
3735 IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3736 arg_uid_shift != 0) {
3737 _cleanup_free_ char *usr_subtree = NULL;
3738 char *dirs[3];
3739 size_t i = 0;
3740
3741 dirs[i++] = (char*) directory;
3742
3743 if (dissected_image && dissected_image->partitions[PARTITION_USR].found) {
3744 usr_subtree = path_join(directory, "/usr");
3745 if (!usr_subtree)
3746 return log_oom();
3747
3748 dirs[i++] = usr_subtree;
3749 }
3750
3751 dirs[i] = NULL;
3752
3753 r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
3754 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3755 /* This might fail because the kernel or file system doesn't support idmapping. We
3756 * can't really distinguish this nicely, nor do we have any guarantees about the
3757 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3758 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3759 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3760 "ID mapped mounts are apparently not available, sorry.");
3761
3762 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3763 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3764 } else if (r < 0)
3765 return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3766 else {
3767 log_debug("ID mapped mounts available, making use of them.");
3768 idmap = true;
3769 }
3770 }
3771
3772 if (dissected_image) {
3773 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3774 r = dissected_image_mount(
3775 dissected_image,
3776 directory,
3777 arg_uid_shift,
3778 arg_uid_range,
3779 /* userns_fd= */ -EBADF,
3780 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3781 DISSECT_IMAGE_DISCARD_ON_LOOP|
3782 DISSECT_IMAGE_USR_NO_ROOT|
3783 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3784 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3785 if (r == -EUCLEAN)
3786 return log_error_errno(r, "File system check for image failed: %m");
3787 if (r < 0)
3788 return log_error_errno(r, "Failed to mount image file system: %m");
3789 }
3790
3791 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3792 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3793
3794 r = detect_unified_cgroup_hierarchy_from_image(directory);
3795 if (r < 0)
3796 return r;
3797
3798 l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3799 if (l < 0)
3800 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3801 if (l != sizeof(arg_unified_cgroup_hierarchy))
3802 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3803 "Short write while sending cgroup mode.");
3804 }
3805
3806 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3807 if (r < 0)
3808 return r;
3809
3810 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3811 if (r < 0)
3812 return r;
3813
3814 if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3815 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3816 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3817 if (r < 0)
3818 return log_error_errno(r, "Failed to make tree read-only: %m");
3819 }
3820
3821 r = mount_all(directory,
3822 arg_mount_settings,
3823 arg_uid_shift,
3824 arg_selinux_apifs_context);
3825 if (r < 0)
3826 return r;
3827
3828 r = copy_devnodes(directory);
3829 if (r < 0)
3830 return r;
3831
3832 r = make_extra_nodes(directory);
3833 if (r < 0)
3834 return r;
3835
3836 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3837
3838 p = prefix_roota(directory, "/run/host");
3839 (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3840
3841 r = setup_pts(directory);
3842 if (r < 0)
3843 return r;
3844
3845 r = mount_tunnel_dig(directory);
3846 if (r < 0)
3847 return r;
3848
3849 r = setup_keyring();
3850 if (r < 0)
3851 return r;
3852
3853 r = setup_credentials(directory);
3854 if (r < 0)
3855 return r;
3856
3857 r = bind_user_setup(bind_user_context, directory);
3858 if (r < 0)
3859 return r;
3860
3861 r = mount_custom(
3862 directory,
3863 arg_custom_mounts,
3864 arg_n_custom_mounts,
3865 arg_uid_shift,
3866 arg_uid_range,
3867 arg_selinux_apifs_context,
3868 MOUNT_NON_ROOT_ONLY);
3869 if (r < 0)
3870 return r;
3871
3872 r = setup_timezone(directory);
3873 if (r < 0)
3874 return r;
3875
3876 r = setup_resolv_conf(directory);
3877 if (r < 0)
3878 return r;
3879
3880 r = setup_machine_id(directory);
3881 if (r < 0)
3882 return r;
3883
3884 r = setup_journal(directory);
3885 if (r < 0)
3886 return r;
3887
3888 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3889 p = prefix_roota(directory, "/run/host/container-manager");
3890 (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3891
3892 /* The same stuff as the $container_uuid env var */
3893 p = prefix_roota(directory, "/run/host/container-uuid");
3894 (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3895
3896 if (!arg_use_cgns) {
3897 r = mount_cgroups(
3898 directory,
3899 arg_unified_cgroup_hierarchy,
3900 arg_userns_mode != USER_NAMESPACE_NO,
3901 arg_uid_shift,
3902 arg_uid_range,
3903 arg_selinux_apifs_context,
3904 false);
3905 if (r < 0)
3906 return r;
3907 }
3908
3909 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3910 * mounts available in systemd services inside the container that create a new mount namespace. See
3911 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3912 * will inherit the shared propagation mode.
3913 *
3914 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3915 * directory mount to root later on.
3916 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3917 */
3918 r = mount_switch_root(directory, MS_SHARED);
3919 if (r < 0)
3920 return log_error_errno(r, "Failed to move root directory: %m");
3921
3922 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
3923 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
3924 * the container. */
3925 r = mount_tunnel_open();
3926 if (r < 0)
3927 return r;
3928
3929 if (arg_userns_mode != USER_NAMESPACE_NO) {
3930 /* In order to mount procfs and sysfs in an unprivileged container the kernel
3931 * requires that a fully visible instance is already present in the target mount
3932 * namespace. Mount one here so the inner child can mount its own instances. Later
3933 * we umount the temporary instances created here before we actually exec the
3934 * payload. Since the rootfs is shared the umount will propagate into the container.
3935 * Note, the inner child wouldn't be able to unmount the instances on its own since
3936 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
3937 * this. */
3938 r = pin_fully_visible_fs();
3939 if (r < 0)
3940 return r;
3941 }
3942
3943 fd = setup_notify_child();
3944 if (fd < 0)
3945 return fd;
3946
3947 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3948 arg_clone_ns_flags |
3949 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3950 if (pid < 0)
3951 return log_error_errno(errno, "Failed to fork inner child: %m");
3952 if (pid == 0) {
3953 fd_outer_socket = safe_close(fd_outer_socket);
3954
3955 /* The inner child has all namespaces that are requested, so that we all are owned by the
3956 * user if user namespaces are turned on. */
3957
3958 if (arg_network_namespace_path) {
3959 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3960 if (r < 0)
3961 return log_error_errno(r, "Failed to join network namespace: %m");
3962 }
3963
3964 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
3965 if (r < 0)
3966 _exit(EXIT_FAILURE);
3967
3968 _exit(EXIT_SUCCESS);
3969 }
3970
3971 l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3972 if (l < 0)
3973 return log_error_errno(errno, "Failed to send PID: %m");
3974 if (l != sizeof(pid))
3975 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3976 "Short write while sending PID.");
3977
3978 l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3979 if (l < 0)
3980 return log_error_errno(errno, "Failed to send machine ID: %m");
3981 if (l != sizeof(arg_uuid))
3982 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3983 "Short write while sending machine ID.");
3984
3985 l = send_one_fd(fd_outer_socket, fd, 0);
3986 if (l < 0)
3987 return log_error_errno(l, "Failed to send notify fd: %m");
3988
3989 fd_outer_socket = safe_close(fd_outer_socket);
3990 fd_inner_socket = safe_close(fd_inner_socket);
3991 netns_fd = safe_close(netns_fd);
3992
3993 return 0;
3994 }
3995
3996 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3997 bool tried_hashed = false;
3998 unsigned n_tries = 100;
3999 uid_t candidate;
4000 int r;
4001
4002 assert(shift);
4003 assert(ret_lock_file);
4004 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4005 assert(arg_uid_range == 0x10000U);
4006
4007 candidate = *shift;
4008
4009 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4010
4011 for (;;) {
4012 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4013 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4014
4015 if (--n_tries <= 0)
4016 return -EBUSY;
4017
4018 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4019 goto next;
4020 if ((candidate & UINT32_C(0xFFFF)) != 0)
4021 goto next;
4022
4023 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4024 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4025 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4026 goto next;
4027 if (r < 0)
4028 return r;
4029
4030 /* Make some superficial checks whether the range is currently known in the user database */
4031 if (getpwuid(candidate))
4032 goto next;
4033 if (getpwuid(candidate + UINT32_C(0xFFFE)))
4034 goto next;
4035 if (getgrgid(candidate))
4036 goto next;
4037 if (getgrgid(candidate + UINT32_C(0xFFFE)))
4038 goto next;
4039
4040 *ret_lock_file = lf;
4041 lf = (struct LockFile) LOCK_FILE_INIT;
4042 *shift = candidate;
4043 return 0;
4044
4045 next:
4046 if (arg_machine && !tried_hashed) {
4047 /* Try to hash the base from the container name */
4048
4049 static const uint8_t hash_key[] = {
4050 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4051 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4052 };
4053
4054 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4055
4056 tried_hashed = true;
4057 } else
4058 random_bytes(&candidate, sizeof(candidate));
4059
4060 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4061 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4062 }
4063 }
4064
4065 static int add_one_uid_map(
4066 char **p,
4067 uid_t container_uid,
4068 uid_t host_uid,
4069 uid_t range) {
4070
4071 return strextendf(p,
4072 UID_FMT " " UID_FMT " " UID_FMT "\n",
4073 container_uid, host_uid, range);
4074 }
4075
4076 static int make_uid_map_string(
4077 const uid_t bind_user_uid[],
4078 size_t n_bind_user_uid,
4079 size_t offset,
4080 char **ret) {
4081
4082 _cleanup_free_ char *s = NULL;
4083 uid_t previous_uid = 0;
4084 int r;
4085
4086 assert(n_bind_user_uid == 0 || bind_user_uid);
4087 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4088 assert(ret);
4089
4090 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4091 * quadruplet, consisting of host and container UID + GID. */
4092
4093 for (size_t i = 0; i < n_bind_user_uid; i++) {
4094 uid_t payload_uid = bind_user_uid[i*4+offset],
4095 host_uid = bind_user_uid[i*4+offset+1];
4096
4097 assert(previous_uid <= payload_uid);
4098 assert(payload_uid < arg_uid_range);
4099
4100 /* Add a range to close the gap to previous entry */
4101 if (payload_uid > previous_uid) {
4102 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4103 if (r < 0)
4104 return r;
4105 }
4106
4107 /* Map this specific user */
4108 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4109 if (r < 0)
4110 return r;
4111
4112 previous_uid = payload_uid + 1;
4113 }
4114
4115 /* And add a range to close the gap to finish the range */
4116 if (arg_uid_range > previous_uid) {
4117 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4118 if (r < 0)
4119 return r;
4120 }
4121
4122 assert(s);
4123
4124 *ret = TAKE_PTR(s);
4125 return 0;
4126 }
4127
4128 static int setup_uid_map(
4129 pid_t pid,
4130 const uid_t bind_user_uid[],
4131 size_t n_bind_user_uid) {
4132
4133 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4134 _cleanup_free_ char *s = NULL;
4135 int r;
4136
4137 assert(pid > 1);
4138
4139 /* Build the UID map string */
4140 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4141 return log_oom();
4142
4143 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4144 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4145 if (r < 0)
4146 return log_error_errno(r, "Failed to write UID map: %m");
4147
4148 /* And now build the GID map string */
4149 s = mfree(s);
4150 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4151 return log_oom();
4152
4153 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4154 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4155 if (r < 0)
4156 return log_error_errno(r, "Failed to write GID map: %m");
4157
4158 return 0;
4159 }
4160
4161 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4162 char buf[NOTIFY_BUFFER_MAX+1];
4163 char *p = NULL;
4164 struct iovec iovec = {
4165 .iov_base = buf,
4166 .iov_len = sizeof(buf)-1,
4167 };
4168 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4169 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4170 struct msghdr msghdr = {
4171 .msg_iov = &iovec,
4172 .msg_iovlen = 1,
4173 .msg_control = &control,
4174 .msg_controllen = sizeof(control),
4175 };
4176 struct ucred *ucred;
4177 ssize_t n;
4178 pid_t inner_child_pid;
4179 _cleanup_strv_free_ char **tags = NULL;
4180 int r;
4181
4182 assert(userdata);
4183
4184 inner_child_pid = PTR_TO_PID(userdata);
4185
4186 if (revents != EPOLLIN) {
4187 log_warning("Got unexpected poll event for notify fd.");
4188 return 0;
4189 }
4190
4191 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4192 if (ERRNO_IS_NEG_TRANSIENT(n))
4193 return 0;
4194 else if (n == -EXFULL) {
4195 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4196 return 0;
4197 } else if (n < 0)
4198 return log_warning_errno(n, "Couldn't read notification socket: %m");
4199
4200 cmsg_close_all(&msghdr);
4201
4202 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4203 if (!ucred || ucred->pid != inner_child_pid) {
4204 log_debug("Received notify message without valid credentials. Ignoring.");
4205 return 0;
4206 }
4207
4208 if ((size_t) n >= sizeof(buf)) {
4209 log_warning("Received notify message exceeded maximum size. Ignoring.");
4210 return 0;
4211 }
4212
4213 buf[n] = 0;
4214 tags = strv_split(buf, "\n\r");
4215 if (!tags)
4216 return log_oom();
4217
4218 if (strv_contains(tags, "READY=1")) {
4219 r = sd_notify(false, "READY=1\n");
4220 if (r < 0)
4221 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4222 }
4223
4224 p = strv_find_startswith(tags, "STATUS=");
4225 if (p)
4226 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4227
4228 return 0;
4229 }
4230
4231 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4232 int r;
4233
4234 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4235 if (r < 0)
4236 return log_error_errno(r, "Failed to allocate notify event source: %m");
4237
4238 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4239
4240 return 0;
4241 }
4242
4243 static int merge_settings(Settings *settings, const char *path) {
4244 int rl;
4245
4246 assert(settings);
4247 assert(path);
4248
4249 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4250 * that this steals the fields of the Settings* structure, and hence modifies it. */
4251
4252 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4253 settings->start_mode >= 0) {
4254 arg_start_mode = settings->start_mode;
4255 strv_free_and_replace(arg_parameters, settings->parameters);
4256 }
4257
4258 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4259 settings->ephemeral >= 0)
4260 arg_ephemeral = settings->ephemeral;
4261
4262 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4263 settings->root) {
4264
4265 if (!arg_settings_trusted)
4266 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4267 else
4268 free_and_replace(arg_directory, settings->root);
4269 }
4270
4271 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4272 settings->pivot_root_new) {
4273 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4274 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4275 }
4276
4277 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4278 settings->working_directory)
4279 free_and_replace(arg_chdir, settings->working_directory);
4280
4281 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4282 settings->environment)
4283 strv_free_and_replace(arg_setenv, settings->environment);
4284
4285 if ((arg_settings_mask & SETTING_USER) == 0) {
4286
4287 if (settings->user)
4288 free_and_replace(arg_user, settings->user);
4289
4290 if (uid_is_valid(settings->uid))
4291 arg_uid = settings->uid;
4292 if (gid_is_valid(settings->gid))
4293 arg_gid = settings->gid;
4294 if (settings->n_supplementary_gids > 0) {
4295 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4296 arg_n_supplementary_gids = settings->n_supplementary_gids;
4297 }
4298 }
4299
4300 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4301 uint64_t plus, minus;
4302 uint64_t network_minus = 0;
4303 uint64_t ambient;
4304
4305 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4306 * Settings structure */
4307
4308 plus = settings->capability;
4309 minus = settings->drop_capability;
4310
4311 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4312 settings_network_configured(settings)) {
4313 if (settings_private_network(settings))
4314 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4315 else
4316 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4317 }
4318
4319 if (!arg_settings_trusted && plus != 0) {
4320 if (settings->capability != 0)
4321 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4322 } else {
4323 arg_caps_retain &= ~network_minus;
4324 arg_caps_retain |= plus;
4325 }
4326
4327 arg_caps_retain &= ~minus;
4328
4329 /* Copy the full capabilities over too */
4330 if (capability_quintet_is_set(&settings->full_capabilities)) {
4331 if (!arg_settings_trusted)
4332 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4333 else
4334 arg_full_capabilities = settings->full_capabilities;
4335 }
4336
4337 ambient = settings->ambient_capability;
4338 if (!arg_settings_trusted && ambient != 0)
4339 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4340 else
4341 arg_caps_ambient |= ambient;
4342 }
4343
4344 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4345 settings->kill_signal > 0)
4346 arg_kill_signal = settings->kill_signal;
4347
4348 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4349 settings->personality != PERSONALITY_INVALID)
4350 arg_personality = settings->personality;
4351
4352 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4353 !sd_id128_is_null(settings->machine_id)) {
4354
4355 if (!arg_settings_trusted)
4356 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4357 else
4358 arg_uuid = settings->machine_id;
4359 }
4360
4361 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4362 settings->read_only >= 0)
4363 arg_read_only = settings->read_only;
4364
4365 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4366 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4367 arg_volatile_mode = settings->volatile_mode;
4368
4369 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4370 settings->n_custom_mounts > 0) {
4371
4372 if (!arg_settings_trusted)
4373 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4374 else {
4375 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4376 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4377 arg_n_custom_mounts = settings->n_custom_mounts;
4378 settings->n_custom_mounts = 0;
4379 }
4380 }
4381
4382 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4383 settings_network_configured(settings)) {
4384
4385 if (!arg_settings_trusted)
4386 log_warning("Ignoring network settings, file %s is not trusted.", path);
4387 else {
4388 arg_network_veth = settings_network_veth(settings);
4389 arg_private_network = settings_private_network(settings);
4390
4391 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4392 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4393 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4394 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4395
4396 free_and_replace(arg_network_bridge, settings->network_bridge);
4397 free_and_replace(arg_network_zone, settings->network_zone);
4398
4399 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4400 }
4401 }
4402
4403 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4404 settings->expose_ports) {
4405
4406 if (!arg_settings_trusted)
4407 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4408 else {
4409 expose_port_free_all(arg_expose_ports);
4410 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4411 }
4412 }
4413
4414 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4415 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4416
4417 if (!arg_settings_trusted)
4418 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4419 else {
4420 arg_userns_mode = settings->userns_mode;
4421 arg_uid_shift = settings->uid_shift;
4422 arg_uid_range = settings->uid_range;
4423 arg_userns_ownership = settings->userns_ownership;
4424 }
4425 }
4426
4427 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4428 !strv_isempty(settings->bind_user))
4429 strv_free_and_replace(arg_bind_user, settings->bind_user);
4430
4431 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4432 settings->notify_ready >= 0)
4433 arg_notify_ready = settings->notify_ready;
4434
4435 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4436
4437 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4438 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4439 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4440 else {
4441 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4442 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4443 }
4444 }
4445
4446 #if HAVE_SECCOMP
4447 if (settings->seccomp) {
4448 if (!arg_settings_trusted)
4449 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4450 else {
4451 seccomp_release(arg_seccomp);
4452 arg_seccomp = TAKE_PTR(settings->seccomp);
4453 }
4454 }
4455 #endif
4456 }
4457
4458 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4459 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4460 continue;
4461
4462 if (!settings->rlimit[rl])
4463 continue;
4464
4465 if (!arg_settings_trusted) {
4466 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4467 continue;
4468 }
4469
4470 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4471 }
4472
4473 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4474 settings->hostname)
4475 free_and_replace(arg_hostname, settings->hostname);
4476
4477 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4478 settings->no_new_privileges >= 0)
4479 arg_no_new_privileges = settings->no_new_privileges;
4480
4481 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4482 settings->oom_score_adjust_set) {
4483
4484 if (!arg_settings_trusted)
4485 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4486 else {
4487 arg_oom_score_adjust = settings->oom_score_adjust;
4488 arg_oom_score_adjust_set = true;
4489 }
4490 }
4491
4492 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4493 settings->cpu_set.set) {
4494
4495 if (!arg_settings_trusted)
4496 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4497 else {
4498 cpu_set_reset(&arg_cpu_set);
4499 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
4500 }
4501 }
4502
4503 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4504 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4505 arg_resolv_conf = settings->resolv_conf;
4506
4507 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4508 settings->link_journal != _LINK_JOURNAL_INVALID) {
4509
4510 if (!arg_settings_trusted)
4511 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4512 else {
4513 arg_link_journal = settings->link_journal;
4514 arg_link_journal_try = settings->link_journal_try;
4515 }
4516 }
4517
4518 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4519 settings->timezone != _TIMEZONE_MODE_INVALID)
4520 arg_timezone = settings->timezone;
4521
4522 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4523 settings->slice) {
4524
4525 if (!arg_settings_trusted)
4526 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4527 else
4528 free_and_replace(arg_slice, settings->slice);
4529 }
4530
4531 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4532 settings->use_cgns >= 0) {
4533
4534 if (!arg_settings_trusted)
4535 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4536 else
4537 arg_use_cgns = settings->use_cgns;
4538 }
4539
4540 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4541 settings->clone_ns_flags != ULONG_MAX) {
4542
4543 if (!arg_settings_trusted)
4544 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4545 else
4546 arg_clone_ns_flags = settings->clone_ns_flags;
4547 }
4548
4549 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4550 settings->console_mode >= 0) {
4551
4552 if (!arg_settings_trusted)
4553 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4554 else
4555 arg_console_mode = settings->console_mode;
4556 }
4557
4558 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4559 settings->suppress_sync >= 0)
4560 arg_suppress_sync = settings->suppress_sync;
4561
4562 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4563 * don't consult arg_settings_mask for them. */
4564
4565 sd_bus_message_unref(arg_property_message);
4566 arg_property_message = TAKE_PTR(settings->properties);
4567
4568 arg_console_width = settings->console_width;
4569 arg_console_height = settings->console_height;
4570
4571 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4572 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4573 arg_n_extra_nodes = settings->n_extra_nodes;
4574 settings->n_extra_nodes = 0;
4575
4576 return 0;
4577 }
4578
4579 static int load_settings(void) {
4580 _cleanup_(settings_freep) Settings *settings = NULL;
4581 _cleanup_fclose_ FILE *f = NULL;
4582 _cleanup_free_ char *p = NULL;
4583 int r;
4584
4585 if (arg_oci_bundle)
4586 return 0;
4587
4588 /* If all settings are masked, there's no point in looking for
4589 * the settings file */
4590 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4591 return 0;
4592
4593 /* We first look in the admin's directories in /etc and /run */
4594 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4595 _cleanup_free_ char *j = NULL;
4596
4597 j = path_join(i, arg_settings_filename);
4598 if (!j)
4599 return log_oom();
4600
4601 f = fopen(j, "re");
4602 if (f) {
4603 p = TAKE_PTR(j);
4604
4605 /* By default, we trust configuration from /etc and /run */
4606 if (arg_settings_trusted < 0)
4607 arg_settings_trusted = true;
4608
4609 break;
4610 }
4611
4612 if (errno != ENOENT)
4613 return log_error_errno(errno, "Failed to open %s: %m", j);
4614 }
4615
4616 if (!f) {
4617 /* After that, let's look for a file next to the
4618 * actual image we shall boot. */
4619
4620 if (arg_image) {
4621 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4622 if (r < 0)
4623 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4624 } else if (arg_directory) {
4625 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4626 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4627 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
4628 }
4629
4630 if (p) {
4631 f = fopen(p, "re");
4632 if (!f && errno != ENOENT)
4633 return log_error_errno(errno, "Failed to open %s: %m", p);
4634
4635 /* By default, we do not trust configuration from /var/lib/machines */
4636 if (arg_settings_trusted < 0)
4637 arg_settings_trusted = false;
4638 }
4639 }
4640
4641 if (!f)
4642 return 0;
4643
4644 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4645
4646 r = settings_load(f, p, &settings);
4647 if (r < 0)
4648 return r;
4649
4650 return merge_settings(settings, p);
4651 }
4652
4653 static int load_oci_bundle(void) {
4654 _cleanup_(settings_freep) Settings *settings = NULL;
4655 int r;
4656
4657 if (!arg_oci_bundle)
4658 return 0;
4659
4660 /* By default let's trust OCI bundles */
4661 if (arg_settings_trusted < 0)
4662 arg_settings_trusted = true;
4663
4664 r = oci_load(NULL, arg_oci_bundle, &settings);
4665 if (r < 0)
4666 return r;
4667
4668 return merge_settings(settings, arg_oci_bundle);
4669 }
4670
4671 static int run_container(
4672 DissectedImage *dissected_image,
4673 FDSet *fds,
4674 char veth_name[IFNAMSIZ], bool *veth_created,
4675 struct ExposeArgs *expose_args,
4676 int *master, pid_t *pid, int *ret) {
4677
4678 static const struct sigaction sa = {
4679 .sa_handler = nop_signal_handler,
4680 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4681 };
4682
4683 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4684 _cleanup_close_ int etc_passwd_lock = -EBADF;
4685 _cleanup_close_pair_ int
4686 fd_inner_socket_pair[2] = EBADF_PAIR,
4687 fd_outer_socket_pair[2] = EBADF_PAIR;
4688
4689 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
4690 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4691 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4692 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4693 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4694 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4695 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4696 _cleanup_free_ uid_t *bind_user_uid = NULL;
4697 size_t n_bind_user_uid = 0;
4698 ContainerStatus container_status = 0;
4699 int ifi = 0, r;
4700 ssize_t l;
4701 sigset_t mask_chld;
4702 _cleanup_close_ int child_netns_fd = -EBADF;
4703
4704 assert_se(sigemptyset(&mask_chld) == 0);
4705 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4706
4707 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4708 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4709 * check with getpwuid() if the specific user already exists. Note that /etc might be
4710 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4711 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4712 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4713 * really ours. */
4714
4715 etc_passwd_lock = take_etc_passwd_lock(NULL);
4716 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4717 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4718 }
4719
4720 r = barrier_create(&barrier);
4721 if (r < 0)
4722 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4723
4724 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
4725 return log_error_errno(errno, "Failed to create inner socket pair: %m");
4726
4727 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
4728 return log_error_errno(errno, "Failed to create outer socket pair: %m");
4729
4730 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4731 * parent's blocking calls and give it a chance to call wait() and terminate. */
4732 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4733 if (r < 0)
4734 return log_error_errno(errno, "Failed to change the signal mask: %m");
4735
4736 r = sigaction(SIGCHLD, &sa, NULL);
4737 if (r < 0)
4738 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4739
4740 if (arg_network_namespace_path) {
4741 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4742 if (child_netns_fd < 0)
4743 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4744
4745 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4746 if (r == -EUCLEAN)
4747 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4748 else if (r < 0)
4749 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4750 else if (r == 0)
4751 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4752 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4753 }
4754
4755 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4756 if (*pid < 0)
4757 return log_error_errno(errno, "clone() failed%s: %m",
4758 errno == EINVAL ?
4759 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4760
4761 if (*pid == 0) {
4762 /* The outer child only has a file system namespace. */
4763 barrier_set_role(&barrier, BARRIER_CHILD);
4764
4765 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
4766 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
4767
4768 (void) reset_all_signal_handlers();
4769 (void) reset_signal_mask();
4770
4771 r = outer_child(&barrier,
4772 arg_directory,
4773 dissected_image,
4774 fd_outer_socket_pair[1],
4775 fd_inner_socket_pair[1],
4776 fds,
4777 child_netns_fd);
4778 if (r < 0)
4779 _exit(EXIT_FAILURE);
4780
4781 _exit(EXIT_SUCCESS);
4782 }
4783
4784 barrier_set_role(&barrier, BARRIER_PARENT);
4785
4786 fdset_close(fds);
4787
4788 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
4789 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
4790
4791 if (arg_userns_mode != USER_NAMESPACE_NO) {
4792 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
4793 if (mntns_fd < 0)
4794 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
4795
4796 /* The child just let us know the UID shift it might have read from the image. */
4797 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4798 if (l < 0)
4799 return log_error_errno(errno, "Failed to read UID shift: %m");
4800 if (l != sizeof arg_uid_shift)
4801 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4802
4803 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4804 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4805 * image, but if that's already in use, pick a new one, and report back to the child,
4806 * which one we now picked. */
4807
4808 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4809 if (r < 0)
4810 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4811
4812 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4813 if (l < 0)
4814 return log_error_errno(errno, "Failed to send UID shift: %m");
4815 if (l != sizeof arg_uid_shift)
4816 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4817 }
4818
4819 n_bind_user_uid = strv_length(arg_bind_user);
4820 if (n_bind_user_uid > 0) {
4821 /* Right after the UID shift, we'll receive the list of UID mappings for the
4822 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4823
4824 bind_user_uid = new(uid_t, n_bind_user_uid*4);
4825 if (!bind_user_uid)
4826 return log_oom();
4827
4828 for (size_t i = 0; i < n_bind_user_uid; i++) {
4829 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4830 if (l < 0)
4831 return log_error_errno(errno, "Failed to read user UID map pair: %m");
4832 if (l != sizeof(uid_t)*4)
4833 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4834 SYNTHETIC_ERRNO(EIO),
4835 "Short read while reading bind user UID pairs.");
4836 }
4837 }
4838 }
4839
4840 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4841 /* The child let us know the support cgroup mode it might have read from the image. */
4842 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4843 if (l < 0)
4844 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4845 if (l != sizeof(arg_unified_cgroup_hierarchy))
4846 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
4847 l, l == 0 ? " The child is most likely dead." : "");
4848 }
4849
4850 /* Wait for the outer child. */
4851 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4852 if (r < 0)
4853 return r;
4854 if (r != EXIT_SUCCESS)
4855 return -EIO;
4856
4857 /* And now retrieve the PID of the inner child. */
4858 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
4859 if (l < 0)
4860 return log_error_errno(errno, "Failed to read inner child PID: %m");
4861 if (l != sizeof *pid)
4862 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4863
4864 /* We also retrieve container UUID in case it was generated by outer child */
4865 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4866 if (l < 0)
4867 return log_error_errno(errno, "Failed to read container machine ID: %m");
4868 if (l != sizeof(arg_uuid))
4869 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4870
4871 /* We also retrieve the socket used for notifications generated by outer child */
4872 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
4873 if (notify_socket < 0)
4874 return log_error_errno(notify_socket,
4875 "Failed to receive notification socket from the outer child: %m");
4876
4877 log_debug("Init process invoked as PID "PID_FMT, *pid);
4878
4879 if (arg_userns_mode != USER_NAMESPACE_NO) {
4880 if (!barrier_place_and_sync(&barrier)) /* #1 */
4881 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4882
4883 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4884 if (r < 0)
4885 return r;
4886
4887 (void) barrier_place(&barrier); /* #2 */
4888 }
4889
4890 if (arg_private_network) {
4891 if (!arg_network_namespace_path) {
4892 /* Wait until the child has unshared its network namespace. */
4893 if (!barrier_place_and_sync(&barrier)) /* #3 */
4894 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4895 }
4896
4897 if (child_netns_fd < 0) {
4898 /* Make sure we have an open file descriptor to the child's network
4899 * namespace so it stays alive even if the child exits. */
4900 r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4901 if (r < 0)
4902 return log_error_errno(r, "Failed to open child network namespace: %m");
4903 }
4904
4905 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4906 if (r < 0)
4907 return r;
4908
4909 if (arg_network_veth) {
4910 r = setup_veth(arg_machine, *pid, veth_name,
4911 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
4912 if (r < 0)
4913 return r;
4914 else if (r > 0)
4915 ifi = r;
4916
4917 if (arg_network_bridge) {
4918 /* Add the interface to a bridge */
4919 r = setup_bridge(veth_name, arg_network_bridge, false);
4920 if (r < 0)
4921 return r;
4922 if (r > 0)
4923 ifi = r;
4924 } else if (arg_network_zone) {
4925 /* Add the interface to a bridge, possibly creating it */
4926 r = setup_bridge(veth_name, arg_network_zone, true);
4927 if (r < 0)
4928 return r;
4929 if (r > 0)
4930 ifi = r;
4931 }
4932 }
4933
4934 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4935 if (r < 0)
4936 return r;
4937
4938 /* We created the primary and extra veth links now; let's remember this, so that we know to
4939 remove them later on. Note that we don't bother with removing veth links that were created
4940 here when their setup failed half-way, because in that case the kernel should be able to
4941 remove them on its own, since they cannot be referenced by anything yet. */
4942 *veth_created = true;
4943
4944 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4945 if (r < 0)
4946 return r;
4947
4948 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4949 if (r < 0)
4950 return r;
4951 }
4952
4953 if (arg_register || !arg_keep_unit) {
4954 r = sd_bus_default_system(&bus);
4955 if (r < 0)
4956 return log_error_errno(r, "Failed to open system bus: %m");
4957
4958 r = sd_bus_set_close_on_exit(bus, false);
4959 if (r < 0)
4960 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4961 }
4962
4963 if (!arg_keep_unit) {
4964 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4965 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4966 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4967
4968 r = sd_bus_match_signal_async(
4969 bus,
4970 NULL,
4971 "org.freedesktop.systemd1",
4972 NULL,
4973 "org.freedesktop.systemd1.Scope",
4974 "RequestStop",
4975 on_request_stop, NULL, PID_TO_PTR(*pid));
4976 if (r < 0)
4977 return log_error_errno(r, "Failed to request RequestStop match: %m");
4978 }
4979
4980 if (arg_register) {
4981 r = register_machine(
4982 bus,
4983 arg_machine,
4984 *pid,
4985 arg_directory,
4986 arg_uuid,
4987 ifi,
4988 arg_slice,
4989 arg_custom_mounts, arg_n_custom_mounts,
4990 arg_kill_signal,
4991 arg_property,
4992 arg_property_message,
4993 arg_keep_unit,
4994 arg_container_service_name,
4995 arg_start_mode);
4996 if (r < 0)
4997 return r;
4998
4999 } else if (!arg_keep_unit) {
5000 r = allocate_scope(
5001 bus,
5002 arg_machine,
5003 *pid,
5004 arg_slice,
5005 arg_custom_mounts, arg_n_custom_mounts,
5006 arg_kill_signal,
5007 arg_property,
5008 arg_property_message,
5009 /* allow_pidfds= */ true,
5010 arg_start_mode);
5011 if (r < 0)
5012 return r;
5013
5014 } else if (arg_slice || arg_property)
5015 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5016
5017 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5018 if (r < 0)
5019 return r;
5020
5021 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5022 if (r < 0)
5023 return r;
5024
5025 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5026 if (r < 0)
5027 return r;
5028
5029 /* Notify the child that the parent is ready with all
5030 * its setup (including cgroup-ification), and that
5031 * the child can now hand over control to the code to
5032 * run inside the container. */
5033 (void) barrier_place(&barrier); /* #4 */
5034
5035 /* Block SIGCHLD here, before notifying child.
5036 * process_pty() will handle it with the other signals. */
5037 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5038
5039 /* Reset signal to default */
5040 r = default_signals(SIGCHLD);
5041 if (r < 0)
5042 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5043
5044 r = sd_event_new(&event);
5045 if (r < 0)
5046 return log_error_errno(r, "Failed to get default event source: %m");
5047
5048 (void) sd_event_set_watchdog(event, true);
5049
5050 if (bus) {
5051 r = sd_bus_attach_event(bus, event, 0);
5052 if (r < 0)
5053 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5054 }
5055
5056 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5057 if (r < 0)
5058 return r;
5059
5060 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5061 * before we take the fully visible instances away. */
5062 if (!barrier_sync(&barrier)) /* #5.1 */
5063 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5064
5065 if (arg_userns_mode != USER_NAMESPACE_NO) {
5066 r = wipe_fully_visible_fs(mntns_fd);
5067 if (r < 0)
5068 return r;
5069 mntns_fd = safe_close(mntns_fd);
5070 }
5071
5072 /* And now let the child know that we completed removing the procfs instances, and it can start the
5073 * payload. */
5074 if (!barrier_place(&barrier)) /* #5.2 */
5075 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5076
5077 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5078 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5079 etc_passwd_lock = safe_close(etc_passwd_lock);
5080
5081 (void) sd_notifyf(false,
5082 "STATUS=Container running.\n"
5083 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5084 if (!arg_notify_ready) {
5085 r = sd_notify(false, "READY=1\n");
5086 if (r < 0)
5087 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5088 }
5089
5090 if (arg_kill_signal > 0) {
5091 /* Try to kill the init system on SIGINT or SIGTERM */
5092 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5093 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5094 } else {
5095 /* Immediately exit */
5096 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5097 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5098 }
5099
5100 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5101
5102 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5103 if (r < 0)
5104 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5105
5106 /* Exit when the child exits */
5107 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5108
5109 /* Retrieve the kmsg fifo allocated by inner child */
5110 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5111 if (fd_kmsg_fifo < 0)
5112 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5113
5114 if (arg_expose_ports) {
5115 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
5116 if (r < 0)
5117 return r;
5118
5119 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5120 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5121 }
5122
5123 if (arg_console_mode != CONSOLE_PIPE) {
5124 _cleanup_close_ int fd = -EBADF;
5125 PTYForwardFlags flags = 0;
5126
5127 /* Retrieve the master pty allocated by inner child */
5128 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5129 if (fd < 0)
5130 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5131
5132 switch (arg_console_mode) {
5133
5134 case CONSOLE_READ_ONLY:
5135 flags |= PTY_FORWARD_READ_ONLY;
5136
5137 _fallthrough_;
5138
5139 case CONSOLE_INTERACTIVE:
5140 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5141
5142 r = pty_forward_new(event, fd, flags, &forward);
5143 if (r < 0)
5144 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5145
5146 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5147 (void) pty_forward_set_width_height(forward,
5148 arg_console_width,
5149 arg_console_height);
5150 break;
5151
5152 default:
5153 assert(arg_console_mode == CONSOLE_PASSIVE);
5154 }
5155
5156 *master = TAKE_FD(fd);
5157 }
5158
5159 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5160
5161 r = sd_event_loop(event);
5162 if (r < 0)
5163 return log_error_errno(r, "Failed to run event loop: %m");
5164
5165 if (forward) {
5166 char last_char = 0;
5167
5168 (void) pty_forward_get_last_char(forward, &last_char);
5169 forward = pty_forward_free(forward);
5170
5171 if (!arg_quiet && last_char != '\n')
5172 putc('\n', stdout);
5173 }
5174
5175 /* Kill if it is not dead yet anyway */
5176 if (!arg_register && !arg_keep_unit && bus)
5177 terminate_scope(bus, arg_machine);
5178
5179 /* Normally redundant, but better safe than sorry */
5180 (void) kill(*pid, SIGKILL);
5181
5182 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5183
5184 if (arg_private_network) {
5185 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5186 * to avoid having to move the parent to the child network namespace. */
5187 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_LOG, NULL);
5188 if (r < 0)
5189 return r;
5190
5191 if (r == 0) {
5192 _cleanup_close_ int parent_netns_fd = -EBADF;
5193
5194 r = namespace_open(getpid_cached(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5195 if (r < 0) {
5196 log_error_errno(r, "Failed to open parent network namespace: %m");
5197 _exit(EXIT_FAILURE);
5198 }
5199
5200 r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5201 if (r < 0) {
5202 log_error_errno(r, "Failed to enter child network namespace: %m");
5203 _exit(EXIT_FAILURE);
5204 }
5205
5206 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5207 * This is about ensuring interfaces get their old name back when being moved back. */
5208 arg_network_interfaces = strv_reverse(arg_network_interfaces);
5209
5210 r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5211 if (r < 0)
5212 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5213
5214 _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5215 }
5216 }
5217
5218 r = wait_for_container(TAKE_PID(*pid), &container_status);
5219
5220 /* Tell machined that we are gone. */
5221 if (bus)
5222 (void) unregister_machine(bus, arg_machine);
5223
5224 if (r < 0)
5225 /* We failed to wait for the container, or the container exited abnormally. */
5226 return r;
5227 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5228 /* r > 0 → The container exited with a non-zero status.
5229 * As a special case, we need to replace 133 with a different value,
5230 * because 133 is special-cased in the service file to reboot the container.
5231 * otherwise → The container exited with zero status and a reboot was not requested.
5232 */
5233 if (r == EXIT_FORCE_RESTART)
5234 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5235 *ret = r;
5236 return 0; /* finito */
5237 }
5238
5239 /* CONTAINER_REBOOTED, loop again */
5240
5241 if (arg_keep_unit) {
5242 /* Special handling if we are running as a service: instead of simply
5243 * restarting the machine we want to restart the entire service, so let's
5244 * inform systemd about this with the special exit code 133. The service
5245 * file uses RestartForceExitStatus=133 so that this results in a full
5246 * nspawn restart. This is necessary since we might have cgroup parameters
5247 * set we want to have flushed out. */
5248 *ret = EXIT_FORCE_RESTART;
5249 return 0; /* finito */
5250 }
5251
5252 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5253 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5254
5255 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5256 *veth_created = false;
5257 return 1; /* loop again */
5258 }
5259
5260 static int initialize_rlimits(void) {
5261 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5262 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5263 * container execution environments. */
5264
5265 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5266 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5267 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5268 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5269 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5270 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5271 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5272 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5273 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5274 [RLIMIT_NICE] = { 0, 0 },
5275 [RLIMIT_NOFILE] = { 1024, 4096 },
5276 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5277 [RLIMIT_RTPRIO] = { 0, 0 },
5278 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5279 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5280
5281 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5282 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5283 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5284 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5285 * that PID 1 changes a number of other resource limits during early initialization which is why we
5286 * don't read the other limits from PID 1 but prefer the static table above. */
5287 };
5288
5289 int rl, r;
5290
5291 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5292 /* Let's only fill in what the user hasn't explicitly configured anyway */
5293 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5294 const struct rlimit *v;
5295 struct rlimit buffer;
5296
5297 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5298 /* For these two let's read the limits off PID 1. See above for an explanation. */
5299
5300 r = pid_getrlimit(1, rl, &buffer);
5301 if (r < 0)
5302 return log_error_errno(r, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5303
5304 v = &buffer;
5305 } else if (rl == RLIMIT_NOFILE) {
5306 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5307 * userspace. Given that nspawn containers are often run without our PID 1,
5308 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5309 * so that container userspace gets similar resources as host userspace
5310 * gets. */
5311 buffer = kernel_defaults[rl];
5312 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5313 v = &buffer;
5314 } else
5315 v = kernel_defaults + rl;
5316
5317 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5318 if (!arg_rlimit[rl])
5319 return log_oom();
5320 }
5321
5322 if (DEBUG_LOGGING) {
5323 _cleanup_free_ char *k = NULL;
5324
5325 (void) rlimit_format(arg_rlimit[rl], &k);
5326 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5327 }
5328 }
5329
5330 return 0;
5331 }
5332
5333 static int cant_be_in_netns(void) {
5334 _cleanup_close_ int fd = -EBADF;
5335 struct ucred ucred;
5336 int r;
5337
5338 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5339 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5340 * nice message. */
5341
5342 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5343 return 0;
5344
5345 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5346 if (fd < 0)
5347 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5348
5349 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5350 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5351 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5352 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5353 if (r < 0)
5354 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5355
5356 r = getpeercred(fd, &ucred);
5357 if (r < 0)
5358 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5359
5360 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
5361 if (r < 0)
5362 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5363 if (r == 0)
5364 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5365 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5366 return 0;
5367 }
5368
5369 static int run(int argc, char *argv[]) {
5370 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5371 _cleanup_close_ int master = -EBADF;
5372 _cleanup_fdset_free_ FDSet *fds = NULL;
5373 int r, n_fd_passed, ret = EXIT_SUCCESS;
5374 char veth_name[IFNAMSIZ] = "";
5375 struct ExposeArgs expose_args = {};
5376 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5377 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5378 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5379 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5380 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5381 pid_t pid = 0;
5382
5383 log_parse_environment();
5384 log_open();
5385
5386 r = parse_argv(argc, argv);
5387 if (r <= 0)
5388 goto finish;
5389
5390 if (geteuid() != 0) {
5391 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5392 argc >= 2 ? "Need to be root." :
5393 "Need to be root (and some arguments are usually required).\nHint: try --help");
5394 goto finish;
5395 }
5396
5397 r = cant_be_in_netns();
5398 if (r < 0)
5399 goto finish;
5400
5401 r = initialize_rlimits();
5402 if (r < 0)
5403 goto finish;
5404
5405 r = load_oci_bundle();
5406 if (r < 0)
5407 goto finish;
5408
5409 r = determine_names();
5410 if (r < 0)
5411 goto finish;
5412
5413 r = load_settings();
5414 if (r < 0)
5415 goto finish;
5416
5417 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
5418 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
5419 * indicate that. */
5420 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
5421 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
5422
5423 r = cg_unified();
5424 if (r < 0) {
5425 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5426 goto finish;
5427 }
5428
5429 r = verify_arguments();
5430 if (r < 0)
5431 goto finish;
5432
5433 r = verify_network_interfaces_initialized();
5434 if (r < 0)
5435 goto finish;
5436
5437 /* Reapply environment settings. */
5438 (void) detect_unified_cgroup_hierarchy_from_environment();
5439
5440 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5441 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5442 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5443 (void) ignore_signals(SIGPIPE);
5444
5445 n_fd_passed = sd_listen_fds(false);
5446 if (n_fd_passed > 0) {
5447 r = fdset_new_listen_fds(&fds, false);
5448 if (r < 0) {
5449 log_error_errno(r, "Failed to collect file descriptors: %m");
5450 goto finish;
5451 }
5452 }
5453
5454 /* The "default" umask. This is appropriate for most file and directory
5455 * operations performed by nspawn, and is the umask that will be used for
5456 * the child. Functions like copy_devnodes() change the umask temporarily. */
5457 umask(0022);
5458
5459 if (arg_directory) {
5460 assert(!arg_image);
5461
5462 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5463 * /var from the host will propagate into container dynamically (because bad things happen if
5464 * two systems write to the same /var). Let's allow it for the special cases where /var is
5465 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5466 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5467 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5468 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5469 goto finish;
5470 }
5471
5472 if (arg_ephemeral) {
5473 _cleanup_free_ char *np = NULL;
5474
5475 r = chase_and_update(&arg_directory, 0);
5476 if (r < 0)
5477 goto finish;
5478
5479 /* If the specified path is a mount point we generate the new snapshot immediately
5480 * inside it under a random name. However if the specified is not a mount point we
5481 * create the new snapshot in the parent directory, just next to it. */
5482 r = path_is_mount_point(arg_directory, NULL, 0);
5483 if (r < 0) {
5484 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5485 goto finish;
5486 }
5487 if (r > 0)
5488 r = tempfn_random_child(arg_directory, "machine.", &np);
5489 else
5490 r = tempfn_random(arg_directory, "machine.", &np);
5491 if (r < 0) {
5492 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5493 goto finish;
5494 }
5495
5496 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5497 * only owned by us and no one else. */
5498 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5499 if (r < 0) {
5500 log_error_errno(r, "Failed to lock %s: %m", np);
5501 goto finish;
5502 }
5503
5504 {
5505 BLOCK_SIGNALS(SIGINT);
5506 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5507 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5508 BTRFS_SNAPSHOT_FALLBACK_COPY |
5509 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5510 BTRFS_SNAPSHOT_RECURSIVE |
5511 BTRFS_SNAPSHOT_QUOTA |
5512 BTRFS_SNAPSHOT_SIGINT);
5513 }
5514 if (r == -EINTR) {
5515 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5516 goto finish;
5517 }
5518 if (r < 0) {
5519 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5520 goto finish;
5521 }
5522
5523 free_and_replace(arg_directory, np);
5524 remove_directory = true;
5525 } else {
5526 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5527 if (r < 0)
5528 goto finish;
5529
5530 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5531 if (r == -EBUSY) {
5532 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5533 goto finish;
5534 }
5535 if (r < 0) {
5536 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5537 goto finish;
5538 }
5539
5540 if (arg_template) {
5541 r = chase_and_update(&arg_template, 0);
5542 if (r < 0)
5543 goto finish;
5544
5545 {
5546 BLOCK_SIGNALS(SIGINT);
5547 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5548 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5549 BTRFS_SNAPSHOT_FALLBACK_COPY |
5550 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5551 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5552 BTRFS_SNAPSHOT_RECURSIVE |
5553 BTRFS_SNAPSHOT_QUOTA |
5554 BTRFS_SNAPSHOT_SIGINT);
5555 }
5556 if (r == -EEXIST)
5557 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5558 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5559 else if (r == -EINTR) {
5560 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5561 goto finish;
5562 } else if (r < 0) {
5563 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5564 goto finish;
5565 } else
5566 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5567 "Populated %s from template %s.", arg_directory, arg_template);
5568 }
5569 }
5570
5571 if (arg_start_mode == START_BOOT) {
5572 _cleanup_free_ char *b = NULL;
5573 const char *p;
5574 int check_os_release, is_os_tree;
5575
5576 if (arg_pivot_root_new) {
5577 b = path_join(arg_directory, arg_pivot_root_new);
5578 if (!b) {
5579 r = log_oom();
5580 goto finish;
5581 }
5582
5583 p = b;
5584 } else
5585 p = arg_directory;
5586
5587 check_os_release = getenv_bool("SYSTEMD_NSPAWN_CHECK_OS_RELEASE");
5588 if (check_os_release < 0 && check_os_release != -ENXIO) {
5589 r = log_error_errno(check_os_release, "Failed to parse $SYSTEMD_NSPAWN_CHECK_OS_RELEASE: %m");
5590 goto finish;
5591 }
5592
5593 is_os_tree = path_is_os_tree(p);
5594 if (is_os_tree == 0 && check_os_release == 0)
5595 log_debug("Directory %s is missing an os-release file, continuing anyway.", p);
5596 else if (is_os_tree <= 0) {
5597 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5598 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5599 goto finish;
5600 }
5601 } else {
5602 _cleanup_free_ char *p = NULL;
5603
5604 if (arg_pivot_root_new)
5605 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5606 else
5607 p = path_join(arg_directory, "/usr/");
5608 if (!p) {
5609 r = log_oom();
5610 goto finish;
5611 }
5612
5613 if (laccess(p, F_OK) < 0) {
5614 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5615 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5616 goto finish;
5617 }
5618 }
5619
5620 } else {
5621 DissectImageFlags dissect_image_flags =
5622 DISSECT_IMAGE_GENERIC_ROOT |
5623 DISSECT_IMAGE_REQUIRE_ROOT |
5624 DISSECT_IMAGE_RELAX_VAR_CHECK |
5625 DISSECT_IMAGE_USR_NO_ROOT |
5626 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
5627 DISSECT_IMAGE_PIN_PARTITION_DEVICES;
5628 assert(arg_image);
5629 assert(!arg_template);
5630
5631 r = chase_and_update(&arg_image, 0);
5632 if (r < 0)
5633 goto finish;
5634
5635 if (arg_ephemeral) {
5636 _cleanup_free_ char *np = NULL;
5637
5638 r = tempfn_random(arg_image, "machine.", &np);
5639 if (r < 0) {
5640 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5641 goto finish;
5642 }
5643
5644 /* Always take an exclusive lock on our own ephemeral copy. */
5645 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5646 if (r < 0) {
5647 log_error_errno(r, "Failed to create image lock: %m");
5648 goto finish;
5649 }
5650
5651 {
5652 BLOCK_SIGNALS(SIGINT);
5653 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
5654 FS_NOCOW_FL, FS_NOCOW_FL,
5655 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
5656 NULL, NULL);
5657 }
5658 if (r == -EINTR) {
5659 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5660 goto finish;
5661 }
5662 if (r < 0) {
5663 r = log_error_errno(r, "Failed to copy image file: %m");
5664 goto finish;
5665 }
5666
5667 free_and_replace(arg_image, np);
5668 remove_image = true;
5669 } else {
5670 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5671 if (r == -EBUSY) {
5672 log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5673 goto finish;
5674 }
5675 if (r < 0) {
5676 log_error_errno(r, "Failed to create image lock: %m");
5677 goto finish;
5678 }
5679
5680 r = verity_settings_load(
5681 &arg_verity_settings,
5682 arg_image, NULL, NULL);
5683 if (r < 0) {
5684 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5685 goto finish;
5686 }
5687
5688 if (arg_verity_settings.data_path)
5689 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5690 }
5691
5692 if (!mkdtemp(tmprootdir)) {
5693 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5694 goto finish;
5695 }
5696
5697 remove_tmprootdir = true;
5698
5699 arg_directory = strdup(tmprootdir);
5700 if (!arg_directory) {
5701 r = log_oom();
5702 goto finish;
5703 }
5704
5705 r = loop_device_make_by_path(
5706 arg_image,
5707 arg_read_only ? O_RDONLY : O_RDWR,
5708 /* sector_size= */ UINT32_MAX,
5709 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5710 LOCK_SH,
5711 &loop);
5712 if (r < 0) {
5713 log_error_errno(r, "Failed to set up loopback block device: %m");
5714 goto finish;
5715 }
5716
5717 r = dissect_loop_device_and_warn(
5718 loop,
5719 &arg_verity_settings,
5720 /* mount_options=*/ NULL,
5721 arg_image_policy ?: &image_policy_container,
5722 dissect_image_flags,
5723 &dissected_image);
5724 if (r == -ENOPKG) {
5725 /* dissect_loop_device_and_warn() already printed a brief error message. Extend on that with more details. */
5726 log_notice("Note that the disk image needs to\n"
5727 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5728 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5729 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5730 " d) or contain a file system without a partition table\n"
5731 "in order to be bootable with systemd-nspawn.");
5732 goto finish;
5733 }
5734 if (r < 0)
5735 goto finish;
5736
5737 r = dissected_image_load_verity_sig_partition(
5738 dissected_image,
5739 loop->fd,
5740 &arg_verity_settings);
5741 if (r < 0)
5742 goto finish;
5743
5744 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5745 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5746 "root hash signature found! Proceeding without integrity checking.", arg_image);
5747
5748 r = dissected_image_decrypt_interactively(
5749 dissected_image,
5750 NULL,
5751 &arg_verity_settings,
5752 0);
5753 if (r < 0)
5754 goto finish;
5755
5756 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5757 if (remove_image && unlink(arg_image) >= 0)
5758 remove_image = false;
5759
5760 if (arg_architecture < 0)
5761 arg_architecture = dissected_image_architecture(dissected_image);
5762 }
5763
5764 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5765 if (r < 0)
5766 goto finish;
5767
5768 if (arg_console_mode < 0)
5769 arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
5770 CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5771
5772 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5773 arg_quiet = true;
5774
5775 if (!arg_quiet)
5776 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
5777 arg_machine, arg_image ?: arg_directory);
5778
5779 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
5780
5781 r = make_reaper_process(true);
5782 if (r < 0) {
5783 log_error_errno(r, "Failed to become subreaper: %m");
5784 goto finish;
5785 }
5786
5787 if (arg_expose_ports) {
5788 r = fw_ctx_new(&fw_ctx);
5789 if (r < 0) {
5790 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5791 goto finish;
5792 }
5793 expose_args.fw_ctx = fw_ctx;
5794 }
5795 for (;;) {
5796 r = run_container(dissected_image,
5797 fds,
5798 veth_name, &veth_created,
5799 &expose_args, &master,
5800 &pid, &ret);
5801 if (r <= 0)
5802 break;
5803 }
5804
5805 finish:
5806 (void) sd_notify(false,
5807 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5808 "STOPPING=1\nSTATUS=Terminating...");
5809
5810 if (pid > 0)
5811 (void) kill(pid, SIGKILL);
5812
5813 /* Try to flush whatever is still queued in the pty */
5814 if (master >= 0) {
5815 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5816 master = safe_close(master);
5817 }
5818
5819 if (pid > 0)
5820 (void) wait_for_terminate(pid, NULL);
5821
5822 pager_close();
5823
5824 if (remove_directory && arg_directory) {
5825 int k;
5826
5827 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5828 if (k < 0)
5829 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5830 }
5831
5832 if (remove_image && arg_image) {
5833 if (unlink(arg_image) < 0)
5834 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5835 }
5836
5837 if (remove_tmprootdir) {
5838 if (rmdir(tmprootdir) < 0)
5839 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5840 }
5841
5842 if (arg_machine) {
5843 const char *p;
5844
5845 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5846 (void) rm_rf(p, REMOVE_ROOT);
5847 }
5848
5849 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
5850 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5851
5852 if (veth_created)
5853 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5854 (void) remove_bridge(arg_network_zone);
5855
5856 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5857 expose_port_free_all(arg_expose_ports);
5858 rlimit_free_all(arg_rlimit);
5859 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5860
5861 if (r < 0)
5862 return r;
5863
5864 return ret;
5865 }
5866
/* Program entry point: this macro invocation wraps run() as the process's main function
 * (macro presumably provided by "main-func.h" — confirm). run() returns r when r < 0, and
 * otherwise 'ret', which it hands to run_container() as an output argument.
 * NOTE(review): the "_WITH_POSITIVE_FAILURE" suffix suggests that positive return values of
 * run() are propagated as a failing exit status — verify against the macro definition. */
5867 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);