]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
man/run0: remove @ syntax for --machine=
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <getopt.h>
5 #include <linux/loop.h>
6 #if HAVE_SELINUX
7 #include <selinux/selinux.h>
8 #endif
9 #include <stdlib.h>
10 #include <sys/file.h>
11 #include <sys/ioctl.h>
12 #include <sys/mount.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <termios.h>
18 #include <unistd.h>
19
20 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
21
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25
26 #include "alloc-util.h"
27 #include "ether-addr-util.h"
28 #include "barrier.h"
29 #include "base-filesystem.h"
30 #include "blkid-util.h"
31 #include "btrfs-util.h"
32 #include "build.h"
33 #include "bus-error.h"
34 #include "bus-locator.h"
35 #include "bus-util.h"
36 #include "cap-list.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
39 #include "chase.h"
40 #include "common-signal.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "creds-util.h"
44 #include "dev-setup.h"
45 #include "discover-image.h"
46 #include "dissect-image.h"
47 #include "env-util.h"
48 #include "escape.h"
49 #include "fd-util.h"
50 #include "fdset.h"
51 #include "fileio.h"
52 #include "format-util.h"
53 #include "fs-util.h"
54 #include "gpt.h"
55 #include "hexdecoct.h"
56 #include "hostname-setup.h"
57 #include "hostname-util.h"
58 #include "id128-util.h"
59 #include "io-util.h"
60 #include "log.h"
61 #include "loop-util.h"
62 #include "loopback-setup.h"
63 #include "machine-credential.h"
64 #include "macro.h"
65 #include "main-func.h"
66 #include "missing_sched.h"
67 #include "mkdir.h"
68 #include "mount-util.h"
69 #include "mountpoint-util.h"
70 #include "namespace-util.h"
71 #include "netlink-util.h"
72 #include "nspawn-bind-user.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-def.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-oci.h"
79 #include "nspawn-patch-uid.h"
80 #include "nspawn-register.h"
81 #include "nspawn-seccomp.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "nspawn-util.h"
86 #include "nspawn.h"
87 #include "nsresource.h"
88 #include "nulstr-util.h"
89 #include "os-util.h"
90 #include "pager.h"
91 #include "parse-argument.h"
92 #include "parse-util.h"
93 #include "pretty-print.h"
94 #include "process-util.h"
95 #include "ptyfwd.h"
96 #include "random-util.h"
97 #include "raw-clone.h"
98 #include "resolve-util.h"
99 #include "rlimit-util.h"
100 #include "rm-rf.h"
101 #include "seccomp-util.h"
102 #include "selinux-util.h"
103 #include "signal-util.h"
104 #include "socket-util.h"
105 #include "stat-util.h"
106 #include "stdio-util.h"
107 #include "string-table.h"
108 #include "string-util.h"
109 #include "strv.h"
110 #include "sysctl-util.h"
111 #include "terminal-util.h"
112 #include "tmpfile-util.h"
113 #include "umask-util.h"
114 #include "unit-name.h"
115 #include "user-util.h"
116 #include "vpick.h"
117
/* Path of the notification socket as seen from inside the container. The payload can use it to talk to
 * nspawn via the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"

/* NOTE(review): presumably the in-container directory used to hand mounts over from the host; its users
 * are outside the visible part of this file — confirm. */
#define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"

/* NOTE(review): looks like a special payload exit status requesting a restart — confirm at the use site. */
#define EXIT_FORCE_RESTART 133
123
/* Final state of a container run.
 * NOTE(review): the code consuming these values is outside the visible part of this file; the names
 * suggest "payload exited for good" vs. "payload asked for a reboot" — confirm there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
128
129 static char *arg_directory = NULL;
130 static char *arg_template = NULL;
131 static char *arg_chdir = NULL;
132 static char *arg_pivot_root_new = NULL;
133 static char *arg_pivot_root_old = NULL;
134 static char *arg_user = NULL;
135 static uid_t arg_uid = UID_INVALID;
136 static gid_t arg_gid = GID_INVALID;
137 static gid_t* arg_supplementary_gids = NULL;
138 static size_t arg_n_supplementary_gids = 0;
139 static sd_id128_t arg_uuid = {};
140 static char *arg_machine = NULL; /* The name used by the host to refer to this */
141 static char *arg_hostname = NULL; /* The name the payload sees by default */
142 static const char *arg_selinux_context = NULL;
143 static const char *arg_selinux_apifs_context = NULL;
144 static char *arg_slice = NULL;
145 static bool arg_private_network = false;
146 static bool arg_read_only = false;
147 static StartMode arg_start_mode = START_PID1;
148 static bool arg_ephemeral = false;
149 static LinkJournal arg_link_journal = LINK_AUTO;
150 static bool arg_link_journal_try = false;
151 static uint64_t arg_caps_retain =
152 (1ULL << CAP_AUDIT_CONTROL) |
153 (1ULL << CAP_AUDIT_WRITE) |
154 (1ULL << CAP_CHOWN) |
155 (1ULL << CAP_DAC_OVERRIDE) |
156 (1ULL << CAP_DAC_READ_SEARCH) |
157 (1ULL << CAP_FOWNER) |
158 (1ULL << CAP_FSETID) |
159 (1ULL << CAP_IPC_OWNER) |
160 (1ULL << CAP_KILL) |
161 (1ULL << CAP_LEASE) |
162 (1ULL << CAP_LINUX_IMMUTABLE) |
163 (1ULL << CAP_MKNOD) |
164 (1ULL << CAP_NET_BIND_SERVICE) |
165 (1ULL << CAP_NET_BROADCAST) |
166 (1ULL << CAP_NET_RAW) |
167 (1ULL << CAP_SETFCAP) |
168 (1ULL << CAP_SETGID) |
169 (1ULL << CAP_SETPCAP) |
170 (1ULL << CAP_SETUID) |
171 (1ULL << CAP_SYS_ADMIN) |
172 (1ULL << CAP_SYS_BOOT) |
173 (1ULL << CAP_SYS_CHROOT) |
174 (1ULL << CAP_SYS_NICE) |
175 (1ULL << CAP_SYS_PTRACE) |
176 (1ULL << CAP_SYS_RESOURCE) |
177 (1ULL << CAP_SYS_TTY_CONFIG);
178 static uint64_t arg_caps_ambient = 0;
179 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
180 static CustomMount *arg_custom_mounts = NULL;
181 static size_t arg_n_custom_mounts = 0;
182 static char **arg_setenv = NULL;
183 static bool arg_quiet = false;
184 static bool arg_register = true;
185 static bool arg_keep_unit = false;
186 static char **arg_network_interfaces = NULL;
187 static char **arg_network_macvlan = NULL;
188 static char **arg_network_ipvlan = NULL;
189 static bool arg_network_veth = false;
190 static char **arg_network_veth_extra = NULL;
191 static char *arg_network_bridge = NULL;
192 static char *arg_network_zone = NULL;
193 static char *arg_network_namespace_path = NULL;
194 struct ether_addr arg_network_provided_mac = {};
195 static PagerFlags arg_pager_flags = 0;
196 static unsigned long arg_personality = PERSONALITY_INVALID;
197 static char *arg_image = NULL;
198 static char *arg_oci_bundle = NULL;
199 static VolatileMode arg_volatile_mode = VOLATILE_NO;
200 static ExposePort *arg_expose_ports = NULL;
201 static char **arg_property = NULL;
202 static sd_bus_message *arg_property_message = NULL;
203 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
206 static int arg_kill_signal = 0;
207 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
208 static SettingsMask arg_settings_mask = 0;
209 static int arg_settings_trusted = -1;
210 static char **arg_parameters = NULL;
211 static const char *arg_container_service_name = "systemd-nspawn";
212 static bool arg_notify_ready = false;
213 static bool arg_use_cgns = true;
214 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
215 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
216 static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
217 static char **arg_syscall_allow_list = NULL;
218 static char **arg_syscall_deny_list = NULL;
219 #if HAVE_SECCOMP
220 static scmp_filter_ctx arg_seccomp = NULL;
221 #endif
222 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
223 static bool arg_no_new_privileges = false;
224 static int arg_oom_score_adjust = 0;
225 static bool arg_oom_score_adjust_set = false;
226 static CPUSet arg_cpu_set = {};
227 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
228 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
229 static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
230 static DeviceNode* arg_extra_nodes = NULL;
231 static size_t arg_n_extra_nodes = 0;
232 static char **arg_sysctl = NULL;
233 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
234 static MachineCredentialContext arg_credentials = {};
235 static char **arg_bind_user = NULL;
236 static bool arg_suppress_sync = false;
237 static char *arg_settings_filename = NULL;
238 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
239 static ImagePolicy *arg_image_policy = NULL;
240 static char *arg_background = NULL;
241 static bool arg_privileged = false;
242
/* Pair each resource-owning arg_* variable with the function that releases it.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * functions run when the program exits. The order of the list is deliberately left untouched in case
 * it determines the order in which the destructors run — confirm. */

/* Plain heap strings/arrays and string vectors */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);

/* Networking */
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);

/* Image, unit properties and payload command line */
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);

/* System call filtering */
STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif

/* Everything else */
STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image_policy, image_policy_freep);
STATIC_DESTRUCTOR_REGISTER(arg_background, freep);
279
280 static int handle_arg_console(const char *arg) {
281 if (streq(arg, "help")) {
282 puts("autopipe\n"
283 "interactive\n"
284 "passive\n"
285 "pipe\n"
286 "read-only");
287 return 0;
288 }
289
290 if (streq(arg, "interactive"))
291 arg_console_mode = CONSOLE_INTERACTIVE;
292 else if (streq(arg, "read-only"))
293 arg_console_mode = CONSOLE_READ_ONLY;
294 else if (streq(arg, "passive"))
295 arg_console_mode = CONSOLE_PASSIVE;
296 else if (streq(arg, "pipe")) {
297 if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
298 log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
299 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
300 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
301 "Proceeding anyway.");
302
303 arg_console_mode = CONSOLE_PIPE;
304 } else if (streq(arg, "autopipe")) {
305 if (isatty(STDIN_FILENO) && isatty(STDOUT_FILENO))
306 arg_console_mode = CONSOLE_INTERACTIVE;
307 else
308 arg_console_mode = CONSOLE_PIPE;
309 } else
310 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
311
312 arg_settings_mask |= SETTING_CONSOLE_MODE;
313 return 1;
314 }
315
316 static int help(void) {
317 _cleanup_free_ char *link = NULL;
318 int r;
319
320 pager_open(arg_pager_flags);
321
322 r = terminal_urlify_man("systemd-nspawn", "1", &link);
323 if (r < 0)
324 return log_oom();
325
326 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
327 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
328 " -h --help Show this help\n"
329 " --version Print version string\n"
330 " -q --quiet Do not show status information\n"
331 " --no-pager Do not pipe output into a pager\n"
332 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
333 "\n%3$sImage:%4$s\n"
334 " -D --directory=PATH Root directory for the container\n"
335 " --template=PATH Initialize root directory from template directory,\n"
336 " if missing\n"
337 " -x --ephemeral Run container with snapshot of root directory, and\n"
338 " remove it after exit\n"
339 " -i --image=PATH Root file system disk image (or device node) for\n"
340 " the container\n"
341 " --image-policy=POLICY Specify disk image dissection policy\n"
342 " --oci-bundle=PATH OCI bundle directory\n"
343 " --read-only Mount the root directory read-only\n"
344 " --volatile[=MODE] Run the system in volatile mode\n"
345 " --root-hash=HASH Specify verity root hash for root disk image\n"
346 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
347 " as a DER encoded PKCS7, either as a path to a file\n"
348 " or as an ASCII base64 encoded string prefixed by\n"
349 " 'base64:'\n"
350 " --verity-data=PATH Specify hash device for verity\n"
351 " --pivot-root=PATH[:PATH]\n"
352 " Pivot root to given directory in the container\n"
353 "\n%3$sExecution:%4$s\n"
354 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
355 " -b --boot Boot up full system (i.e. invoke init)\n"
356 " --chdir=PATH Set working directory in the container\n"
357 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
358 " -u --user=USER Run the command under specified user or UID\n"
359 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
360 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
361 " --suppress-sync=BOOLEAN\n"
362 " Suppress any form of disk data synchronization\n"
363 "\n%3$sSystem Identity:%4$s\n"
364 " -M --machine=NAME Set the machine name for the container\n"
365 " --hostname=NAME Override the hostname for the container\n"
366 " --uuid=UUID Set a specific machine UUID for the container\n"
367 "\n%3$sProperties:%4$s\n"
368 " -S --slice=SLICE Place the container in the specified slice\n"
369 " --property=NAME=VALUE Set scope unit property\n"
370 " --register=BOOLEAN Register container as machine\n"
371 " --keep-unit Do not register a scope for the machine, reuse\n"
372 " the service unit nspawn is running in\n"
373 "\n%3$sUser Namespacing:%4$s\n"
374 " --private-users=no Run without user namespacing\n"
375 " --private-users=yes|pick|identity\n"
376 " Run within user namespace, autoselect UID/GID range\n"
377 " --private-users=UIDBASE[:NUIDS]\n"
378 " Similar, but with user configured UID/GID range\n"
379 " --private-users-ownership=MODE\n"
380 " Adjust ('chown') or map ('map') OS tree ownership\n"
381 " to private UID/GID range\n"
382 " -U Equivalent to --private-users=pick and\n"
383 " --private-users-ownership=auto\n"
384 "\n%3$sNetworking:%4$s\n"
385 " --private-network Disable network in container\n"
386 " --network-interface=HOSTIF[:CONTAINERIF]\n"
387 " Assign an existing network interface to the\n"
388 " container\n"
389 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
390 " Create a macvlan network interface based on an\n"
391 " existing network interface to the container\n"
392 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
393 " Create an ipvlan network interface based on an\n"
394 " existing network interface to the container\n"
395 " -n --network-veth Add a virtual Ethernet connection between host\n"
396 " and container\n"
397 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
398 " Add an additional virtual Ethernet link between\n"
399 " host and container\n"
400 " --network-bridge=INTERFACE\n"
401 " Add a virtual Ethernet connection to the container\n"
402 " and attach it to an existing bridge on the host\n"
403 " --network-zone=NAME Similar, but attach the new interface to an\n"
404 " an automatically managed bridge interface\n"
405 " --network-namespace-path=PATH\n"
406 " Set network namespace to the one represented by\n"
407 " the specified kernel namespace file node\n"
408 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
409 " Expose a container IP port on the host\n"
410 "\n%3$sSecurity:%4$s\n"
411 " --capability=CAP In addition to the default, retain specified\n"
412 " capability\n"
413 " --drop-capability=CAP Drop the specified capability from the default set\n"
414 " --ambient-capability=CAP\n"
415 " Sets the specified capability for the started\n"
416 " process. Not useful if booting a machine.\n"
417 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
418 " --system-call-filter=LIST|~LIST\n"
419 " Permit/prohibit specific system calls\n"
420 " -Z --selinux-context=SECLABEL\n"
421 " Set the SELinux security context to be used by\n"
422 " processes in the container\n"
423 " -L --selinux-apifs-context=SECLABEL\n"
424 " Set the SELinux security context to be used by\n"
425 " API/tmpfs file systems in the container\n"
426 "\n%3$sResources:%4$s\n"
427 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
428 " --oom-score-adjust=VALUE\n"
429 " Adjust the OOM score value for the payload\n"
430 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
431 " --personality=ARCH Pick personality for this container\n"
432 "\n%3$sIntegration:%4$s\n"
433 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
434 " --timezone=MODE Select mode of /etc/localtime initialization\n"
435 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
436 " host, try-guest, try-host\n"
437 " -j Equivalent to --link-journal=try-guest\n"
438 "\n%3$sMounts:%4$s\n"
439 " --bind=PATH[:PATH[:OPTIONS]]\n"
440 " Bind mount a file or directory from the host into\n"
441 " the container\n"
442 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
443 " Similar, but creates a read-only bind mount\n"
444 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
445 " it\n"
446 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
447 " --overlay=PATH[:PATH...]:PATH\n"
448 " Create an overlay mount from the host to \n"
449 " the container\n"
450 " --overlay-ro=PATH[:PATH...]:PATH\n"
451 " Similar, but creates a read-only overlay mount\n"
452 " --bind-user=NAME Bind user from host to container\n"
453 "\n%3$sInput/Output:%4$s\n"
454 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
455 " set up for the container.\n"
456 " -P --pipe Equivalent to --console=pipe\n"
457 " --background=COLOR Set ANSI color for background\n"
458 "\n%3$sCredentials:%4$s\n"
459 " --set-credential=ID:VALUE\n"
460 " Pass a credential with literal value to container.\n"
461 " --load-credential=ID:PATH\n"
462 " Load credential to pass to container from file or\n"
463 " AF_UNIX stream socket.\n"
464 "\nSee the %2$s for details.\n",
465 program_invocation_short_name,
466 link,
467 ansi_underline(),
468 ansi_normal(),
469 ansi_highlight(),
470 ansi_normal());
471
472 return 0;
473 }
474
475 static int custom_mount_check_all(void) {
476 size_t i;
477
478 for (i = 0; i < arg_n_custom_mounts; i++) {
479 CustomMount *m = &arg_custom_mounts[i];
480
481 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
482 if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
483 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
484 "--private-users-ownership=own may not be combined with custom root mounts.");
485 if (arg_uid_shift == UID_INVALID)
486 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
487 "--private-users with automatic UID shift may not be combined with custom root mounts.");
488 }
489 }
490
491 return 0;
492 }
493
494 static int detect_unified_cgroup_hierarchy_from_environment(void) {
495 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
496 int r;
497
498 /* Allow the user to control whether the unified hierarchy is used */
499
500 e = getenv(var);
501 if (!e) {
502 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
503 var = "UNIFIED_CGROUP_HIERARCHY";
504 e = getenv(var);
505 }
506
507 if (!isempty(e)) {
508 r = parse_boolean(e);
509 if (r < 0)
510 return log_error_errno(r, "Failed to parse $%s: %m", var);
511 if (r > 0)
512 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
513 else
514 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
515 }
516
517 return 0;
518 }
519
520 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
521 int r;
522
523 if (!arg_privileged) {
524 /* We only support the unified mode when running unprivileged */
525 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
526 return 0;
527 }
528
529 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
530 * in the image actually supports. */
531 r = cg_all_unified();
532 if (r < 0)
533 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
534 if (r > 0) {
535 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
536 * routine only detects 231, so we'll have a false negative here for 230. */
537 r = systemd_installation_has_version(directory, "230");
538 if (r < 0)
539 return log_error_errno(r, "Failed to determine systemd version in container: %m");
540 if (r > 0)
541 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
542 else
543 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
544 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
545 /* Mixed cgroup hierarchy support was added in 233 */
546 r = systemd_installation_has_version(directory, "233");
547 if (r < 0)
548 return log_error_errno(r, "Failed to determine systemd version in container: %m");
549 if (r > 0)
550 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
551 else
552 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
553 } else
554 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
555
556 log_debug("Using %s hierarchy for container.",
557 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
558 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
559
560 return 0;
561 }
562
563 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
564 uint64_t mask = 0;
565 int r;
566
567 for (;;) {
568 _cleanup_free_ char *t = NULL;
569
570 r = extract_first_word(&spec, &t, ",", 0);
571 if (r < 0)
572 return log_error_errno(r, "Failed to parse capability %s.", t);
573 if (r == 0)
574 break;
575
576 if (streq(t, "help")) {
577 for (int i = 0; i < capability_list_length(); i++) {
578 const char *name;
579
580 name = capability_to_name(i);
581 if (name)
582 puts(name);
583 }
584
585 return 0; /* quit */
586 }
587
588 if (streq(t, "all"))
589 mask = UINT64_MAX;
590 else {
591 r = capability_from_name(t);
592 if (r < 0)
593 return log_error_errno(r, "Failed to parse capability %s.", t);
594
595 mask |= 1ULL << r;
596 }
597 }
598
599 *ret_mask = mask;
600 return 1; /* continue */
601 }
602
603 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
604 int r;
605
606 r = getenv_bool(name);
607 if (r == -ENXIO)
608 return 0;
609 if (r < 0)
610 return log_error_errno(r, "Failed to parse $%s: %m", name);
611
612 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
613 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
614 return 0;
615 }
616
617 static int parse_mount_settings_env(void) {
618 const char *e;
619 int r;
620
621 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
622 if (r < 0 && r != -ENXIO)
623 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
624 if (r >= 0)
625 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
626
627 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
628 if (streq_ptr(e, "network"))
629 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
630 else if (e) {
631 r = parse_boolean(e);
632 if (r < 0)
633 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
634
635 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
636 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
637 }
638
639 return 0;
640 }
641
642 static int parse_environment(void) {
643 const char *e;
644 int r;
645
646 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
647 if (r < 0)
648 return r;
649 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
650 if (r < 0)
651 return r;
652 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
653 if (r < 0)
654 return r;
655 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
656 if (r < 0)
657 return r;
658
659 r = parse_mount_settings_env();
660 if (r < 0)
661 return r;
662
663 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
664 * even if it is supported. If not supported, it has no effect. */
665 if (!cg_ns_supported())
666 arg_use_cgns = false;
667 else {
668 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
669 if (r < 0) {
670 if (r != -ENXIO)
671 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
672
673 arg_use_cgns = true;
674 } else {
675 arg_use_cgns = r > 0;
676 arg_settings_mask |= SETTING_USE_CGNS;
677 }
678 }
679
680 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
681 if (e)
682 arg_container_service_name = e;
683
684 e = getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
685 if (e) {
686 r = parse_ether_addr(e, &arg_network_provided_mac);
687 if (r < 0)
688 return log_error_errno(r, "Failed to parse provided MAC address via environment variable");
689 }
690
691 r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
692 if (r >= 0)
693 arg_suppress_sync = r;
694 else if (r != -ENXIO)
695 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
696
697 return detect_unified_cgroup_hierarchy_from_environment();
698 }
699
700 static int parse_argv(int argc, char *argv[]) {
701 enum {
702 ARG_VERSION = 0x100,
703 ARG_PRIVATE_NETWORK,
704 ARG_UUID,
705 ARG_READ_ONLY,
706 ARG_CAPABILITY,
707 ARG_AMBIENT_CAPABILITY,
708 ARG_DROP_CAPABILITY,
709 ARG_LINK_JOURNAL,
710 ARG_BIND,
711 ARG_BIND_RO,
712 ARG_TMPFS,
713 ARG_OVERLAY,
714 ARG_OVERLAY_RO,
715 ARG_INACCESSIBLE,
716 ARG_SHARE_SYSTEM,
717 ARG_REGISTER,
718 ARG_KEEP_UNIT,
719 ARG_NETWORK_INTERFACE,
720 ARG_NETWORK_MACVLAN,
721 ARG_NETWORK_IPVLAN,
722 ARG_NETWORK_BRIDGE,
723 ARG_NETWORK_ZONE,
724 ARG_NETWORK_VETH_EXTRA,
725 ARG_NETWORK_NAMESPACE_PATH,
726 ARG_PERSONALITY,
727 ARG_VOLATILE,
728 ARG_TEMPLATE,
729 ARG_PROPERTY,
730 ARG_PRIVATE_USERS,
731 ARG_KILL_SIGNAL,
732 ARG_SETTINGS,
733 ARG_CHDIR,
734 ARG_PIVOT_ROOT,
735 ARG_PRIVATE_USERS_CHOWN,
736 ARG_PRIVATE_USERS_OWNERSHIP,
737 ARG_NOTIFY_READY,
738 ARG_ROOT_HASH,
739 ARG_ROOT_HASH_SIG,
740 ARG_VERITY_DATA,
741 ARG_SYSTEM_CALL_FILTER,
742 ARG_RLIMIT,
743 ARG_HOSTNAME,
744 ARG_NO_NEW_PRIVILEGES,
745 ARG_OOM_SCORE_ADJUST,
746 ARG_CPU_AFFINITY,
747 ARG_RESOLV_CONF,
748 ARG_TIMEZONE,
749 ARG_CONSOLE,
750 ARG_PIPE,
751 ARG_OCI_BUNDLE,
752 ARG_NO_PAGER,
753 ARG_SET_CREDENTIAL,
754 ARG_LOAD_CREDENTIAL,
755 ARG_BIND_USER,
756 ARG_SUPPRESS_SYNC,
757 ARG_IMAGE_POLICY,
758 ARG_BACKGROUND,
759 };
760
761 static const struct option options[] = {
762 { "help", no_argument, NULL, 'h' },
763 { "version", no_argument, NULL, ARG_VERSION },
764 { "directory", required_argument, NULL, 'D' },
765 { "template", required_argument, NULL, ARG_TEMPLATE },
766 { "ephemeral", no_argument, NULL, 'x' },
767 { "user", required_argument, NULL, 'u' },
768 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
769 { "as-pid2", no_argument, NULL, 'a' },
770 { "boot", no_argument, NULL, 'b' },
771 { "uuid", required_argument, NULL, ARG_UUID },
772 { "read-only", no_argument, NULL, ARG_READ_ONLY },
773 { "capability", required_argument, NULL, ARG_CAPABILITY },
774 { "ambient-capability", required_argument, NULL, ARG_AMBIENT_CAPABILITY },
775 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
776 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
777 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
778 { "bind", required_argument, NULL, ARG_BIND },
779 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
780 { "tmpfs", required_argument, NULL, ARG_TMPFS },
781 { "overlay", required_argument, NULL, ARG_OVERLAY },
782 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
783 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
784 { "machine", required_argument, NULL, 'M' },
785 { "hostname", required_argument, NULL, ARG_HOSTNAME },
786 { "slice", required_argument, NULL, 'S' },
787 { "setenv", required_argument, NULL, 'E' },
788 { "selinux-context", required_argument, NULL, 'Z' },
789 { "selinux-apifs-context", required_argument, NULL, 'L' },
790 { "quiet", no_argument, NULL, 'q' },
791 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
792 { "register", required_argument, NULL, ARG_REGISTER },
793 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
794 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
795 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
796 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
797 { "network-veth", no_argument, NULL, 'n' },
798 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
799 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
800 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
801 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
802 { "personality", required_argument, NULL, ARG_PERSONALITY },
803 { "image", required_argument, NULL, 'i' },
804 { "volatile", optional_argument, NULL, ARG_VOLATILE },
805 { "port", required_argument, NULL, 'p' },
806 { "property", required_argument, NULL, ARG_PROPERTY },
807 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
808 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, /* obsolete */
809 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
810 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
811 { "settings", required_argument, NULL, ARG_SETTINGS },
812 { "chdir", required_argument, NULL, ARG_CHDIR },
813 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
814 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
815 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
816 { "root-hash-sig", required_argument, NULL, ARG_ROOT_HASH_SIG },
817 { "verity-data", required_argument, NULL, ARG_VERITY_DATA },
818 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
819 { "rlimit", required_argument, NULL, ARG_RLIMIT },
820 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
821 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
822 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
823 { "timezone", required_argument, NULL, ARG_TIMEZONE },
824 { "console", required_argument, NULL, ARG_CONSOLE },
825 { "pipe", no_argument, NULL, ARG_PIPE },
826 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
827 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
828 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
829 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
830 { "bind-user", required_argument, NULL, ARG_BIND_USER },
831 { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
832 { "image-policy", required_argument, NULL, ARG_IMAGE_POLICY },
833 { "background", required_argument, NULL, ARG_BACKGROUND },
834 {}
835 };
836
837 int c, r;
838 uint64_t plus = 0, minus = 0;
839 bool mask_all_settings = false, mask_no_settings = false;
840
841 assert(argc >= 0);
842 assert(argv);
843
844 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
845 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
846 optind = 0;
847 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
848 switch (c) {
849
850 case 'h':
851 return help();
852
853 case ARG_VERSION:
854 return version();
855
856 case 'D':
857 r = parse_path_argument(optarg, false, &arg_directory);
858 if (r < 0)
859 return r;
860
861 arg_settings_mask |= SETTING_DIRECTORY;
862 break;
863
864 case ARG_TEMPLATE:
865 r = parse_path_argument(optarg, false, &arg_template);
866 if (r < 0)
867 return r;
868
869 arg_settings_mask |= SETTING_DIRECTORY;
870 break;
871
872 case 'i':
873 r = parse_path_argument(optarg, false, &arg_image);
874 if (r < 0)
875 return r;
876
877 arg_settings_mask |= SETTING_DIRECTORY;
878 break;
879
880 case ARG_OCI_BUNDLE:
881 r = parse_path_argument(optarg, false, &arg_oci_bundle);
882 if (r < 0)
883 return r;
884
885 break;
886
887 case 'x':
888 arg_ephemeral = true;
889 arg_settings_mask |= SETTING_EPHEMERAL;
890 break;
891
892 case 'u':
893 r = free_and_strdup(&arg_user, optarg);
894 if (r < 0)
895 return log_oom();
896
897 arg_settings_mask |= SETTING_USER;
898 break;
899
900 case ARG_NETWORK_ZONE: {
901 _cleanup_free_ char *j = NULL;
902
903 j = strjoin("vz-", optarg);
904 if (!j)
905 return log_oom();
906
907 if (!ifname_valid(j))
908 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
909 "Network zone name not valid: %s", j);
910
911 free_and_replace(arg_network_zone, j);
912
913 arg_network_veth = true;
914 arg_private_network = true;
915 arg_settings_mask |= SETTING_NETWORK;
916 break;
917 }
918
919 case ARG_NETWORK_BRIDGE:
920
921 if (!ifname_valid(optarg))
922 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
923 "Bridge interface name not valid: %s", optarg);
924
925 r = free_and_strdup(&arg_network_bridge, optarg);
926 if (r < 0)
927 return log_oom();
928
929 _fallthrough_;
930 case 'n':
931 arg_network_veth = true;
932 arg_private_network = true;
933 arg_settings_mask |= SETTING_NETWORK;
934 break;
935
936 case ARG_NETWORK_VETH_EXTRA:
937 r = veth_extra_parse(&arg_network_veth_extra, optarg);
938 if (r < 0)
939 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
940
941 arg_private_network = true;
942 arg_settings_mask |= SETTING_NETWORK;
943 break;
944
945 case ARG_NETWORK_INTERFACE:
946 r = interface_pair_parse(&arg_network_interfaces, optarg);
947 if (r < 0)
948 return r;
949
950 arg_private_network = true;
951 arg_settings_mask |= SETTING_NETWORK;
952 break;
953
954 case ARG_NETWORK_MACVLAN:
955 r = macvlan_pair_parse(&arg_network_macvlan, optarg);
956 if (r < 0)
957 return r;
958
959 arg_private_network = true;
960 arg_settings_mask |= SETTING_NETWORK;
961 break;
962
963 case ARG_NETWORK_IPVLAN:
964 r = ipvlan_pair_parse(&arg_network_ipvlan, optarg);
965 if (r < 0)
966 return r;
967
968 _fallthrough_;
969 case ARG_PRIVATE_NETWORK:
970 arg_private_network = true;
971 arg_settings_mask |= SETTING_NETWORK;
972 break;
973
974 case ARG_NETWORK_NAMESPACE_PATH:
975 r = parse_path_argument(optarg, false, &arg_network_namespace_path);
976 if (r < 0)
977 return r;
978
979 arg_settings_mask |= SETTING_NETWORK;
980 break;
981
982 case 'b':
983 if (arg_start_mode == START_PID2)
984 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
985 "--boot and --as-pid2 may not be combined.");
986
987 arg_start_mode = START_BOOT;
988 arg_settings_mask |= SETTING_START_MODE;
989 break;
990
991 case 'a':
992 if (arg_start_mode == START_BOOT)
993 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
994 "--boot and --as-pid2 may not be combined.");
995
996 arg_start_mode = START_PID2;
997 arg_settings_mask |= SETTING_START_MODE;
998 break;
999
1000 case ARG_UUID:
1001 r = id128_from_string_nonzero(optarg, &arg_uuid);
1002 if (r == -ENXIO)
1003 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1004 "Machine UUID may not be all zeroes.");
1005 if (r < 0)
1006 return log_error_errno(r, "Invalid UUID: %s", optarg);
1007
1008 arg_settings_mask |= SETTING_MACHINE_ID;
1009 break;
1010
1011 case 'S': {
1012 _cleanup_free_ char *mangled = NULL;
1013
1014 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1015 if (r < 0)
1016 return log_oom();
1017
1018 free_and_replace(arg_slice, mangled);
1019 arg_settings_mask |= SETTING_SLICE;
1020 break;
1021 }
1022
1023 case 'M':
1024 if (isempty(optarg))
1025 arg_machine = mfree(arg_machine);
1026 else {
1027 if (!hostname_is_valid(optarg, 0))
1028 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1029 "Invalid machine name: %s", optarg);
1030
1031 r = free_and_strdup(&arg_machine, optarg);
1032 if (r < 0)
1033 return log_oom();
1034 }
1035 break;
1036
1037 case ARG_HOSTNAME:
1038 if (isempty(optarg))
1039 arg_hostname = mfree(arg_hostname);
1040 else {
1041 if (!hostname_is_valid(optarg, 0))
1042 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1043 "Invalid hostname: %s", optarg);
1044
1045 r = free_and_strdup(&arg_hostname, optarg);
1046 if (r < 0)
1047 return log_oom();
1048 }
1049
1050 arg_settings_mask |= SETTING_HOSTNAME;
1051 break;
1052
1053 case 'Z':
1054 arg_selinux_context = optarg;
1055 break;
1056
1057 case 'L':
1058 arg_selinux_apifs_context = optarg;
1059 break;
1060
1061 case ARG_READ_ONLY:
1062 arg_read_only = true;
1063 arg_settings_mask |= SETTING_READ_ONLY;
1064 break;
1065
1066 case ARG_AMBIENT_CAPABILITY: {
1067 uint64_t m;
1068 r = parse_capability_spec(optarg, &m);
1069 if (r <= 0)
1070 return r;
1071 arg_caps_ambient |= m;
1072 arg_settings_mask |= SETTING_CAPABILITY;
1073 break;
1074 }
1075 case ARG_CAPABILITY:
1076 case ARG_DROP_CAPABILITY: {
1077 uint64_t m;
1078 r = parse_capability_spec(optarg, &m);
1079 if (r <= 0)
1080 return r;
1081
1082 if (c == ARG_CAPABILITY)
1083 plus |= m;
1084 else
1085 minus |= m;
1086 arg_settings_mask |= SETTING_CAPABILITY;
1087 break;
1088 }
1089 case ARG_NO_NEW_PRIVILEGES:
1090 r = parse_boolean(optarg);
1091 if (r < 0)
1092 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1093
1094 arg_no_new_privileges = r;
1095 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1096 break;
1097
1098 case 'j':
1099 arg_link_journal = LINK_GUEST;
1100 arg_link_journal_try = true;
1101 arg_settings_mask |= SETTING_LINK_JOURNAL;
1102 break;
1103
1104 case ARG_LINK_JOURNAL:
1105 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1106 if (r < 0)
1107 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1108
1109 arg_settings_mask |= SETTING_LINK_JOURNAL;
1110 break;
1111
1112 case ARG_BIND:
1113 case ARG_BIND_RO:
1114 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1117
1118 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1119 break;
1120
1121 case ARG_TMPFS:
1122 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1123 if (r < 0)
1124 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1125
1126 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1127 break;
1128
1129 case ARG_OVERLAY:
1130 case ARG_OVERLAY_RO:
1131 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1132 if (r == -EADDRNOTAVAIL)
1133 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1134 if (r < 0)
1135 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1136
1137 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1138 break;
1139
1140 case ARG_INACCESSIBLE:
1141 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1142 if (r < 0)
1143 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1144
1145 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1146 break;
1147
1148 case 'E':
1149 r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1150 if (r < 0)
1151 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1152
1153 arg_settings_mask |= SETTING_ENVIRONMENT;
1154 break;
1155
1156 case 'q':
1157 arg_quiet = true;
1158 break;
1159
1160 case ARG_SHARE_SYSTEM:
1161 /* We don't officially support this anymore, except for compat reasons. People should use the
1162 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1163 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1164 arg_clone_ns_flags = 0;
1165 break;
1166
1167 case ARG_REGISTER:
1168 r = parse_boolean(optarg);
1169 if (r < 0) {
1170 log_error("Failed to parse --register= argument: %s", optarg);
1171 return r;
1172 }
1173
1174 arg_register = r;
1175 break;
1176
1177 case ARG_KEEP_UNIT:
1178 arg_keep_unit = true;
1179 break;
1180
1181 case ARG_PERSONALITY:
1182
1183 arg_personality = personality_from_string(optarg);
1184 if (arg_personality == PERSONALITY_INVALID)
1185 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1186 "Unknown or unsupported personality '%s'.", optarg);
1187
1188 arg_settings_mask |= SETTING_PERSONALITY;
1189 break;
1190
1191 case ARG_VOLATILE:
1192
1193 if (!optarg)
1194 arg_volatile_mode = VOLATILE_YES;
1195 else if (streq(optarg, "help")) {
1196 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1197 return 0;
1198 } else {
1199 VolatileMode m;
1200
1201 m = volatile_mode_from_string(optarg);
1202 if (m < 0)
1203 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1204 "Failed to parse --volatile= argument: %s", optarg);
1205 else
1206 arg_volatile_mode = m;
1207 }
1208
1209 arg_settings_mask |= SETTING_VOLATILE_MODE;
1210 break;
1211
1212 case 'p':
1213 r = expose_port_parse(&arg_expose_ports, optarg);
1214 if (r == -EEXIST)
1215 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1216 if (r < 0)
1217 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1218
1219 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1220 break;
1221
1222 case ARG_PROPERTY:
1223 if (strv_extend(&arg_property, optarg) < 0)
1224 return log_oom();
1225
1226 break;
1227
1228 case ARG_PRIVATE_USERS: {
1229 int boolean;
1230
1231 if (!optarg)
1232 boolean = true;
1233 else if (!in_charset(optarg, DIGITS))
1234 /* do *not* parse numbers as booleans */
1235 boolean = parse_boolean(optarg);
1236 else
1237 boolean = -1;
1238
1239 if (boolean == 0) {
1240 /* no: User namespacing off */
1241 arg_userns_mode = USER_NAMESPACE_NO;
1242 arg_uid_shift = UID_INVALID;
1243 arg_uid_range = UINT32_C(0x10000);
1244 } else if (boolean > 0) {
1245 /* yes: User namespacing on, UID range is read from root dir */
1246 arg_userns_mode = USER_NAMESPACE_FIXED;
1247 arg_uid_shift = UID_INVALID;
1248 arg_uid_range = UINT32_C(0x10000);
1249 } else if (streq(optarg, "pick")) {
1250 /* pick: User namespacing on, UID range is picked randomly */
1251 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1252 * implied by USER_NAMESPACE_PICK
1253 * further down. */
1254 arg_uid_shift = UID_INVALID;
1255 arg_uid_range = UINT32_C(0x10000);
1256
1257 } else if (streq(optarg, "identity")) {
1258 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1259 * itself, i.e. we don't actually map anything, but do take benefit of
1260 * isolation of capability sets. */
1261 arg_userns_mode = USER_NAMESPACE_FIXED;
1262 arg_uid_shift = 0;
1263 arg_uid_range = UINT32_C(0x10000);
1264 } else {
1265 /* anything else: User namespacing on, UID range is explicitly configured */
1266 r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
1267 if (r < 0)
1268 return r;
1269 arg_userns_mode = USER_NAMESPACE_FIXED;
1270 }
1271
1272 arg_settings_mask |= SETTING_USERNS;
1273 break;
1274 }
1275
1276 case 'U':
1277 if (userns_supported()) {
1278 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1279 * implied by USER_NAMESPACE_PICK
1280 * further down. */
1281 arg_uid_shift = UID_INVALID;
1282 arg_uid_range = UINT32_C(0x10000);
1283
1284 arg_settings_mask |= SETTING_USERNS;
1285 }
1286
1287 break;
1288
1289 case ARG_PRIVATE_USERS_CHOWN:
1290 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1291
1292 arg_settings_mask |= SETTING_USERNS;
1293 break;
1294
1295 case ARG_PRIVATE_USERS_OWNERSHIP:
1296 if (streq(optarg, "help")) {
1297 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1298 return 0;
1299 }
1300
1301 arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1302 if (arg_userns_ownership < 0)
1303 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1304
1305 arg_settings_mask |= SETTING_USERNS;
1306 break;
1307
1308 case ARG_KILL_SIGNAL:
1309 if (streq(optarg, "help")) {
1310 DUMP_STRING_TABLE(signal, int, _NSIG);
1311 return 0;
1312 }
1313
1314 arg_kill_signal = signal_from_string(optarg);
1315 if (arg_kill_signal < 0)
1316 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1317
1318 arg_settings_mask |= SETTING_KILL_SIGNAL;
1319 break;
1320
1321 case ARG_SETTINGS:
1322
1323 /* no → do not read files
1324 * yes → read files, do not override cmdline, trust only subset
1325 * override → read files, override cmdline, trust only subset
1326 * trusted → read files, do not override cmdline, trust all
1327 */
1328
1329 r = parse_boolean(optarg);
1330 if (r < 0) {
1331 if (streq(optarg, "trusted")) {
1332 mask_all_settings = false;
1333 mask_no_settings = false;
1334 arg_settings_trusted = true;
1335
1336 } else if (streq(optarg, "override")) {
1337 mask_all_settings = false;
1338 mask_no_settings = true;
1339 arg_settings_trusted = -1;
1340 } else
1341 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1342 } else if (r > 0) {
1343 /* yes */
1344 mask_all_settings = false;
1345 mask_no_settings = false;
1346 arg_settings_trusted = -1;
1347 } else {
1348 /* no */
1349 mask_all_settings = true;
1350 mask_no_settings = false;
1351 arg_settings_trusted = false;
1352 }
1353
1354 break;
1355
1356 case ARG_CHDIR: {
1357 _cleanup_free_ char *wd = NULL;
1358
1359 if (!path_is_absolute(optarg))
1360 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1361 "Working directory %s is not an absolute path.", optarg);
1362
1363 r = path_simplify_alloc(optarg, &wd);
1364 if (r < 0)
1365 return log_error_errno(r, "Failed to simplify path %s: %m", optarg);
1366
1367 if (!path_is_normalized(wd))
1368 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory path is not normalized: %s", wd);
1369
1370 if (path_below_api_vfs(wd))
1371 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Working directory is below API VFS, refusing: %s", wd);
1372
1373 free_and_replace(arg_chdir, wd);
1374 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1375 break;
1376 }
1377
1378 case ARG_PIVOT_ROOT:
1379 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1380 if (r < 0)
1381 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1382
1383 arg_settings_mask |= SETTING_PIVOT_ROOT;
1384 break;
1385
1386 case ARG_NOTIFY_READY:
1387 r = parse_boolean(optarg);
1388 if (r < 0)
1389 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1390 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1391 arg_notify_ready = r;
1392 arg_settings_mask |= SETTING_NOTIFY_READY;
1393 break;
1394
1395 case ARG_ROOT_HASH: {
1396 _cleanup_free_ void *k = NULL;
1397 size_t l;
1398
1399 r = unhexmem(optarg, &k, &l);
1400 if (r < 0)
1401 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1402 if (l < sizeof(sd_id128_t))
1403 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128-bit long: %s", optarg);
1404
1405 free_and_replace(arg_verity_settings.root_hash, k);
1406 arg_verity_settings.root_hash_size = l;
1407 break;
1408 }
1409
1410 case ARG_ROOT_HASH_SIG: {
1411 char *value;
1412 size_t l;
1413 void *p;
1414
1415 if ((value = startswith(optarg, "base64:"))) {
1416 r = unbase64mem(value, &p, &l);
1417 if (r < 0)
1418 return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1419
1420 } else {
1421 r = read_full_file(optarg, (char**) &p, &l);
1422 if (r < 0)
1423 return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1424 }
1425
1426 free_and_replace(arg_verity_settings.root_hash_sig, p);
1427 arg_verity_settings.root_hash_sig_size = l;
1428 break;
1429 }
1430
1431 case ARG_VERITY_DATA:
1432 r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1433 if (r < 0)
1434 return r;
1435 break;
1436
1437 case ARG_SYSTEM_CALL_FILTER: {
1438 bool negative;
1439 const char *items;
1440
1441 negative = optarg[0] == '~';
1442 items = negative ? optarg + 1 : optarg;
1443
1444 for (;;) {
1445 _cleanup_free_ char *word = NULL;
1446
1447 r = extract_first_word(&items, &word, NULL, 0);
1448 if (r == 0)
1449 break;
1450 if (r == -ENOMEM)
1451 return log_oom();
1452 if (r < 0)
1453 return log_error_errno(r, "Failed to parse system call filter: %m");
1454
1455 if (negative)
1456 r = strv_extend(&arg_syscall_deny_list, word);
1457 else
1458 r = strv_extend(&arg_syscall_allow_list, word);
1459 if (r < 0)
1460 return log_oom();
1461 }
1462
1463 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1464 break;
1465 }
1466
1467 case ARG_RLIMIT: {
1468 const char *eq;
1469 _cleanup_free_ char *name = NULL;
1470 int rl;
1471
1472 if (streq(optarg, "help")) {
1473 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1474 return 0;
1475 }
1476
1477 eq = strchr(optarg, '=');
1478 if (!eq)
1479 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1480 "--rlimit= expects an '=' assignment.");
1481
1482 name = strndup(optarg, eq - optarg);
1483 if (!name)
1484 return log_oom();
1485
1486 rl = rlimit_from_string_harder(name);
1487 if (rl < 0)
1488 return log_error_errno(rl, "Unknown resource limit: %s", name);
1489
1490 if (!arg_rlimit[rl]) {
1491 arg_rlimit[rl] = new0(struct rlimit, 1);
1492 if (!arg_rlimit[rl])
1493 return log_oom();
1494 }
1495
1496 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1497 if (r < 0)
1498 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1499
1500 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1501 break;
1502 }
1503
1504 case ARG_OOM_SCORE_ADJUST:
1505 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1506 if (r < 0)
1507 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1508
1509 arg_oom_score_adjust_set = true;
1510 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1511 break;
1512
1513 case ARG_CPU_AFFINITY: {
1514 CPUSet cpuset;
1515
1516 r = parse_cpu_set(optarg, &cpuset);
1517 if (r < 0)
1518 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1519
1520 cpu_set_reset(&arg_cpu_set);
1521 arg_cpu_set = cpuset;
1522 arg_settings_mask |= SETTING_CPU_AFFINITY;
1523 break;
1524 }
1525
1526 case ARG_RESOLV_CONF:
1527 if (streq(optarg, "help")) {
1528 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1529 return 0;
1530 }
1531
1532 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1533 if (arg_resolv_conf < 0)
1534 return log_error_errno(arg_resolv_conf,
1535 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1536
1537 arg_settings_mask |= SETTING_RESOLV_CONF;
1538 break;
1539
1540 case ARG_TIMEZONE:
1541 if (streq(optarg, "help")) {
1542 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1543 return 0;
1544 }
1545
1546 arg_timezone = timezone_mode_from_string(optarg);
1547 if (arg_timezone < 0)
1548 return log_error_errno(arg_timezone,
1549 "Failed to parse /etc/localtime mode: %s", optarg);
1550
1551 arg_settings_mask |= SETTING_TIMEZONE;
1552 break;
1553
1554 case ARG_CONSOLE:
1555 r = handle_arg_console(optarg);
1556 if (r <= 0)
1557 return r;
1558 break;
1559
1560 case 'P':
1561 case ARG_PIPE:
1562 r = handle_arg_console("pipe");
1563 if (r <= 0)
1564 return r;
1565 break;
1566
1567 case ARG_NO_PAGER:
1568 arg_pager_flags |= PAGER_DISABLE;
1569 break;
1570
1571 case ARG_SET_CREDENTIAL:
1572 r = machine_credential_set(&arg_credentials, optarg);
1573 if (r < 0)
1574 return r;
1575
1576 arg_settings_mask |= SETTING_CREDENTIALS;
1577 break;
1578
1579 case ARG_LOAD_CREDENTIAL:
1580 r = machine_credential_load(&arg_credentials, optarg);
1581 if (r < 0)
1582 return r;
1583
1584 arg_settings_mask |= SETTING_CREDENTIALS;
1585 break;
1586
1587 case ARG_BIND_USER:
1588 if (!valid_user_group_name(optarg, 0))
1589 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1590
1591 if (strv_extend(&arg_bind_user, optarg) < 0)
1592 return log_oom();
1593
1594 arg_settings_mask |= SETTING_BIND_USER;
1595 break;
1596
1597 case ARG_SUPPRESS_SYNC:
1598 r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1599 if (r < 0)
1600 return r;
1601
1602 arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1603 break;
1604
1605 case ARG_IMAGE_POLICY:
1606 r = parse_image_policy_argument(optarg, &arg_image_policy);
1607 if (r < 0)
1608 return r;
1609 break;
1610
1611 case ARG_BACKGROUND:
1612 r = free_and_strdup_warn(&arg_background, optarg);
1613 if (r < 0)
1614 return r;
1615 break;
1616
1617 case '?':
1618 return -EINVAL;
1619
1620 default:
1621 assert_not_reached();
1622 }
1623
1624 if (argc > optind) {
1625 strv_free(arg_parameters);
1626 arg_parameters = strv_copy(argv + optind);
1627 if (!arg_parameters)
1628 return log_oom();
1629
1630 arg_settings_mask |= SETTING_START_MODE;
1631 }
1632
1633 if (arg_ephemeral && arg_template && !arg_directory)
1634 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1635 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1636 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1637 * --directory=". */
1638 arg_directory = TAKE_PTR(arg_template);
1639
1640 arg_caps_retain |= plus;
1641 arg_caps_retain |= arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0;
1642 arg_caps_retain &= ~minus;
1643
1644 /* Make sure to parse environment before we reset the settings mask below */
1645 r = parse_environment();
1646 if (r < 0)
1647 return r;
1648
1649 /* Load all settings from .nspawn files */
1650 if (mask_no_settings)
1651 arg_settings_mask = 0;
1652
1653 /* Don't load any settings from .nspawn files */
1654 if (mask_all_settings)
1655 arg_settings_mask = _SETTINGS_MASK_ALL;
1656
1657 return 1;
1658 }
1659
1660 static int verify_arguments(void) {
1661 int r;
1662
1663 SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
1664
1665 if (!arg_privileged) {
1666 /* machined is not accessible to unpriv clients */
1667 if (arg_register) {
1668 log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients.");
1669 arg_register = false;
1670 }
1671
1672 if (!arg_private_network) {
1673 log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
1674 arg_private_network = true;
1675 }
1676 }
1677
1678 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1679 /* If we are running the stub init in the container, we don't need to look at what the init
1680 * in the container supports, because we are not using it. Let's immediately pick the right
1681 * setting based on the host system configuration.
1682 *
1683 * We only do this, if the user didn't use an environment variable to override the detection.
1684 */
1685
1686 r = cg_all_unified();
1687 if (r < 0)
1688 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1689 if (r > 0)
1690 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1691 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1692 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1693 else
1694 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1695 }
1696
1697 if (arg_userns_mode != USER_NAMESPACE_NO)
1698 arg_mount_settings |= MOUNT_USE_USERNS;
1699
1700 if (arg_private_network)
1701 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1702
1703 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1704 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1705 arg_register = false;
1706 if (arg_start_mode != START_PID1)
1707 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1708 }
1709
1710 if (arg_userns_ownership < 0)
1711 arg_userns_ownership =
1712 arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1713 USER_NAMESPACE_OWNERSHIP_OFF;
1714
1715 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1716 arg_kill_signal = SIGRTMIN+3;
1717
1718 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1719 arg_read_only = true;
1720
1721 if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1722 arg_read_only = true;
1723
1724 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1725 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1726 * The latter is not technically a user session, but we don't need to labour the point. */
1727 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1728
1729 if (arg_directory && arg_image)
1730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1731
1732 if (arg_template && arg_image)
1733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1734
1735 if (arg_template && !(arg_directory || arg_machine))
1736 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1737
1738 if (arg_ephemeral && arg_template)
1739 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1740
1741 /* Permit --ephemeral with --link-journal=try-* to satisfy principle of the least astonishment
1742 * (by common sense, "try" means "do not fail if not possible") */
1743 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO) && !arg_link_journal_try)
1744 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal={host,guest} may not be combined.");
1745
1746 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1747 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1748
1749 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1750 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1751 "--read-only and --private-users-ownership=chown may not be combined.");
1752
1753 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1754 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1755 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1756 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1757 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1758
1759 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1760 * we need to error out, to avoid conflicts between different network options. */
1761 if (arg_network_namespace_path &&
1762 (arg_network_interfaces || arg_network_macvlan ||
1763 arg_network_ipvlan || arg_network_veth_extra ||
1764 arg_network_bridge || arg_network_zone ||
1765 arg_network_veth))
1766 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1767
1768 if (arg_network_bridge && arg_network_zone)
1769 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1770 "--network-bridge= and --network-zone= may not be combined.");
1771
1772 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1773 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1774
1775 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1776 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1777
1778 if (arg_expose_ports && !arg_private_network)
1779 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1780
1781 if (arg_caps_ambient) {
1782 if (arg_caps_ambient == UINT64_MAX)
1783 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1784
1785 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1786 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1787
1788 if (arg_start_mode == START_BOOT)
1789 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1790 }
1791
1792 if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1793 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1794
1795 /* Drop duplicate --bind-user= entries */
1796 strv_uniq(arg_bind_user);
1797
1798 r = custom_mount_check_all();
1799 if (r < 0)
1800 return r;
1801
1802 return 0;
1803 }
1804
1805 static int verify_network_interfaces_initialized(void) {
1806 int r;
1807 r = test_network_interfaces_initialized(arg_network_interfaces);
1808 if (r < 0)
1809 return r;
1810
1811 r = test_network_interfaces_initialized(arg_network_macvlan);
1812 if (r < 0)
1813 return r;
1814
1815 r = test_network_interfaces_initialized(arg_network_ipvlan);
1816 if (r < 0)
1817 return r;
1818
1819 return 0;
1820 }
1821
1822 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1823 assert(p);
1824
1825 if (arg_userns_mode == USER_NAMESPACE_NO)
1826 return 0;
1827
1828 if (uid == UID_INVALID && gid == GID_INVALID)
1829 return 0;
1830
1831 if (uid != UID_INVALID) {
1832 uid += arg_uid_shift;
1833
1834 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1835 return -EOVERFLOW;
1836 }
1837
1838 if (gid != GID_INVALID) {
1839 gid += (gid_t) arg_uid_shift;
1840
1841 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1842 return -EOVERFLOW;
1843 }
1844
1845 return RET_NERRNO(lchown(p, uid, gid));
1846 }
1847
/* Creates the directory 'path' (prefixed with 'root' via prefix_roota()) with the given access mode and,
 * if it was newly created, hands it to userns_lchown() with the given 'uid' and 'gid'.
 *
 * An already existing path is not an error: 0 is returned and its ownership is left alone. Any other
 * mkdir() failure is returned as converted by RET_NERRNO(); otherwise the result of userns_lchown() is
 * returned. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *target;
        int r;

        target = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(target, mode));
        if (r >= 0)
                /* Freshly created, hence fix up ownership */
                return userns_lchown(target, uid, gid);

        return r == -EEXIST ? 0 : r;
}
1861
/* Matches 'path' against the two zoneinfo directory prefixes (relative "../usr/share/zoneinfo/" first,
 * then absolute "/usr/share/zoneinfo/") and returns whatever PATH_STARTSWITH_SET() yields. Callers treat
 * NULL as "does not point into the zoneinfo directory" and a non-NULL result as the timezone name that
 * follows the prefix. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1868
1869 static bool etc_writable(void) {
1870 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1871 }
1872
/* Propagates the host's /etc/localtime into the container tree rooted at 'dest', according to
 * arg_timezone.
 *
 * For TIMEZONE_AUTO the effective mode is derived from what /etc/localtime is on the host (missing,
 * not a symlink, or a symlink) and from etc_writable(). The switch below then deliberately cascades:
 * symlink → bind mount → copy, each step falling through to the next one if it could not be carried
 * out.
 *
 * Almost all failures are only logged and ignored (return 0); the one exception is the read-only
 * remount after a successful bind mount, whose result is returned as is. */
static int setup_timezone(const char *dest) {
        _cleanup_free_ char *p = NULL, *etc = NULL;
        const char *where, *check;
        TimezoneMode m;
        int r;

        assert(dest);

        if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
                /* 'p' receives the target of the host's symlink; it is consumed by the TIMEZONE_SYMLINK case below */
                r = readlink_malloc("/etc/localtime", &p);
                if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
                        m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
                else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
                        m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
                else if (r < 0) {
                        log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
                        /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
                         * file.
                         *
                         * Example:
                         * ln -s /usr/share/zoneinfo/UTC /etc/localtime
                         */
                        return 0;
                } else if (arg_timezone == TIMEZONE_AUTO)
                        m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
                else
                        m = arg_timezone;
        } else
                m = arg_timezone;

        if (m == TIMEZONE_OFF)
                return 0;

        /* Resolve the container's /etc; 'etc' comes back prefixed with 'dest' (CHASE_PREFIX_ROOT) */
        r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
        if (r < 0) {
                log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                return 0;
        }

        where = strjoina(etc, "/localtime");

        switch (m) {

        case TIMEZONE_DELETE:
                if (unlink(where) < 0)
                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);

                return 0;

        case TIMEZONE_SYMLINK: {
                _cleanup_free_ char *q = NULL;
                const char *z, *what;

                /* 'z' is the zone name, i.e. the part of the host's symlink target after the zoneinfo prefix */
                z = timezone_from_path(p);
                if (!z) {
                        log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
                        return 0;
                }

                r = readlink_malloc(where, &q);
                if (r >= 0 && streq_ptr(timezone_from_path(q), z))
                        return 0; /* Already pointing to the right place? Then do nothing .. */

                /* Only create the symlink if the zone file can be resolved inside the container */
                check = strjoina(dest, "/usr/share/zoneinfo/", z);
                r = chase(check, dest, 0, NULL, NULL);
                if (r < 0)
                        log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                else {
                        if (unlink(where) < 0 && errno != ENOENT) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
                                               errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
                                return 0;
                        }

                        what = strjoina("../usr/share/zoneinfo/", z);
                        if (symlink(what, where) < 0) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
                                               errno, "Failed to correct timezone of container, ignoring: %m");
                                return 0;
                        }

                        /* Symlink created; leave the switch so that ownership gets fixed up below */
                        break;
                }

                /* Reached only if the zone could not be resolved in the container: try a bind mount instead */
                _fallthrough_;
        }

        case TIMEZONE_BIND: {
                _cleanup_free_ char *resolved = NULL;
                int found;

                found = chase(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
                if (found < 0) {
                        log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                        return 0;
                }

                if (found == 0) /* missing? */
                        (void) touch(resolved);

                /* Bind mount the host file, then remount it read-only; returns right here on success,
                 * i.e. the chown at the end of the function is not applied to bind mounts */
                r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
                if (r >= 0)
                        return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);

                _fallthrough_;
        }

        case TIMEZONE_COPY:
                /* If mounting failed, try to copy */
                r = copy_file_atomic("/etc/localtime", where, 0644, COPY_REFLINK|COPY_REPLACE);
                if (r < 0) {
                        log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to copy /etc/localtime to %s, ignoring: %m", where);
                        return 0;
                }

                break;

        default:
                assert_not_reached();
        }

        /* Fix permissions of the symlink or file copy we just created */
        r = userns_lchown(where, 0, 0);
        if (r < 0)
                log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");

        return 0;
}
2002
/* Probes 'path' with access(F_OK). Returns 1 if it exists, 0 if it is missing (ENOENT), and a negative
 * errno (logged at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2015
2016 static int resolved_listening(void) {
2017 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2018 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2019 _cleanup_free_ char *dns_stub_listener_mode = NULL;
2020 int r;
2021
2022 /* Check if resolved is listening */
2023
2024 r = sd_bus_open_system(&bus);
2025 if (r < 0)
2026 return log_debug_errno(r, "Failed to open system bus: %m");
2027
2028 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2029 if (r < 0)
2030 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2031 if (r == 0)
2032 return 0;
2033
2034 r = bus_get_property_string(bus, bus_resolve_mgr, "DNSStubListener", &error, &dns_stub_listener_mode);
2035 if (r < 0)
2036 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2037
2038 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2039 }
2040
2041 static int setup_resolv_conf(const char *dest) {
2042 _cleanup_free_ char *etc = NULL;
2043 const char *where, *what;
2044 ResolvConfMode m;
2045 int r;
2046
2047 assert(dest);
2048
2049 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2050 if (arg_private_network)
2051 m = RESOLV_CONF_OFF;
2052 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2053 m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2054 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2055 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2056 else
2057 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2058
2059 } else
2060 m = arg_resolv_conf;
2061
2062 if (m == RESOLV_CONF_OFF)
2063 return 0;
2064
2065 r = chase("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2066 if (r < 0) {
2067 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2068 return 0;
2069 }
2070
2071 where = strjoina(etc, "/resolv.conf");
2072
2073 if (m == RESOLV_CONF_DELETE) {
2074 if (unlink(where) < 0)
2075 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2076
2077 return 0;
2078 }
2079
2080 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2081 what = PRIVATE_STATIC_RESOLV_CONF;
2082 else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2083 what = PRIVATE_UPLINK_RESOLV_CONF;
2084 else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2085 what = PRIVATE_STUB_RESOLV_CONF;
2086 else
2087 what = "/etc/resolv.conf";
2088
2089 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2090 _cleanup_free_ char *resolved = NULL;
2091 int found;
2092
2093 found = chase(where, dest, CHASE_NONEXISTENT|CHASE_NOFOLLOW, &resolved, NULL);
2094 if (found < 0) {
2095 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2096 return 0;
2097 }
2098
2099 if (found == 0) /* missing? */
2100 (void) touch(resolved);
2101
2102 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2103 if (r >= 0)
2104 return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2105
2106 /* If that didn't work, let's copy the file */
2107 }
2108
2109 if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2110 r = copy_file_atomic(what, where, 0644, COPY_REFLINK|COPY_REPLACE);
2111 else
2112 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, COPY_REFLINK);
2113 if (r < 0) {
2114 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2115 * resolved or something similar runs inside and the symlink points there.
2116 *
2117 * If the disk image is read-only, there's also no point in complaining.
2118 */
2119 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2120 IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2121 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2122 return 0;
2123 }
2124
2125 r = userns_lchown(where, 0, 0);
2126 if (r < 0)
2127 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2128
2129 return 0;
2130 }
2131
2132 static int setup_boot_id(void) {
2133 _cleanup_(unlink_and_freep) char *from = NULL;
2134 _cleanup_free_ char *path = NULL;
2135 sd_id128_t rnd = SD_ID128_NULL;
2136 const char *to;
2137 int r;
2138
2139 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2140
2141 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2142 if (r < 0)
2143 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2144
2145 r = sd_id128_randomize(&rnd);
2146 if (r < 0)
2147 return log_error_errno(r, "Failed to generate random boot id: %m");
2148
2149 r = id128_write(path, ID128_FORMAT_UUID, rnd);
2150 if (r < 0)
2151 return log_error_errno(r, "Failed to write boot id: %m");
2152
2153 from = TAKE_PTR(path);
2154 to = "/proc/sys/kernel/random/boot_id";
2155
2156 r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2157 if (r < 0)
2158 return r;
2159
2160 return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2161 }
2162
2163 static int copy_devnodes(const char *dest) {
2164 static const char devnodes[] =
2165 "null\0"
2166 "zero\0"
2167 "full\0"
2168 "random\0"
2169 "urandom\0"
2170 "tty\0"
2171 "net/tun\0";
2172
2173 int r = 0;
2174
2175 assert(dest);
2176
2177 BLOCK_WITH_UMASK(0000);
2178
2179 /* Create /dev/net, so that we can create /dev/net/tun in it */
2180 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2181 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2182
2183 NULSTR_FOREACH(d, devnodes) {
2184 _cleanup_free_ char *from = NULL, *to = NULL;
2185 struct stat st;
2186
2187 from = path_join("/dev/", d);
2188 if (!from)
2189 return log_oom();
2190
2191 to = path_join(dest, from);
2192 if (!to)
2193 return log_oom();
2194
2195 if (stat(from, &st) < 0) {
2196
2197 if (errno != ENOENT)
2198 return log_error_errno(errno, "Failed to stat %s: %m", from);
2199
2200 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2201 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2202 "%s is not a char or block device, cannot copy.", from);
2203 else {
2204 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2205
2206 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2207 /* Explicitly warn the user when /dev is already populated. */
2208 if (errno == EEXIST)
2209 log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
2210 if (errno != EPERM)
2211 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2212
2213 /* Some systems abusively restrict mknod but allow bind mounts. */
2214 r = touch(to);
2215 if (r < 0)
2216 return log_error_errno(r, "touch (%s) failed: %m", to);
2217 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2218 if (r < 0)
2219 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2220 }
2221
2222 r = userns_lchown(to, 0, 0);
2223 if (r < 0)
2224 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2225
2226 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2227 if (!dn)
2228 return log_oom();
2229
2230 r = userns_mkdir(dest, dn, 0755, 0, 0);
2231 if (r < 0)
2232 return log_error_errno(r, "Failed to create '%s': %m", dn);
2233
2234 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2235 return log_oom();
2236
2237 prefixed = path_join(dest, sl);
2238 if (!prefixed)
2239 return log_oom();
2240
2241 t = path_join("..", d);
2242 if (!t)
2243 return log_oom();
2244
2245 if (symlink(t, prefixed) < 0)
2246 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2247 }
2248 }
2249
2250 return r;
2251 }
2252
2253 static int make_extra_nodes(const char *dest) {
2254 size_t i;
2255 int r;
2256
2257 BLOCK_WITH_UMASK(0000);
2258
2259 for (i = 0; i < arg_n_extra_nodes; i++) {
2260 _cleanup_free_ char *path = NULL;
2261 DeviceNode *n = arg_extra_nodes + i;
2262
2263 path = path_join(dest, n->path);
2264 if (!path)
2265 return log_oom();
2266
2267 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2268 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2269
2270 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2271 if (r < 0)
2272 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2273 }
2274
2275 return 0;
2276 }
2277
2278 static int setup_pts(const char *dest) {
2279 _cleanup_free_ char *options = NULL;
2280 const char *p;
2281 int r;
2282
2283 #if HAVE_SELINUX
2284 if (arg_selinux_apifs_context)
2285 (void) asprintf(&options,
2286 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2287 arg_uid_shift + TTY_GID,
2288 arg_selinux_apifs_context);
2289 else
2290 #endif
2291 (void) asprintf(&options,
2292 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2293 arg_uid_shift + TTY_GID);
2294
2295 if (!options)
2296 return log_oom();
2297
2298 /* Mount /dev/pts itself */
2299 p = prefix_roota(dest, "/dev/pts");
2300 r = RET_NERRNO(mkdir(p, 0755));
2301 if (r < 0)
2302 return log_error_errno(r, "Failed to create /dev/pts: %m");
2303
2304 r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2305 if (r < 0)
2306 return r;
2307 r = userns_lchown(p, 0, 0);
2308 if (r < 0)
2309 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2310
2311 /* Create /dev/ptmx symlink */
2312 p = prefix_roota(dest, "/dev/ptmx");
2313 if (symlink("pts/ptmx", p) < 0)
2314 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2315 r = userns_lchown(p, 0, 0);
2316 if (r < 0)
2317 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2318
2319 /* And fix /dev/pts/ptmx ownership */
2320 p = prefix_roota(dest, "/dev/pts/ptmx");
2321 r = userns_lchown(p, 0, 0);
2322 if (r < 0)
2323 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2324
2325 return 0;
2326 }
2327
2328 static int setup_stdio_as_dev_console(void) {
2329 _cleanup_close_ int terminal = -EBADF;
2330 int r;
2331
2332 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2333 * explicitly, if we are configured to. */
2334 terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2335 if (terminal < 0)
2336 return log_error_errno(terminal, "Failed to open console: %m");
2337
2338 /* Make sure we can continue logging to the original stderr, even if
2339 * stderr points elsewhere now */
2340 r = log_dup_console();
2341 if (r < 0)
2342 return log_error_errno(r, "Failed to duplicate stderr: %m");
2343
2344 /* invalidates 'terminal' on success and failure */
2345 r = rearrange_stdio(terminal, terminal, terminal);
2346 TAKE_FD(terminal);
2347 if (r < 0)
2348 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2349
2350 return 0;
2351 }
2352
2353 static int setup_dev_console(const char *console) {
2354 _cleanup_free_ char *p = NULL;
2355 int r;
2356
2357 /* Create /dev/console symlink */
2358 r = path_make_relative("/dev", console, &p);
2359 if (r < 0)
2360 return log_error_errno(r, "Failed to create relative path: %m");
2361
2362 if (symlink(p, "/dev/console") < 0)
2363 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2364
2365 return 0;
2366 }
2367
2368 static int setup_keyring(void) {
2369 key_serial_t keyring;
2370
2371 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2372 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2373 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2374 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2375 * into the container. */
2376
2377 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2378 if (keyring == -1) {
2379 if (errno == ENOSYS)
2380 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2381 else if (ERRNO_IS_PRIVILEGE(errno))
2382 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2383 else
2384 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2385 }
2386
2387 return 0;
2388 }
2389
/* Makes sure /run/host exists (mode 0755, owned by the container's root) in the tree rooted at 'root'.
 * Returns 0 on success, or a logged negative errno on failure. */
int make_run_host(const char *root) {
        int r;

        assert(root);

        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
        return r < 0 ? log_error_errno(r, "Failed to create /run/host/: %m") : 0;
}
2401
2402 static int setup_credentials(const char *root) {
2403 bool world_readable = false;
2404 const char *q;
2405 int r;
2406
2407 if (arg_credentials.n_credentials == 0)
2408 return 0;
2409
2410 /* If starting a single-process container as a non-root user, the uid will only be resolved after we
2411 * are inside the inner child, when credential directories and files are already read-only, so they
2412 * are unusable as the single process won't have access to them. We also don't have access to the
2413 * uid that will actually be used from here, as we are setting credentials up from the outer child.
2414 * In order to make them usable as requested by the configuration, make them world readable in that
2415 * case, as by definition there are no other processes in that case besides the one being started,
2416 * which is being configured to be able to access credentials, and any of its children which will
2417 * inherit its privileges anyway. To ensure this, also enforce (and document) that
2418 * --no-new-privileges is necessary for this combination to work. */
2419 if (arg_no_new_privileges && !isempty(arg_user) && !STR_IN_SET(arg_user, "root", "0") &&
2420 arg_start_mode == START_PID1)
2421 world_readable = true;
2422
2423 r = make_run_host(root);
2424 if (r < 0)
2425 return r;
2426
2427 r = userns_mkdir(root, "/run/host/credentials", world_readable ? 0777 : 0700, 0, 0);
2428 if (r < 0)
2429 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2430
2431 q = prefix_roota(root, "/run/host/credentials");
2432 r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2433 if (r < 0)
2434 return r;
2435
2436 FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) {
2437 _cleanup_free_ char *j = NULL;
2438 _cleanup_close_ int fd = -EBADF;
2439
2440 j = path_join(q, cred->id);
2441 if (!j)
2442 return log_oom();
2443
2444 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, world_readable ? 0666 : 0600);
2445 if (fd < 0)
2446 return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2447
2448 r = loop_write(fd, cred->data, cred->size);
2449 if (r < 0)
2450 return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2451
2452 if (fchmod(fd, world_readable ? 0444 : 0400) < 0)
2453 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2454
2455 if (arg_userns_mode != USER_NAMESPACE_NO) {
2456 if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2457 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2458 }
2459 }
2460
2461 if (chmod(q, world_readable ? 0555 : 0500) < 0)
2462 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2463
2464 r = userns_lchown(q, 0, 0);
2465 if (r < 0)
2466 return r;
2467
2468 /* Make both mount and superblock read-only now */
2469 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2470 if (r < 0)
2471 return r;
2472
2473 return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2474 }
2475
2476 static int setup_kmsg(int fd_inner_socket) {
2477 _cleanup_(unlink_and_freep) char *from = NULL;
2478 _cleanup_free_ char *fifo = NULL;
2479 _cleanup_close_ int fd = -EBADF;
2480 int r;
2481
2482 assert(fd_inner_socket >= 0);
2483
2484 BLOCK_WITH_UMASK(0000);
2485
2486 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2487 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2488 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2489 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2490
2491 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2492 if (r < 0)
2493 return log_error_errno(r, "Failed to generate kmsg path: %m");
2494
2495 if (mkfifo(fifo, 0600) < 0)
2496 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2497
2498 from = TAKE_PTR(fifo);
2499
2500 r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2501 if (r < 0)
2502 return r;
2503
2504 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2505 if (fd < 0)
2506 return log_error_errno(errno, "Failed to open fifo: %m");
2507
2508 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2509 r = send_one_fd(fd_inner_socket, fd, 0);
2510 if (r < 0)
2511 return log_error_errno(r, "Failed to send FIFO fd: %m");
2512
2513 return 0;
2514 }
2515
2516 struct ExposeArgs {
2517 union in_addr_union address4;
2518 union in_addr_union address6;
2519 struct FirewallContext *fw_ctx;
2520 };
2521
2522 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2523 struct ExposeArgs *args = ASSERT_PTR(userdata);
2524
2525 assert(rtnl);
2526 assert(m);
2527
2528 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET, &args->address4);
2529 (void) expose_port_execute(rtnl, &args->fw_ctx, arg_expose_ports, AF_INET6, &args->address6);
2530 return 0;
2531 }
2532
2533 static int setup_hostname(void) {
2534 int r;
2535
2536 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2537 return 0;
2538
2539 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to set hostname: %m");
2542
2543 return 0;
2544 }
2545
/* Links the journal directory of the container with the one of the host, according to
 * arg_link_journal.
 *
 * Two paths are involved: 'p' is /var/log/journal/<arg_uuid> on the host, 'q' is the same path inside
 * the container tree rooted at 'directory'. With LINK_GUEST the host path becomes a symlink to the
 * guest directory; otherwise the host directory is bind mounted onto the guest directory (with
 * LINK_HOST it is created first if missing; in the remaining mode nothing is done if it doesn't exist).
 *
 * If 'try' is set (arg_link_journal_try, or LINK_AUTO) a number of failures are downgraded and the
 * function returns 0 without linking anything.
 *
 * Returns 0 on success or if linking was skipped, negative errno on failure. */
static int setup_journal(const char *directory) {
        _cleanup_free_ char *d = NULL;
        const char *p, *q;
        sd_id128_t this_id;
        bool try;
        int r;

        /* Don't link journals in ephemeral mode */
        if (arg_ephemeral)
                return 0;

        if (arg_link_journal == LINK_NO)
                return 0;

        try = arg_link_journal_try || arg_link_journal == LINK_AUTO;

        r = sd_id128_get_machine(&this_id);
        if (r < 0)
                return log_error_errno(r, "Failed to retrieve machine ID: %m");

        /* If host and container share the ID, both journal directories would carry the same name */
        if (sd_id128_equal(arg_uuid, this_id)) {
                log_full(try ? LOG_WARNING : LOG_ERR,
                         "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
                if (try)
                        return 0;
                return -EEXIST;
        }

        /* Create the parent directories inside the container; a read-only image is tolerated in 'try' mode */
        FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
                r = userns_mkdir(directory, dirname, 0755, 0, 0);
                if (r < 0) {
                        bool ignore = r == -EROFS && try;
                        log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
                                       "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
                        return ignore ? 0 : r;
                }
        }

        /* p: journal directory on the host, q: the same path below the container root */
        p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
        q = prefix_roota(directory, p);

        if (path_is_mount_point(p) > 0) {
                if (try)
                        return 0;

                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
                                       "%s: already a mount point, refusing to use for journal", p);
        }

        if (path_is_mount_point(q) > 0) {
                if (try)
                        return 0;

                return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
                                       "%s: already a mount point, refusing to use for journal", q);
        }

        /* Check what currently is at the host path: a symlink, something else, or nothing at all */
        r = readlink_and_make_absolute(p, &d);
        if (r >= 0) {
                /* A symlink that already points to the guest directory: just make sure the latter exists */
                if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
                    path_equal(d, q)) {

                        r = userns_mkdir(directory, p, 0755, 0, 0);
                        if (r < 0)
                                log_warning_errno(r, "Failed to create directory %s: %m", q);
                        return 0;
                }

                /* Any other symlink is removed */
                if (unlink(p) < 0)
                        return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
        } else if (r == -EINVAL) {
                /* Exists, but is not a symlink: for LINK_GUEST it has to go away, which only works
                 * for an (empty) directory */

                if (arg_link_journal == LINK_GUEST &&
                    rmdir(p) < 0) {

                        if (errno == ENOTDIR) {
                                log_error("%s already exists and is neither a symlink nor a directory", p);
                                return r;
                        } else
                                return log_error_errno(errno, "Failed to remove %s: %m", p);
                }
        } else if (r != -ENOENT)
                return log_error_errno(r, "readlink(%s) failed: %m", p);

        if (arg_link_journal == LINK_GUEST) {

                /* Make the host path a symlink to the guest directory */
                if (symlink(q, p) < 0) {
                        if (try) {
                                log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
                                return 0;
                        } else
                                return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
                }

                r = userns_mkdir(directory, p, 0755, 0, 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to create directory %s: %m", q);
                return 0;
        }

        if (arg_link_journal == LINK_HOST) {
                /* don't create parents here — if the host doesn't have
                 * permanent journal set up, don't force it here */

                r = RET_NERRNO(mkdir(p, 0755));
                if (r < 0 && r != -EEXIST) {
                        if (try) {
                                log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
                                return 0;
                        } else
                                return log_error_errno(r, "Failed to create %s: %m", p);
                }

        } else if (access(p, F_OK) < 0)
                /* Neither LINK_GUEST nor LINK_HOST: only link if the host directory exists already */
                return 0;

        if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
                log_warning("%s is not empty, proceeding anyway.", q);

        r = userns_mkdir(directory, p, 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s: %m", q);

        /* Finally, bind mount the host directory onto the guest directory */
        r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to bind mount journal from host into guest: %m");

        return 0;
}
2675
2676 static int drop_capabilities(uid_t uid) {
2677 CapabilityQuintet q;
2678
2679 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2680 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2681 * arg_caps_retain. */
2682
2683 if (capability_quintet_is_set(&arg_full_capabilities)) {
2684 q = arg_full_capabilities;
2685
2686 if (q.bounding == UINT64_MAX)
2687 q.bounding = uid == 0 ? arg_caps_retain : 0;
2688
2689 if (q.effective == UINT64_MAX)
2690 q.effective = uid == 0 ? q.bounding : 0;
2691
2692 if (q.inheritable == UINT64_MAX)
2693 q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2694
2695 if (q.permitted == UINT64_MAX)
2696 q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2697
2698 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2699 q.ambient = arg_caps_ambient;
2700
2701 if (capability_quintet_mangle(&q))
2702 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2703
2704 } else {
2705 q = (CapabilityQuintet) {
2706 .bounding = arg_caps_retain,
2707 .effective = uid == 0 ? arg_caps_retain : 0,
2708 .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2709 .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2710 .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2711 };
2712
2713 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2714 * in order to maintain the same behavior as systemd < 242. */
2715 if (capability_quintet_mangle(&q))
2716 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2717 "Some capabilities will not be set because they are not in the current bounding set.");
2718
2719 }
2720
2721 return capability_quintet_enforce(&q);
2722 }
2723
2724 static int reset_audit_loginuid(void) {
2725 _cleanup_free_ char *p = NULL;
2726 int r;
2727
2728 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2729 return 0;
2730
2731 if (!arg_privileged)
2732 return 0;
2733
2734 r = read_one_line_file("/proc/self/loginuid", &p);
2735 if (r == -ENOENT)
2736 return 0;
2737 if (r < 0)
2738 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2739
2740 /* Already reset? */
2741 if (streq(p, "4294967295"))
2742 return 0;
2743
2744 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2745 if (r < 0) {
2746 log_error_errno(r,
2747 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2748 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2749 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2750 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2751 "using systemd-nspawn. Sleeping for 5s... (%m)");
2752
2753 sleep(5);
2754 }
2755
2756 return 0;
2757 }
2758
2759 static int mount_tunnel_dig(const char *root) {
2760 const char *p, *q;
2761 int r;
2762
2763 if (!arg_privileged) {
2764 log_debug("Not digging mount tunnel, because running unprivileged.");
2765 return 0;
2766 }
2767
2768 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2769 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2770 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2771 (void) mkdir_p(p, 0600);
2772
2773 r = make_run_host(root);
2774 if (r < 0)
2775 return r;
2776
2777 r = userns_mkdir(root, NSPAWN_MOUNT_TUNNEL, 0600, 0, 0);
2778 if (r < 0)
2779 return log_error_errno(r, "Failed to create "NSPAWN_MOUNT_TUNNEL": %m");
2780
2781 q = prefix_roota(root, NSPAWN_MOUNT_TUNNEL);
2782 r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2783 if (r < 0)
2784 return r;
2785
2786 r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2787 if (r < 0)
2788 return r;
2789
2790 return 0;
2791 }
2792
2793 static int mount_tunnel_open(void) {
2794 int r;
2795
2796 if (!arg_privileged) {
2797 log_debug("Not opening up mount tunnel, because running unprivileged.");
2798 return 0;
2799 }
2800
2801 r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
2802 if (r < 0)
2803 return r;
2804
2805 return 0;
2806 }
2807
2808 static int setup_machine_id(const char *directory) {
2809 int r;
2810
2811 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2812 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2813 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2814 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2815 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2816 * container behaves nicely). */
2817
2818 r = id128_get_machine(directory, &arg_uuid);
2819 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) {
2820 /* If the file is missing, empty, or uninitialized, we don't mind */
2821 if (sd_id128_is_null(arg_uuid)) {
2822 r = sd_id128_randomize(&arg_uuid);
2823 if (r < 0)
2824 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2825 }
2826 } else if (r < 0)
2827 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2828
2829 return 0;
2830 }
2831
2832 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2833 int r;
2834
2835 assert(directory);
2836
2837 if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2838 return 0;
2839
2840 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2841 if (r == -EOPNOTSUPP)
2842 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2843 if (r == -EBADE)
2844 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2845 if (r < 0)
2846 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2847 if (r == 0)
2848 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2849 else
2850 log_debug("Patched directory tree to match UID/GID range.");
2851
2852 return r;
2853 }
2854
2855 /*
2856 * Return values:
2857 * < 0 : wait_for_terminate() failed to get the state of the
2858 * container, the container was terminated by a signal, or
2859 * failed for an unknown reason. No change is made to the
2860 * container argument.
2861 * > 0 : The program executed in the container terminated with an
2862 * error. The exit code of the program executed in the
2863 * container is returned. The container argument has been set
2864 * to CONTAINER_TERMINATED.
2865 * 0 : The container is being rebooted, has been shut down or exited
2866 * successfully. The container argument has been set to either
2867 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2868 *
2869 * That is, success is indicated by a return value of zero, and an
2870 * error is indicated by a non-zero value.
2871 */
2872 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2873 siginfo_t status;
2874 int r;
2875
2876 r = wait_for_terminate(pid, &status);
2877 if (r < 0)
2878 return log_warning_errno(r, "Failed to wait for container: %m");
2879
2880 switch (status.si_code) {
2881
2882 case CLD_EXITED:
2883 if (status.si_status == 0)
2884 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2885 else
2886 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2887
2888 *container = CONTAINER_TERMINATED;
2889 return status.si_status;
2890
2891 case CLD_KILLED:
2892 if (status.si_status == SIGINT) {
2893 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2894 *container = CONTAINER_TERMINATED;
2895 return 0;
2896
2897 } else if (status.si_status == SIGHUP) {
2898 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2899 *container = CONTAINER_REBOOTED;
2900 return 0;
2901 }
2902
2903 _fallthrough_;
2904 case CLD_DUMPED:
2905 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2906 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2907
2908 default:
2909 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2910 "Container %s failed due to unknown reason.", arg_machine);
2911 }
2912 }
2913
2914 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2915 pid_t pid;
2916
2917 pid = PTR_TO_PID(userdata);
2918 if (pid > 0) {
2919 if (kill(pid, arg_kill_signal) >= 0) {
2920 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2921 sd_event_source_set_userdata(s, NULL);
2922 return 0;
2923 }
2924 }
2925
2926 sd_event_exit(sd_event_source_get_event(s), 0);
2927 return 0;
2928 }
2929
2930 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2931 pid_t pid;
2932
2933 assert(s);
2934 assert(ssi);
2935
2936 pid = PTR_TO_PID(userdata);
2937
2938 for (;;) {
2939 siginfo_t si = {};
2940
2941 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2942 return log_error_errno(errno, "Failed to waitid(): %m");
2943 if (si.si_pid == 0) /* No pending children. */
2944 break;
2945 if (si.si_pid == pid) {
2946 /* The main process we care for has exited. Return from
2947 * signal handler but leave the zombie. */
2948 sd_event_exit(sd_event_source_get_event(s), 0);
2949 break;
2950 }
2951
2952 /* Reap all other children. */
2953 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2954 }
2955
2956 return 0;
2957 }
2958
2959 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2960 pid_t pid;
2961
2962 assert(m);
2963
2964 pid = PTR_TO_PID(userdata);
2965
2966 if (arg_kill_signal > 0) {
2967 log_info("Container termination requested. Attempting to halt container.");
2968 (void) kill(pid, arg_kill_signal);
2969 } else {
2970 log_info("Container termination requested. Exiting.");
2971 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2972 }
2973
2974 return 0;
2975 }
2976
2977 static int pick_paths(void) {
2978 int r;
2979
2980 if (arg_directory) {
2981 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
2982 PickFilter filter = pick_filter_image_dir;
2983
2984 filter.architecture = arg_architecture;
2985
2986 r = path_pick_update_warn(
2987 &arg_directory,
2988 &filter,
2989 PICK_ARCHITECTURE|PICK_TRIES,
2990 &result);
2991 if (r < 0) {
2992 /* Accept ENOENT here so that the --template= logic can work */
2993 if (r != -ENOENT)
2994 return r;
2995 } else
2996 arg_architecture = result.architecture;
2997 }
2998
2999 if (arg_image) {
3000 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3001 PickFilter filter = pick_filter_image_raw;
3002
3003 filter.architecture = arg_architecture;
3004
3005 r = path_pick_update_warn(
3006 &arg_image,
3007 &filter,
3008 PICK_ARCHITECTURE|PICK_TRIES,
3009 &result);
3010 if (r < 0)
3011 return r;
3012
3013 arg_architecture = result.architecture;
3014 }
3015
3016 if (arg_template) {
3017 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3018 PickFilter filter = pick_filter_image_dir;
3019
3020 filter.architecture = arg_architecture;
3021
3022 r = path_pick_update_warn(
3023 &arg_template,
3024 &filter,
3025 PICK_ARCHITECTURE,
3026 &result);
3027 if (r < 0)
3028 return r;
3029
3030 arg_architecture = result.architecture;
3031 }
3032
3033 return 0;
3034 }
3035
3036 static int determine_names(void) {
3037 int r;
3038
3039 if (arg_template && !arg_directory && arg_machine) {
3040
3041 /* If --template= was specified then we should not search for a machine, but instead create a
3042 * new one in /var/lib/machine. */
3043
3044 arg_directory = path_join("/var/lib/machines", arg_machine);
3045 if (!arg_directory)
3046 return log_oom();
3047 }
3048
3049 if (!arg_image && !arg_directory) {
3050 if (arg_machine) {
3051 _cleanup_(image_unrefp) Image *i = NULL;
3052
3053 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3054 if (r == -ENOENT)
3055 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3056 if (r < 0)
3057 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3058
3059 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3060 r = free_and_strdup(&arg_image, i->path);
3061 else
3062 r = free_and_strdup(&arg_directory, i->path);
3063 if (r < 0)
3064 return log_oom();
3065
3066 if (!arg_ephemeral)
3067 arg_read_only = arg_read_only || i->read_only;
3068 } else {
3069 r = safe_getcwd(&arg_directory);
3070 if (r < 0)
3071 return log_error_errno(r, "Failed to determine current directory: %m");
3072 }
3073
3074 if (!arg_directory && !arg_image)
3075 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3076 }
3077
3078 if (!arg_machine) {
3079 if (arg_directory && path_equal(arg_directory, "/")) {
3080 arg_machine = gethostname_malloc();
3081 if (!arg_machine)
3082 return log_oom();
3083 } else if (arg_image) {
3084 char *e;
3085
3086 r = path_extract_filename(arg_image, &arg_machine);
3087 if (r < 0)
3088 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
3089
3090 /* Truncate suffix if there is one */
3091 e = endswith(arg_machine, ".raw");
3092 if (e)
3093 *e = 0;
3094 } else {
3095 r = path_extract_filename(arg_directory, &arg_machine);
3096 if (r < 0)
3097 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
3098 }
3099
3100 hostname_cleanup(arg_machine);
3101 if (!hostname_is_valid(arg_machine, 0))
3102 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3103
3104 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3105 * to match fixed config file names. */
3106 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3107 if (!arg_settings_filename)
3108 return log_oom();
3109
3110 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3111 * instances at once without manually having to specify -M each time. */
3112 if (arg_ephemeral)
3113 if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3114 return log_oom();
3115 } else {
3116 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3117 if (!arg_settings_filename)
3118 return log_oom();
3119 }
3120
3121 return 0;
3122 }
3123
3124 static int chase_and_update(char **p, unsigned flags) {
3125 char *chased;
3126 int r;
3127
3128 assert(p);
3129
3130 if (!*p)
3131 return 0;
3132
3133 r = chase(*p, NULL, flags, &chased, NULL);
3134 if (r < 0)
3135 return log_error_errno(r, "Failed to resolve path %s: %m", *p);
3136
3137 return free_and_replace(*p, chased);
3138 }
3139
3140 static int determine_uid_shift(const char *directory) {
3141
3142 if (arg_userns_mode == USER_NAMESPACE_NO) {
3143 arg_uid_shift = 0;
3144 return 0;
3145 }
3146
3147 if (arg_uid_shift == UID_INVALID) {
3148 struct stat st;
3149
3150 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3151
3152 if (stat(directory, &st) < 0)
3153 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3154
3155 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3156
3157 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3158 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3159 "UID and GID base of %s don't match.", directory);
3160
3161 arg_uid_range = UINT32_C(0x10000);
3162
3163 if (arg_uid_shift != 0) {
3164 /* If the image is shifted already, then we'll fall back to classic chowning, for
3165 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3166
3167 if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3168 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3169 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3170 } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3171 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3172 "UID base of %s is not zero, UID mapping not supported.", directory);
3173 }
3174 }
3175
3176 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3177 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3178
3179 return 0;
3180 }
3181
3182 static unsigned long effective_clone_ns_flags(void) {
3183 unsigned long flags = arg_clone_ns_flags;
3184
3185 if (arg_private_network)
3186 flags |= CLONE_NEWNET;
3187 if (arg_use_cgns)
3188 flags |= CLONE_NEWCGROUP;
3189 if (arg_userns_mode != USER_NAMESPACE_NO)
3190 flags |= CLONE_NEWUSER;
3191
3192 return flags;
3193 }
3194
3195 static int patch_sysctl(void) {
3196
3197 /* This table is inspired by runc's sysctl() function */
3198 static const struct {
3199 const char *key;
3200 bool prefix;
3201 unsigned long clone_flags;
3202 } safe_sysctl[] = {
3203 { "kernel.hostname", false, CLONE_NEWUTS },
3204 { "kernel.domainname", false, CLONE_NEWUTS },
3205 { "kernel.msgmax", false, CLONE_NEWIPC },
3206 { "kernel.msgmnb", false, CLONE_NEWIPC },
3207 { "kernel.msgmni", false, CLONE_NEWIPC },
3208 { "kernel.sem", false, CLONE_NEWIPC },
3209 { "kernel.shmall", false, CLONE_NEWIPC },
3210 { "kernel.shmmax", false, CLONE_NEWIPC },
3211 { "kernel.shmmni", false, CLONE_NEWIPC },
3212 { "fs.mqueue.", true, CLONE_NEWIPC },
3213 { "net.", true, CLONE_NEWNET },
3214 };
3215
3216 unsigned long flags;
3217 int r;
3218
3219 flags = effective_clone_ns_flags();
3220
3221 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3222 bool good = false;
3223 size_t i;
3224
3225 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3226
3227 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3228 continue;
3229
3230 if (safe_sysctl[i].prefix)
3231 good = startswith(*k, safe_sysctl[i].key);
3232 else
3233 good = streq(*k, safe_sysctl[i].key);
3234
3235 if (good)
3236 break;
3237 }
3238
3239 if (!good)
3240 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3241
3242 r = sysctl_write(*k, *v);
3243 if (r < 0)
3244 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3245 }
3246
3247 return 0;
3248 }
3249
3250 static int inner_child(
3251 Barrier *barrier,
3252 int fd_inner_socket,
3253 FDSet *fds,
3254 char **os_release_pairs) {
3255
3256 _cleanup_free_ char *home = NULL;
3257 size_t n_env = 1;
3258 char *envp[] = {
3259 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3260 NULL, /* container */
3261 NULL, /* TERM */
3262 NULL, /* HOME */
3263 NULL, /* USER */
3264 NULL, /* LOGNAME */
3265 NULL, /* container_uuid */
3266 NULL, /* LISTEN_FDS */
3267 NULL, /* LISTEN_PID */
3268 NULL, /* NOTIFY_SOCKET */
3269 NULL, /* CREDENTIALS_DIRECTORY */
3270 NULL, /* LANG */
3271 NULL
3272 };
3273 const char *exec_target;
3274 _cleanup_strv_free_ char **env_use = NULL;
3275 int r, which_failed;
3276
3277 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3278 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3279 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3280 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3281 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3282 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3283 * namespace.
3284 *
3285 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3286 * unshare(). See below. */
3287
3288 assert(barrier);
3289 assert(fd_inner_socket >= 0);
3290
3291 log_debug("Inner child is initializing.");
3292
3293 if (arg_userns_mode != USER_NAMESPACE_NO) {
3294 /* Tell the parent, that it now can write the UID map. */
3295 (void) barrier_place(barrier); /* #1 */
3296
3297 /* Wait until the parent wrote the UID map */
3298 if (!barrier_place_and_sync(barrier)) /* #2 */
3299 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3300
3301 /* Become the new root user inside our namespace */
3302 r = reset_uid_gid();
3303 if (r < 0)
3304 return log_error_errno(r, "Couldn't become new root: %m");
3305
3306 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3307 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3308 * propagation, but simply create new peer groups for all our mounts). */
3309 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3310 if (r < 0)
3311 return r;
3312 }
3313
3314 r = mount_all(NULL,
3315 arg_mount_settings | MOUNT_IN_USERNS,
3316 arg_uid_shift,
3317 arg_selinux_apifs_context);
3318 if (r < 0)
3319 return r;
3320
3321 if (!arg_network_namespace_path && arg_private_network) {
3322 _cleanup_close_ int netns_fd = -EBADF;
3323
3324 if (arg_privileged) {
3325 if (unshare(CLONE_NEWNET) < 0)
3326 return log_error_errno(errno, "Failed to unshare network namespace: %m");
3327 }
3328
3329 netns_fd = namespace_open_by_type(NAMESPACE_NET);
3330 if (netns_fd < 0)
3331 return log_error_errno(netns_fd, "Failed to open newly allocate network namespace: %m");
3332
3333 r = send_one_fd(fd_inner_socket, netns_fd, 0);
3334 if (r < 0)
3335 return log_error_errno(r, "Failed to send network namespace to supervisor: %m");
3336
3337 /* Tell the parent that it can setup network interfaces. */
3338 (void) barrier_place(barrier); /* #3 */
3339 }
3340
3341 if (arg_privileged) {
3342 r = mount_sysfs(NULL, arg_mount_settings);
3343 if (r < 0)
3344 return r;
3345 }
3346
3347 /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */
3348 if (!barrier_place_and_sync(barrier)) /* #4 */
3349 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3350 "Parent died too early");
3351
3352 if (arg_use_cgns) {
3353 r = unshare(CLONE_NEWCGROUP);
3354 if (r < 0)
3355 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3356 r = mount_cgroups(
3357 "",
3358 arg_unified_cgroup_hierarchy,
3359 arg_userns_mode != USER_NAMESPACE_NO,
3360 arg_uid_shift,
3361 arg_uid_range,
3362 arg_selinux_apifs_context,
3363 true);
3364 } else
3365 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3366 if (r < 0)
3367 return r;
3368
3369 r = setup_boot_id();
3370 if (r < 0)
3371 return r;
3372
3373 r = setup_kmsg(fd_inner_socket);
3374 if (r < 0)
3375 return r;
3376
3377 r = mount_custom(
3378 "/",
3379 arg_custom_mounts,
3380 arg_n_custom_mounts,
3381 0,
3382 0,
3383 arg_selinux_apifs_context,
3384 MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3385 if (r < 0)
3386 return r;
3387
3388 if (setsid() < 0)
3389 return log_error_errno(errno, "setsid() failed: %m");
3390
3391 if (arg_private_network)
3392 (void) loopback_setup();
3393
3394 if (arg_expose_ports) {
3395 r = expose_port_send_rtnl(fd_inner_socket);
3396 if (r < 0)
3397 return r;
3398 }
3399
3400 if (arg_console_mode != CONSOLE_PIPE) {
3401 _cleanup_close_ int master = -EBADF;
3402 _cleanup_free_ char *console = NULL;
3403
3404 /* Allocate a pty and make it available as /dev/console. */
3405 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3406 if (master < 0)
3407 return log_error_errno(master, "Failed to allocate a pty: %m");
3408
3409 r = setup_dev_console(console);
3410 if (r < 0)
3411 return log_error_errno(r, "Failed to set up /dev/console: %m");
3412
3413 r = send_one_fd(fd_inner_socket, master, 0);
3414 if (r < 0)
3415 return log_error_errno(r, "Failed to send master fd: %m");
3416
3417 r = setup_stdio_as_dev_console();
3418 if (r < 0)
3419 return r;
3420 }
3421
3422 r = patch_sysctl();
3423 if (r < 0)
3424 return r;
3425
3426 if (arg_oom_score_adjust_set) {
3427 r = set_oom_score_adjust(arg_oom_score_adjust);
3428 if (r < 0)
3429 return log_error_errno(r, "Failed to adjust OOM score: %m");
3430 }
3431
3432 if (arg_cpu_set.set)
3433 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3434 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3435
3436 (void) setup_hostname();
3437
3438 if (arg_personality != PERSONALITY_INVALID) {
3439 r = safe_personality(arg_personality);
3440 if (r < 0)
3441 return log_error_errno(r, "personality() failed: %m");
3442 #ifdef ARCHITECTURE_SECONDARY
3443 } else if (arg_architecture == ARCHITECTURE_SECONDARY) {
3444 r = safe_personality(PER_LINUX32);
3445 if (r < 0)
3446 return log_error_errno(r, "personality() failed: %m");
3447 #endif
3448 } else if (!arg_quiet && arg_architecture >= 0 && arg_architecture != native_architecture())
3449 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3450 "invocation with qemu userspace emulator (or equivalent) in effect.",
3451 architecture_to_string(arg_architecture));
3452
3453 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3454 if (r < 0)
3455 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3456
3457 #if HAVE_SECCOMP
3458 if (arg_seccomp) {
3459
3460 if (is_seccomp_available()) {
3461 r = seccomp_load(arg_seccomp);
3462 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
3463 return log_error_errno(r, "Failed to install seccomp filter: %m");
3464 if (r < 0)
3465 log_debug_errno(r, "Failed to install seccomp filter: %m");
3466 }
3467 } else
3468 #endif
3469 {
3470 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3471 if (r < 0)
3472 return r;
3473 }
3474
3475 if (arg_suppress_sync) {
3476 #if HAVE_SECCOMP
3477 r = seccomp_suppress_sync();
3478 if (r < 0)
3479 log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3480 #else
3481 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3482 #endif
3483 }
3484
3485 #if HAVE_SELINUX
3486 if (arg_selinux_context)
3487 if (setexeccon(arg_selinux_context) < 0)
3488 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3489 #endif
3490
3491 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3492 * if we need to later on. */
3493 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3494 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3495
3496 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3497 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3498 else
3499 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3500 if (r < 0)
3501 return r;
3502
3503 r = drop_capabilities(getuid());
3504 if (r < 0)
3505 return log_error_errno(r, "Dropping capabilities failed: %m");
3506
3507 if (arg_no_new_privileges)
3508 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3509 return log_error_errno(errno, "Failed to disable new privileges: %m");
3510
3511 /* LXC sets container=lxc, so follow the scheme here */
3512 envp[n_env++] = strjoina("container=", arg_container_service_name);
3513
3514 envp[n_env] = strv_find_prefix(environ, "TERM=");
3515 if (envp[n_env])
3516 n_env++;
3517
3518 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3519 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3520 return log_oom();
3521
3522 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3523 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3524 asprintf(envp + n_env++, "LOGNAME=%s", arg_user ?: "root") < 0)
3525 return log_oom();
3526
3527 assert(!sd_id128_is_null(arg_uuid));
3528
3529 if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3530 return log_oom();
3531
3532 if (!fdset_isempty(fds)) {
3533 r = fdset_cloexec(fds, false);
3534 if (r < 0)
3535 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3536
3537 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3538 (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3539 return log_oom();
3540 }
3541 if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3542 return log_oom();
3543
3544 if (arg_credentials.n_credentials > 0) {
3545 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3546 if (!envp[n_env])
3547 return log_oom();
3548 n_env++;
3549 }
3550
3551 if (arg_start_mode != START_BOOT) {
3552 envp[n_env] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE);
3553 if (!envp[n_env])
3554 return log_oom();
3555 n_env++;
3556 }
3557
3558 env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3559 if (!env_use)
3560 return log_oom();
3561
3562 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
3563 if (!barrier_place_and_sync(barrier)) /* #5 */
3564 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3565
3566 /* Note, this should be done this late (💣 and not moved earlier! 💣), so that all namespacing
3567 * changes are already in effect by now, so that any resolved paths here definitely reference
3568 * resources inside the container, and not outside of them. */
3569 if (arg_chdir)
3570 if (chdir(arg_chdir) < 0)
3571 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3572
3573 if (arg_start_mode == START_PID2) {
3574 r = stub_pid1(arg_uuid);
3575 if (r < 0)
3576 return r;
3577 }
3578
3579 if (arg_console_mode != CONSOLE_PIPE) {
3580 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3581 * are configured for that. Acquire it as controlling tty. */
3582 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3583 return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3584 }
3585
3586 log_debug("Inner child completed, invoking payload.");
3587
3588 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3589 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3590 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3591 log_close();
3592 log_set_open_when_needed(true);
3593 log_settle_target();
3594
3595 (void) fdset_close_others(fds);
3596
3597 if (arg_start_mode == START_BOOT) {
3598 char **a;
3599 size_t m;
3600
3601 /* Automatically search for the init system */
3602
3603 m = strv_length(arg_parameters);
3604 a = newa(char*, m + 2);
3605 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3606 a[1 + m] = NULL;
3607
3608 FOREACH_STRING(init,
3609 "/usr/lib/systemd/systemd",
3610 "/lib/systemd/systemd",
3611 "/sbin/init") {
3612 a[0] = (char*) init;
3613 execve(a[0], a, env_use);
3614 }
3615
3616 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3617 } else if (!strv_isempty(arg_parameters)) {
3618 const char *dollar_path;
3619
3620 exec_target = arg_parameters[0];
3621
3622 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3623 * binary. */
3624 dollar_path = strv_env_get(env_use, "PATH");
3625 if (dollar_path) {
3626 if (setenv("PATH", dollar_path, 1) < 0)
3627 return log_error_errno(errno, "Failed to update $PATH: %m");
3628 }
3629
3630 execvpe(arg_parameters[0], arg_parameters, env_use);
3631 } else {
3632 if (!arg_chdir)
3633 /* If we cannot change the directory, we'll end up in /, that is expected. */
3634 (void) chdir(home ?: "/root");
3635
3636 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3637 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3638 execle("/bin/bash", "-bash", NULL, env_use);
3639 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3640 execle("/bin/sh", "-sh", NULL, env_use);
3641
3642 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3643 }
3644
3645 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3646 }
3647
3648 static int setup_notify_child(const void *directory) {
3649 _cleanup_close_ int fd = -EBADF;
3650 _cleanup_free_ char *j = NULL;
3651 union sockaddr_union sa = {
3652 .un.sun_family = AF_UNIX,
3653 };
3654 int r;
3655
3656 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3657 if (fd < 0)
3658 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3659
3660 if (directory) {
3661 j = path_join(directory, NSPAWN_NOTIFY_SOCKET_PATH);
3662 if (!j)
3663 return log_oom();
3664 }
3665
3666 r = sockaddr_un_set_path(&sa.un, j ?: NSPAWN_NOTIFY_SOCKET_PATH);
3667 if (r < 0)
3668 return log_error_errno(r, "Failed to set AF_UNIX path to %s: %m", j ?: NSPAWN_NOTIFY_SOCKET_PATH);
3669
3670 (void) mkdir_parents(sa.un.sun_path, 0755);
3671 (void) sockaddr_un_unlink(&sa.un);
3672
3673 WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */
3674 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3675 if (r < 0)
3676 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3677 }
3678
3679 r = userns_lchown(sa.un.sun_path, 0, 0);
3680 if (r < 0)
3681 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3682
3683 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3684 if (r < 0)
3685 return log_error_errno(r, "SO_PASSCRED failed: %m");
3686
3687 return TAKE_FD(fd);
3688 }
3689
3690 static int setup_unix_export_dir_outside(char **ret) {
3691 int r;
3692
3693 assert(ret);
3694
3695 if (!arg_privileged) {
3696 log_debug("Not digging socket tunnel, because running unprivileged.");
3697 return 0;
3698 }
3699
3700 _cleanup_free_ char *p = NULL;
3701 p = path_join("/run/systemd/nspawn/unix-export", arg_machine);
3702 if (!p)
3703 return log_oom();
3704
3705 r = path_is_mount_point(p);
3706 if (r > 0)
3707 return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Mount point '%s' exists already, refusing.", p);
3708 if (r < 0 && r != -ENOENT)
3709 return log_error_errno(r, "Failed to detect if '%s' is a mount point: %m", p);
3710
3711 r = mkdir_p(p, 0755);
3712 if (r < 0)
3713 return log_error_errno(r, "Failed to create '%s': %m", p);
3714
3715 _cleanup_(rmdir_and_freep) char *q = TAKE_PTR(p);
3716
3717 /* Mount the "unix export" directory really tiny, just 64 inodes. We mark the superblock writable
3718 * (since the container shall bind sockets into it). */
3719 r = mount_nofollow_verbose(
3720 LOG_ERR,
3721 "tmpfs",
3722 q,
3723 "tmpfs",
3724 MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
3725 "size=4M,nr_inodes=64,mode=0755");
3726 if (r < 0)
3727 return r;
3728
3729 _cleanup_(umount_and_rmdir_and_freep) char *w = TAKE_PTR(q);
3730
3731 /* After creating the superblock we change the bind mount to be read-only. This means that the fs
3732 * itself is writable, but not through the mount accessible from the host. */
3733 r = mount_nofollow_verbose(
3734 LOG_ERR,
3735 /* source= */ NULL,
3736 w,
3737 /* fstype= */ NULL,
3738 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
3739 /* options= */ NULL);
3740 if (r < 0)
3741 return r;
3742
3743 *ret = TAKE_PTR(w);
3744 return 0;
3745 }
3746
3747 static int setup_unix_export_host_inside(const char *directory, const char *unix_export_path) {
3748 int r;
3749
3750 assert(directory);
3751
3752 if (!arg_privileged)
3753 return 0;
3754
3755 assert(unix_export_path);
3756
3757 r = make_run_host(directory);
3758 if (r < 0)
3759 return r;
3760
3761 _cleanup_free_ char *p = path_join(directory, "run/host/unix-export");
3762 if (!p)
3763 return log_oom();
3764
3765 if (mkdir(p, 0755) < 0)
3766 return log_error_errno(errno, "Failed to create '%s': %m", p);
3767
3768 r = mount_nofollow_verbose(
3769 LOG_ERR,
3770 unix_export_path,
3771 p,
3772 /* fstype= */ NULL,
3773 MS_BIND,
3774 /* options= */ NULL);
3775 if (r < 0)
3776 return r;
3777
3778 r = mount_nofollow_verbose(
3779 LOG_ERR,
3780 /* source= */ NULL,
3781 p,
3782 /* fstype= */ NULL,
3783 MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported(),
3784 /* options= */ NULL);
3785 if (r < 0)
3786 return r;
3787
3788 r = userns_lchown(p, 0, 0);
3789 if (r < 0)
3790 return log_error_errno(r, "Failed to chown '%s': %m", p);
3791
3792 return 0;
3793 }
3794
3795 static DissectImageFlags determine_dissect_image_flags(void) {
3796 return
3797 DISSECT_IMAGE_GENERIC_ROOT |
3798 DISSECT_IMAGE_REQUIRE_ROOT |
3799 DISSECT_IMAGE_RELAX_VAR_CHECK |
3800 DISSECT_IMAGE_USR_NO_ROOT |
3801 DISSECT_IMAGE_DISCARD_ON_LOOP |
3802 DISSECT_IMAGE_ADD_PARTITION_DEVICES |
3803 DISSECT_IMAGE_PIN_PARTITION_DEVICES |
3804 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
3805 DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
3806 (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0);
3807 }
3808
/* Body of the "outer" child process. It prepares the container's root tree below 'directory' (mounting the
 * dissected image if one is passed, negotiating the UID shift with the parent, setting up ID mapped mounts
 * or chown()ing, API mounts, device nodes, /run/host, credentials, custom mounts, timezone, resolv.conf,
 * machine ID, journal and cgroups), then forks off the "inner" child via raw_clone() and reports the inner
 * child's PID, the machine ID and the notify socket fd to the parent over 'fd_outer_socket'.
 *
 * Parameters:
 *   barrier          - handed through to inner_child()
 *   directory        - root tree to set up; if it is "/", a recursive bind mount of it at
 *                      /run/systemd/nspawn-root is used instead
 *   dissected_image  - optional (may be NULL); if set, its file systems are mounted below 'directory'
 *   fd_outer_socket  - socket to the parent, used for all send()/recv()/send_one_fd() calls in here
 *   fd_inner_socket  - handed through to inner_child()
 *   fds              - handed through to inner_child()
 *   netns_fd         - network namespace the inner child enters if arg_network_namespace_path is set
 *   unix_export_path - handed to setup_unix_export_host_inside()
 *
 * Returns 0 on success and a negative errno-style error on failure. Note that this function also returns
 * (with an error) in the freshly forked inner child if its early setup fails; once inner_child() has been
 * invoked the inner child terminates via _exit() instead. */
static int outer_child(
                Barrier *barrier,
                const char *directory,
                DissectedImage *dissected_image,
                int fd_outer_socket,
                int fd_inner_socket,
                FDSet *fds,
                int netns_fd,
                const char *unix_export_path) {

        _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
        _cleanup_strv_free_ char **os_release_pairs = NULL;
        _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
        bool idmap = false;
        const char *p;
        pid_t pid;
        ssize_t l;
        int r;

        /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
         * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
         * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
         * namespaces. After it completed a number of initializations a second child (the "inner" one) is
         * forked off it, and it exits. */

        assert(barrier);
        assert(directory);
        assert(fd_outer_socket >= 0);
        assert(fd_inner_socket >= 0);

        log_debug("Outer child is initializing.");

        /* Best effort: a failure to read the host's os-release is only logged at debug level. The pairs
         * are later handed to inner_child(). */
        r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
        if (r < 0)
                log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");

        if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");

        r = reset_audit_loginuid();
        if (r < 0)
                return r;

        /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
         * mounts to the real root. */
        r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
        if (r < 0)
                return r;

        if (dissected_image) {
                /* If we are operating on a disk image, then mount its root directory now, but leave out the
                 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
                 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
                 * right place right away. This makes sure ESP partitions and userns are compatible. */

                r = dissected_image_mount_and_warn(
                                dissected_image,
                                directory,
                                arg_uid_shift,
                                arg_uid_range,
                                /* userns_fd= */ -EBADF,
                                determine_dissect_image_flags()|
                                DISSECT_IMAGE_MOUNT_ROOT_ONLY|
                                (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
                if (r < 0)
                        return r;
        }

        r = determine_uid_shift(directory);
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Pin our own mount namespace and hand the fd to the parent; our copy is closed right
                 * after sending. */
                r = namespace_open(0,
                                   /* ret_pidns_fd = */ NULL,
                                   &mntns_fd,
                                   /* ret_netns_fd = */ NULL,
                                   /* ret_userns_fd = */ NULL,
                                   /* ret_root_fd = */ NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to pin outer mount namespace: %m");

                l = send_one_fd(fd_outer_socket, mntns_fd, 0);
                if (l < 0)
                        return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
                mntns_fd = safe_close(mntns_fd);

                /* Let the parent know which UID shift we read from the image */
                l = send(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
                if (l < 0)
                        return log_error_errno(errno, "Failed to send UID shift: %m");
                if (l != sizeof(arg_uid_shift))
                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                               "Short write while sending UID shift.");

                if (arg_userns_mode == USER_NAMESPACE_PICK) {
                        /* When we are supposed to pick the UID shift, the parent will check now whether the
                         * UID shift we just read from the image is available. If yes, it will send the UID
                         * shift back to us, if not it will pick a different one, and send it back to us. */

                        l = recv(fd_outer_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
                        if (l < 0)
                                return log_error_errno(errno, "Failed to recv UID shift: %m");
                        if (l != sizeof(arg_uid_shift))
                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                                       "Short read while receiving UID shift.");
                }

                log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
                         "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
        }

        if (path_equal(directory, "/")) {
                /* If the directory we shall boot is the host, let's operate on a bind mount at a different
                 * place, so that we can make changes to its mount structure (for example, to implement
                 * --volatile=) without this interfering with our ability to access files such as
                 * /etc/localtime to copy into the container. Note that we use a fixed place for this
                 * (instead of a temporary directory, since we are living in our own mount namespace here
                 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
                (void) mkdir_p("/run/systemd/nspawn-root", 0755);

                r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
                if (r < 0)
                        return r;

                directory = "/run/systemd/nspawn-root";
        }

        /* Make sure we always have a mount that we can move to root later on. */
        r = make_mount_point(directory);
        if (r < 0)
                return r;

        /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
         * mount namespace. For the directory we are going to run our container let's turn this off, so that
         * we'll live in our own little world from now on, and propagation from the host may only happen via
         * the mount tunnel dir, or not at all. */
        r = mount_follow_verbose(LOG_ERR, NULL, directory, NULL, MS_PRIVATE|MS_REC, NULL);
        if (r < 0)
                return r;

        r = setup_pivot_root(
                        directory,
                        arg_pivot_root_new,
                        arg_pivot_root_old);
        if (r < 0)
                return r;

        r = setup_volatile_mode(
                        directory,
                        arg_volatile_mode,
                        arg_uid_shift,
                        arg_selinux_apifs_context);
        if (r < 0)
                return r;

        r = bind_user_prepare(
                        directory,
                        arg_bind_user,
                        arg_uid_shift,
                        arg_uid_range,
                        &arg_custom_mounts, &arg_n_custom_mounts,
                        &bind_user_context);
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
                /* Send the user maps we determined to the parent, so that it installs it in our user
                 * namespace UID map table */

                for (size_t i = 0; i < bind_user_context->n_data; i++) {
                        /* One message per bound user, in this order: payload UID, host UID, payload
                         * GID, host GID. */
                        uid_t map[] = {
                                bind_user_context->data[i].payload_user->uid,
                                bind_user_context->data[i].host_user->uid,
                                (uid_t) bind_user_context->data[i].payload_group->gid,
                                (uid_t) bind_user_context->data[i].host_group->gid,
                        };

                        l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
                        if (l < 0)
                                return log_error_errno(errno, "Failed to send user UID map: %m");
                        if (l != sizeof(map))
                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                                       "Short write while sending user UID map.");
                }
        }

        /* First pass over the custom mounts: only those selected by MOUNT_ROOT_ONLY. The remaining ones
         * are processed further down with MOUNT_NON_ROOT_ONLY. */
        r = mount_custom(
                        directory,
                        arg_custom_mounts,
                        arg_n_custom_mounts,
                        arg_uid_shift,
                        arg_uid_range,
                        arg_selinux_apifs_context,
                        MOUNT_ROOT_ONLY);
        if (r < 0)
                return r;

        if (arg_userns_mode != USER_NAMESPACE_NO &&
            IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
            arg_uid_shift != 0) {
                _cleanup_free_ char *usr_subtree = NULL;
                char *dirs[3];
                size_t i = 0;

                /* NULL-terminated list of at most two trees to remount ID mapped: the root directory,
                 * plus <directory>/usr if the image carries a separate /usr partition. */
                dirs[i++] = (char*) directory;

                if (dissected_image && dissected_image->partitions[PARTITION_USR].found) {
                        usr_subtree = path_join(directory, "/usr");
                        if (!usr_subtree)
                                return log_oom();

                        dirs[i++] = usr_subtree;
                }

                dirs[i] = NULL;

                r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
                if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
                        /* This might fail because the kernel or file system doesn't support idmapping. We
                         * can't really distinguish this nicely, nor do we have any guarantees about the
                         * error codes we see, could be EOPNOTSUPP or EINVAL. */
                        if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
                                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                                       "ID mapped mounts are apparently not available, sorry.");

                        log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
                        arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
                } else if (r < 0)
                        return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
                else {
                        log_debug("ID mapped mounts available, making use of them.");
                        idmap = true;
                }
        }

        if (dissected_image) {
                /* Now we know the uid shift, let's now mount everything else that might be in the image. */
                r = dissected_image_mount_and_warn(
                                dissected_image,
                                directory,
                                arg_uid_shift,
                                arg_uid_range,
                                /* userns_fd= */ -EBADF,
                                determine_dissect_image_flags()|
                                DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
                                (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
                if (r < 0)
                        return r;
        }

        if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
                /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */

                r = detect_unified_cgroup_hierarchy_from_image(directory);
                if (r < 0)
                        return r;

                l = send(fd_outer_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
                if (l < 0)
                        return log_error_errno(errno, "Failed to send cgroup mode: %m");
                if (l != sizeof(arg_unified_cgroup_hierarchy))
                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                               "Short write while sending cgroup mode.");
        }

        r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
        if (r < 0)
                return r;

        r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
        if (r < 0)
                return r;

        /* Only make the tree read-only here if no volatile mode is in effect and no custom mount
         * replaces the root. */
        if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
            !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
                r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to make tree read-only: %m");
        }

        r = mount_all(directory,
                      arg_mount_settings,
                      arg_uid_shift,
                      arg_selinux_apifs_context);
        if (r < 0)
                return r;

        r = copy_devnodes(directory);
        if (r < 0)
                return r;

        r = make_extra_nodes(directory);
        if (r < 0)
                return r;

        /* The following two steps are best effort, their results are deliberately ignored. */
        (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);

        p = prefix_roota(directory, "/run/host");
        (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);

        r = setup_unix_export_host_inside(directory, unix_export_path);
        if (r < 0)
                return r;

        r = setup_pts(directory);
        if (r < 0)
                return r;

        r = mount_tunnel_dig(directory);
        if (r < 0)
                return r;

        r = setup_keyring();
        if (r < 0)
                return r;

        r = setup_credentials(directory);
        if (r < 0)
                return r;

        r = bind_user_setup(bind_user_context, directory);
        if (r < 0)
                return r;

        /* Second pass over the custom mounts: everything not handled by the MOUNT_ROOT_ONLY pass above. */
        r = mount_custom(
                        directory,
                        arg_custom_mounts,
                        arg_n_custom_mounts,
                        arg_uid_shift,
                        arg_uid_range,
                        arg_selinux_apifs_context,
                        MOUNT_NON_ROOT_ONLY);
        if (r < 0)
                return r;

        r = setup_timezone(directory);
        if (r < 0)
                return r;

        r = setup_resolv_conf(directory);
        if (r < 0)
                return r;

        r = setup_machine_id(directory);
        if (r < 0)
                return r;

        r = setup_journal(directory);
        if (r < 0)
                return r;

        /* The same stuff as the $container env var, but nicely readable for the entire payload */
        p = prefix_roota(directory, "/run/host/container-manager");
        (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444);

        /* The same stuff as the $container_uuid env var */
        p = prefix_roota(directory, "/run/host/container-uuid");
        (void) write_string_filef(p, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_MODE_0444, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));

        if (!arg_use_cgns) {
                r = mount_cgroups(
                                directory,
                                arg_unified_cgroup_hierarchy,
                                arg_userns_mode != USER_NAMESPACE_NO,
                                arg_uid_shift,
                                arg_uid_range,
                                arg_selinux_apifs_context,
                                false);
                if (r < 0)
                        return r;
        }

        /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll
         * now switch into the target directory, and then do the final setup from there. If a user namespace
         * is then allocated for the container, the root mount and everything else will be out of reach for
         * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and
         * procfs then anymore, since that only works if there's an unobstructed instance currently
         * visible. Hence there we do it the other way round: we first allocate a new set of namespaces
         * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */

        if (arg_privileged) {
                /* Mark everything as shared so our mounts get propagated down. This is required to make new
                 * bind mounts available in systemd services inside the container that create a new mount
                 * namespace. See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
                 * /dev/) done after this will inherit the shared propagation mode.
                 *
                 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
                 * directory mount to root later on.
                 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
                 */
                r = mount_switch_root(directory, MS_SHARED);
                if (r < 0)
                        return log_error_errno(r, "Failed to move root directory: %m");

                /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
                 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
                 * the container. */
                r = mount_tunnel_open();
                if (r < 0)
                        return r;

                if (arg_userns_mode != USER_NAMESPACE_NO) {
                        /* In order to mount procfs and sysfs in an unprivileged container the kernel
                         * requires that a fully visible instance is already present in the target mount
                         * namespace. Mount one here so the inner child can mount its own instances. Later
                         * we umount the temporary instances created here before we actually exec the
                         * payload. Since the rootfs is shared the umount will propagate into the container.
                         * Note, the inner child wouldn't be able to unmount the instances on its own since
                         * it doesn't own the originating mount namespace. IOW, the outer child needs to do
                         * this. */
                        r = pin_fully_visible_fs();
                        if (r < 0)
                                return r;
                }

                /* The root was already switched above, hence no directory prefix is passed here. */
                fd = setup_notify_child(NULL);
        } else
                fd = setup_notify_child(directory);
        if (fd < 0)
                return fd;

        /* Fork off the inner child. A new network namespace is only requested here in the unprivileged
         * case with private networking enabled. */
        pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                        arg_clone_ns_flags |
                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
                        ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
        if (pid < 0)
                return log_error_errno(errno, "Failed to fork inner child: %m");
        if (pid == 0) {
                /* We are the inner child now; the socket to the outer parent is not used from here on. */
                fd_outer_socket = safe_close(fd_outer_socket);

                /* The inner child has all namespaces that are requested, so that we all are owned by the
                 * user if user namespaces are turned on. */

                if (arg_network_namespace_path) {
                        r = namespace_enter(/* pidns_fd = */ -EBADF,
                                            /* mntns_fd = */ -EBADF,
                                            netns_fd,
                                            /* userns_fd = */ -EBADF,
                                            /* root_fd = */ -EBADF);
                        if (r < 0)
                                return log_error_errno(r, "Failed to join network namespace: %m");
                }

                if (!arg_privileged) {
                        /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
                         * inside the inner namespaces, but before we switch root. Hence do so here. */
                        _cleanup_free_ char *j = path_join(directory, "/proc");
                        if (!j)
                                return log_oom();

                        r = mount_follow_verbose(LOG_ERR, "proc", j, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
                        if (r < 0)
                                return r;

                        r = mount_sysfs(directory, arg_mount_settings);
                        if (r < 0)
                                return r;

                        r = mount_switch_root(directory, MS_SHARED);
                        if (r < 0)
                                return log_error_errno(r, "Failed to move root directory: %m");
                }

                /* From here on the inner child never returns into our caller: it always terminates via
                 * _exit(), with the exit status reflecting inner_child()'s result. */
                r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
                if (r < 0)
                        _exit(EXIT_FAILURE);

                _exit(EXIT_SUCCESS);
        }

        /* Still the outer child: report the inner child's PID, the machine ID and the notify socket fd to
         * the parent, in this order. */
        l = send(fd_outer_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
        if (l < 0)
                return log_error_errno(errno, "Failed to send PID: %m");
        if (l != sizeof(pid))
                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short write while sending PID.");

        l = send(fd_outer_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
        if (l < 0)
                return log_error_errno(errno, "Failed to send machine ID: %m");
        if (l != sizeof(arg_uuid))
                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short write while sending machine ID.");

        l = send_one_fd(fd_outer_socket, fd, 0);
        if (l < 0)
                return log_error_errno(l, "Failed to send notify fd: %m");

        /* Our job is done, release the fds that were passed in. */
        fd_outer_socket = safe_close(fd_outer_socket);
        fd_inner_socket = safe_close(fd_inner_socket);
        netns_fd = safe_close(netns_fd);

        return 0;
}
4304
4305 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
4306 bool tried_hashed = false;
4307 unsigned n_tries = 100;
4308 uid_t candidate;
4309 int r;
4310
4311 assert(shift);
4312 assert(ret_lock_file);
4313 assert(arg_userns_mode == USER_NAMESPACE_PICK);
4314 assert(arg_uid_range == 0x10000U);
4315
4316 candidate = *shift;
4317
4318 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4319
4320 for (;;) {
4321 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
4322 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
4323
4324 if (--n_tries <= 0)
4325 return -EBUSY;
4326
4327 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
4328 goto next;
4329 if ((candidate & UINT32_C(0xFFFF)) != 0)
4330 goto next;
4331
4332 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
4333 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
4334 if (r == -EBUSY) /* Range already taken by another nspawn instance */
4335 goto next;
4336 if (r < 0)
4337 return r;
4338
4339 /* Make some superficial checks whether the range is currently known in the user database */
4340 if (getpwuid_malloc(candidate, /* ret= */ NULL) >= 0)
4341 goto next;
4342 if (getpwuid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0)
4343 goto next;
4344 if (getgrgid_malloc(candidate, /* ret= */ NULL) >= 0)
4345 goto next;
4346 if (getgrgid_malloc(candidate + UINT32_C(0xFFFE), /* ret= */ NULL) >= 0)
4347 goto next;
4348
4349 *ret_lock_file = lf;
4350 lf = (struct LockFile) LOCK_FILE_INIT;
4351 *shift = candidate;
4352 return 0;
4353
4354 next:
4355 if (arg_machine && !tried_hashed) {
4356 /* Try to hash the base from the container name */
4357
4358 static const uint8_t hash_key[] = {
4359 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4360 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4361 };
4362
4363 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
4364
4365 tried_hashed = true;
4366 } else
4367 random_bytes(&candidate, sizeof(candidate));
4368
4369 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
4370 candidate &= (uid_t) UINT32_C(0xFFFF0000);
4371 }
4372 }
4373
4374 static int add_one_uid_map(
4375 char **p,
4376 uid_t container_uid,
4377 uid_t host_uid,
4378 uid_t range) {
4379
4380 return strextendf(p,
4381 UID_FMT " " UID_FMT " " UID_FMT "\n",
4382 container_uid, host_uid, range);
4383 }
4384
4385 static int make_uid_map_string(
4386 const uid_t bind_user_uid[],
4387 size_t n_bind_user_uid,
4388 size_t offset,
4389 char **ret) {
4390
4391 _cleanup_free_ char *s = NULL;
4392 uid_t previous_uid = 0;
4393 int r;
4394
4395 assert(n_bind_user_uid == 0 || bind_user_uid);
4396 assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4397 assert(ret);
4398
4399 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4400 * quadruplet, consisting of host and container UID + GID. */
4401
4402 for (size_t i = 0; i < n_bind_user_uid; i++) {
4403 uid_t payload_uid = bind_user_uid[i*4+offset],
4404 host_uid = bind_user_uid[i*4+offset+1];
4405
4406 assert(previous_uid <= payload_uid);
4407 assert(payload_uid < arg_uid_range);
4408
4409 /* Add a range to close the gap to previous entry */
4410 if (payload_uid > previous_uid) {
4411 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4412 if (r < 0)
4413 return r;
4414 }
4415
4416 /* Map this specific user */
4417 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4418 if (r < 0)
4419 return r;
4420
4421 previous_uid = payload_uid + 1;
4422 }
4423
4424 /* And add a range to close the gap to finish the range */
4425 if (arg_uid_range > previous_uid) {
4426 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4427 if (r < 0)
4428 return r;
4429 }
4430
4431 assert(s);
4432
4433 *ret = TAKE_PTR(s);
4434 return 0;
4435 }
4436
4437 static int setup_uid_map(
4438 pid_t pid,
4439 const uid_t bind_user_uid[],
4440 size_t n_bind_user_uid) {
4441
4442 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4443 _cleanup_free_ char *s = NULL;
4444 int r;
4445
4446 assert(pid > 1);
4447
4448 /* Build the UID map string */
4449 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4450 return log_oom();
4451
4452 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4453 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4454 if (r < 0)
4455 return log_error_errno(r, "Failed to write UID map: %m");
4456
4457 /* And now build the GID map string */
4458 s = mfree(s);
4459 if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4460 return log_oom();
4461
4462 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4463 r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4464 if (r < 0)
4465 return log_error_errno(r, "Failed to write GID map: %m");
4466
4467 return 0;
4468 }
4469
4470 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
4471 char buf[NOTIFY_BUFFER_MAX+1];
4472 char *p = NULL;
4473 struct iovec iovec = {
4474 .iov_base = buf,
4475 .iov_len = sizeof(buf)-1,
4476 };
4477 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
4478 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
4479 struct msghdr msghdr = {
4480 .msg_iov = &iovec,
4481 .msg_iovlen = 1,
4482 .msg_control = &control,
4483 .msg_controllen = sizeof(control),
4484 };
4485 struct ucred *ucred;
4486 ssize_t n;
4487 pid_t inner_child_pid;
4488 _cleanup_strv_free_ char **tags = NULL;
4489 int r;
4490
4491 assert(userdata);
4492
4493 inner_child_pid = PTR_TO_PID(userdata);
4494
4495 if (revents != EPOLLIN) {
4496 log_warning("Got unexpected poll event for notify fd.");
4497 return 0;
4498 }
4499
4500 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
4501 if (ERRNO_IS_NEG_TRANSIENT(n))
4502 return 0;
4503 else if (n == -EXFULL) {
4504 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4505 return 0;
4506 } else if (n < 0)
4507 return log_warning_errno(n, "Couldn't read notification socket: %m");
4508
4509 cmsg_close_all(&msghdr);
4510
4511 ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
4512 if (!ucred || ucred->pid != inner_child_pid) {
4513 log_debug("Received notify message without valid credentials. Ignoring.");
4514 return 0;
4515 }
4516
4517 if ((size_t) n >= sizeof(buf)) {
4518 log_warning("Received notify message exceeded maximum size. Ignoring.");
4519 return 0;
4520 }
4521
4522 buf[n] = 0;
4523 tags = strv_split(buf, "\n\r");
4524 if (!tags)
4525 return log_oom();
4526
4527 if (DEBUG_LOGGING) {
4528 _cleanup_free_ char *joined = strv_join(tags, " ");
4529
4530 if (joined) {
4531 _cleanup_free_ char *j = cescape(joined);
4532 free_and_replace(joined, j);
4533 }
4534
4535 log_debug("Got sd_notify() message: %s", strnull(joined));
4536 }
4537
4538 if (strv_contains(tags, "READY=1")) {
4539 r = sd_notify(false, "READY=1\n");
4540 if (r < 0)
4541 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
4542 }
4543
4544 p = strv_find_startswith(tags, "STATUS=");
4545 if (p)
4546 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
4547
4548 return 0;
4549 }
4550
4551 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
4552 int r;
4553
4554 if (fd < 0)
4555 return 0;
4556
4557 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
4558 if (r < 0)
4559 return log_error_errno(r, "Failed to allocate notify event source: %m");
4560
4561 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
4562
4563 return 0;
4564 }
4565
4566 static void set_window_title(PTYForward *f) {
4567 _cleanup_free_ char *hn = NULL, *dot = NULL;
4568
4569 assert(f);
4570
4571 (void) gethostname_strict(&hn);
4572
4573 if (emoji_enabled())
4574 dot = strjoin(special_glyph(SPECIAL_GLYPH_BLUE_CIRCLE), " ");
4575
4576 if (hn)
4577 (void) pty_forward_set_titlef(f, "%sContainer %s on %s", strempty(dot), arg_machine, hn);
4578 else
4579 (void) pty_forward_set_titlef(f, "%sContainer %s", strempty(dot), arg_machine);
4580
4581 if (dot)
4582 (void) pty_forward_set_title_prefix(f, dot);
4583 }
4584
4585 static int merge_settings(Settings *settings, const char *path) {
4586 int rl;
4587
4588 assert(settings);
4589 assert(path);
4590
4591 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4592 * that this steals the fields of the Settings* structure, and hence modifies it. */
4593
4594 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
4595 settings->start_mode >= 0) {
4596 arg_start_mode = settings->start_mode;
4597 strv_free_and_replace(arg_parameters, settings->parameters);
4598 }
4599
4600 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
4601 settings->ephemeral >= 0)
4602 arg_ephemeral = settings->ephemeral;
4603
4604 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
4605 settings->root) {
4606
4607 if (!arg_settings_trusted)
4608 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
4609 else
4610 free_and_replace(arg_directory, settings->root);
4611 }
4612
4613 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
4614 settings->pivot_root_new) {
4615 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
4616 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
4617 }
4618
4619 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
4620 settings->working_directory)
4621 free_and_replace(arg_chdir, settings->working_directory);
4622
4623 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4624 settings->environment)
4625 strv_free_and_replace(arg_setenv, settings->environment);
4626
4627 if ((arg_settings_mask & SETTING_USER) == 0) {
4628
4629 if (settings->user)
4630 free_and_replace(arg_user, settings->user);
4631
4632 if (uid_is_valid(settings->uid))
4633 arg_uid = settings->uid;
4634 if (gid_is_valid(settings->gid))
4635 arg_gid = settings->gid;
4636 if (settings->n_supplementary_gids > 0) {
4637 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
4638 arg_n_supplementary_gids = settings->n_supplementary_gids;
4639 }
4640 }
4641
4642 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4643 uint64_t plus, minus;
4644 uint64_t network_minus = 0;
4645 uint64_t ambient;
4646
4647 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4648 * Settings structure */
4649
4650 plus = settings->capability;
4651 minus = settings->drop_capability;
4652
4653 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4654 settings_network_configured(settings)) {
4655 if (settings_private_network(settings))
4656 plus |= UINT64_C(1) << CAP_NET_ADMIN;
4657 else
4658 network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
4659 }
4660
4661 if (!arg_settings_trusted && plus != 0) {
4662 if (settings->capability != 0)
4663 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
4664 } else {
4665 arg_caps_retain &= ~network_minus;
4666 arg_caps_retain |= plus;
4667 }
4668
4669 arg_caps_retain &= ~minus;
4670
4671 /* Copy the full capabilities over too */
4672 if (capability_quintet_is_set(&settings->full_capabilities)) {
4673 if (!arg_settings_trusted)
4674 log_warning("Ignoring capability settings, file %s is not trusted.", path);
4675 else
4676 arg_full_capabilities = settings->full_capabilities;
4677 }
4678
4679 ambient = settings->ambient_capability;
4680 if (!arg_settings_trusted && ambient != 0)
4681 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
4682 else
4683 arg_caps_ambient |= ambient;
4684 }
4685
4686 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4687 settings->kill_signal > 0)
4688 arg_kill_signal = settings->kill_signal;
4689
4690 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4691 settings->personality != PERSONALITY_INVALID)
4692 arg_personality = settings->personality;
4693
4694 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4695 !sd_id128_is_null(settings->machine_id)) {
4696
4697 if (!arg_settings_trusted)
4698 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
4699 else
4700 arg_uuid = settings->machine_id;
4701 }
4702
4703 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4704 settings->read_only >= 0)
4705 arg_read_only = settings->read_only;
4706
4707 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4708 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4709 arg_volatile_mode = settings->volatile_mode;
4710
4711 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4712 settings->n_custom_mounts > 0) {
4713
4714 if (!arg_settings_trusted)
4715 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
4716 else {
4717 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4718 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
4719 arg_n_custom_mounts = settings->n_custom_mounts;
4720 settings->n_custom_mounts = 0;
4721 }
4722 }
4723
4724 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4725 settings_network_configured(settings)) {
4726
4727 if (!arg_settings_trusted)
4728 log_warning("Ignoring network settings, file %s is not trusted.", path);
4729 else {
4730 arg_network_veth = settings_network_veth(settings);
4731 arg_private_network = settings_private_network(settings);
4732
4733 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
4734 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
4735 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
4736 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
4737
4738 free_and_replace(arg_network_bridge, settings->network_bridge);
4739 free_and_replace(arg_network_zone, settings->network_zone);
4740
4741 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
4742 }
4743 }
4744
4745 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4746 settings->expose_ports) {
4747
4748 if (!arg_settings_trusted)
4749 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
4750 else {
4751 expose_port_free_all(arg_expose_ports);
4752 arg_expose_ports = TAKE_PTR(settings->expose_ports);
4753 }
4754 }
4755
4756 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
4757 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
4758
4759 if (!arg_settings_trusted)
4760 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
4761 else {
4762 arg_userns_mode = settings->userns_mode;
4763 arg_uid_shift = settings->uid_shift;
4764 arg_uid_range = settings->uid_range;
4765 arg_userns_ownership = settings->userns_ownership;
4766 }
4767 }
4768
4769 if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
4770 !strv_isempty(settings->bind_user))
4771 strv_free_and_replace(arg_bind_user, settings->bind_user);
4772
4773 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
4774 settings->notify_ready >= 0)
4775 arg_notify_ready = settings->notify_ready;
4776
4777 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
4778
4779 if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
4780 if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
4781 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
4782 else {
4783 strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
4784 strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
4785 }
4786 }
4787
4788 #if HAVE_SECCOMP
4789 if (settings->seccomp) {
4790 if (!arg_settings_trusted)
4791 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
4792 else {
4793 seccomp_release(arg_seccomp);
4794 arg_seccomp = TAKE_PTR(settings->seccomp);
4795 }
4796 }
4797 #endif
4798 }
4799
4800 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4801 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
4802 continue;
4803
4804 if (!settings->rlimit[rl])
4805 continue;
4806
4807 if (!arg_settings_trusted) {
4808 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
4809 continue;
4810 }
4811
4812 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
4813 }
4814
4815 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
4816 settings->hostname)
4817 free_and_replace(arg_hostname, settings->hostname);
4818
4819 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
4820 settings->no_new_privileges >= 0)
4821 arg_no_new_privileges = settings->no_new_privileges;
4822
4823 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
4824 settings->oom_score_adjust_set) {
4825
4826 if (!arg_settings_trusted)
4827 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
4828 else {
4829 arg_oom_score_adjust = settings->oom_score_adjust;
4830 arg_oom_score_adjust_set = true;
4831 }
4832 }
4833
4834 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
4835 settings->cpu_set.set) {
4836
4837 if (!arg_settings_trusted)
4838 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
4839 else {
4840 cpu_set_reset(&arg_cpu_set);
4841 arg_cpu_set = TAKE_STRUCT(settings->cpu_set);
4842 }
4843 }
4844
4845 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
4846 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
4847 arg_resolv_conf = settings->resolv_conf;
4848
4849 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
4850 settings->link_journal != _LINK_JOURNAL_INVALID) {
4851
4852 if (!arg_settings_trusted)
4853 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
4854 else {
4855 arg_link_journal = settings->link_journal;
4856 arg_link_journal_try = settings->link_journal_try;
4857 }
4858 }
4859
4860 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
4861 settings->timezone != _TIMEZONE_MODE_INVALID)
4862 arg_timezone = settings->timezone;
4863
4864 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
4865 settings->slice) {
4866
4867 if (!arg_settings_trusted)
4868 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
4869 else
4870 free_and_replace(arg_slice, settings->slice);
4871 }
4872
4873 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4874 settings->use_cgns >= 0) {
4875
4876 if (!arg_settings_trusted)
4877 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4878 else
4879 arg_use_cgns = settings->use_cgns;
4880 }
4881
4882 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4883 settings->clone_ns_flags != ULONG_MAX) {
4884
4885 if (!arg_settings_trusted)
4886 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4887 else
4888 arg_clone_ns_flags = settings->clone_ns_flags;
4889 }
4890
4891 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4892 settings->console_mode >= 0) {
4893
4894 if (!arg_settings_trusted)
4895 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4896 else
4897 arg_console_mode = settings->console_mode;
4898 }
4899
4900 if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
4901 settings->suppress_sync >= 0)
4902 arg_suppress_sync = settings->suppress_sync;
4903
4904 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4905 * don't consult arg_settings_mask for them. */
4906
4907 sd_bus_message_unref(arg_property_message);
4908 arg_property_message = TAKE_PTR(settings->properties);
4909
4910 arg_console_width = settings->console_width;
4911 arg_console_height = settings->console_height;
4912
4913 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4914 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4915 arg_n_extra_nodes = settings->n_extra_nodes;
4916 settings->n_extra_nodes = 0;
4917
4918 return 0;
4919 }
4920
4921 static int load_settings(void) {
4922 _cleanup_(settings_freep) Settings *settings = NULL;
4923 _cleanup_fclose_ FILE *f = NULL;
4924 _cleanup_free_ char *p = NULL;
4925 int r;
4926
4927 if (arg_oci_bundle)
4928 return 0;
4929
4930 /* If all settings are masked, there's no point in looking for
4931 * the settings file */
4932 if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4933 return 0;
4934
4935 /* We first look in the admin's directories in /etc and /run */
4936 if (arg_privileged) {
4937 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4938 _cleanup_free_ char *j = NULL;
4939
4940 j = path_join(i, arg_settings_filename);
4941 if (!j)
4942 return log_oom();
4943
4944 f = fopen(j, "re");
4945 if (f) {
4946 p = TAKE_PTR(j);
4947
4948 /* By default, we trust configuration from /etc and /run */
4949 if (arg_settings_trusted < 0)
4950 arg_settings_trusted = true;
4951
4952 break;
4953 }
4954
4955 if (errno != ENOENT)
4956 return log_error_errno(errno, "Failed to open %s: %m", j);
4957 }
4958 }
4959
4960 if (!f) {
4961 /* After that, let's look for a file next to the
4962 * actual image we shall boot. */
4963
4964 if (arg_image) {
4965 r = file_in_same_dir(arg_image, arg_settings_filename, &p);
4966 if (r < 0)
4967 return log_error_errno(r, "Failed to generate settings path from image path: %m");
4968 } else if (arg_directory) {
4969 r = file_in_same_dir(arg_directory, arg_settings_filename, &p);
4970 if (r < 0 && r != -EADDRNOTAVAIL) /* if directory is root fs, don't complain */
4971 return log_error_errno(r, "Failed to generate settings path from directory path: %m");
4972 }
4973
4974 if (p) {
4975 f = fopen(p, "re");
4976 if (!f && errno != ENOENT)
4977 return log_error_errno(errno, "Failed to open %s: %m", p);
4978
4979 /* By default, we do not trust configuration from /var/lib/machines */
4980 if (arg_settings_trusted < 0)
4981 arg_settings_trusted = false;
4982 }
4983 }
4984
4985 if (!f)
4986 return 0;
4987
4988 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4989
4990 r = settings_load(f, p, &settings);
4991 if (r < 0)
4992 return r;
4993
4994 return merge_settings(settings, p);
4995 }
4996
4997 static int load_oci_bundle(void) {
4998 _cleanup_(settings_freep) Settings *settings = NULL;
4999 int r;
5000
5001 if (!arg_oci_bundle)
5002 return 0;
5003
5004 /* By default let's trust OCI bundles */
5005 if (arg_settings_trusted < 0)
5006 arg_settings_trusted = true;
5007
5008 r = oci_load(NULL, arg_oci_bundle, &settings);
5009 if (r < 0)
5010 return r;
5011
5012 return merge_settings(settings, arg_oci_bundle);
5013 }
5014
5015 static int run_container(
5016 DissectedImage *dissected_image,
5017 int userns_fd,
5018 FDSet *fds,
5019 char veth_name[IFNAMSIZ],
5020 bool *veth_created,
5021 struct ExposeArgs *expose_args,
5022 int *master,
5023 pid_t *pid,
5024 int *ret) {
5025
5026 static const struct sigaction sa = {
5027 .sa_handler = nop_signal_handler,
5028 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
5029 };
5030
5031 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
5032 _cleanup_close_ int etc_passwd_lock = -EBADF;
5033 _cleanup_close_pair_ int
5034 fd_inner_socket_pair[2] = EBADF_PAIR,
5035 fd_outer_socket_pair[2] = EBADF_PAIR;
5036
5037 _cleanup_close_ int notify_socket = -EBADF, mntns_fd = -EBADF, fd_kmsg_fifo = -EBADF;
5038 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5039 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
5040 _cleanup_(umount_and_rmdir_and_freep) char *unix_export_host_dir = NULL;
5041 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
5042 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
5043 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
5044 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
5045 _cleanup_free_ uid_t *bind_user_uid = NULL;
5046 size_t n_bind_user_uid = 0;
5047 ContainerStatus container_status = 0;
5048 int ifi = 0, r;
5049 ssize_t l;
5050 sigset_t mask_chld;
5051 _cleanup_close_ int child_netns_fd = -EBADF;
5052
5053 assert_se(sigemptyset(&mask_chld) == 0);
5054 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
5055
5056 /* Set up the unix export host directory on the host first */
5057 r = setup_unix_export_dir_outside(&unix_export_host_dir);
5058 if (r < 0)
5059 return r;
5060
5061 if (arg_userns_mode == USER_NAMESPACE_PICK) {
5062 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
5063 * check with getpwuid() if the specific user already exists. Note that /etc might be
5064 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
5065 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
5066 * really just an extra safety net. We kinda assume that the UID range we allocate from is
5067 * really ours. */
5068
5069 etc_passwd_lock = take_etc_passwd_lock(NULL);
5070 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
5071 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
5072 }
5073
5074 r = barrier_create(&barrier);
5075 if (r < 0)
5076 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
5077
5078 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_inner_socket_pair) < 0)
5079 return log_error_errno(errno, "Failed to create inner socket pair: %m");
5080
5081 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_outer_socket_pair) < 0)
5082 return log_error_errno(errno, "Failed to create outer socket pair: %m");
5083
5084 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
5085 * parent's blocking calls and give it a chance to call wait() and terminate. */
5086 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
5087 if (r < 0)
5088 return log_error_errno(errno, "Failed to change the signal mask: %m");
5089
5090 r = sigaction(SIGCHLD, &sa, NULL);
5091 if (r < 0)
5092 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
5093
5094 if (arg_network_namespace_path) {
5095 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
5096 if (child_netns_fd < 0)
5097 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
5098
5099 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
5100 if (r == -EUCLEAN)
5101 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
5102 else if (r < 0)
5103 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
5104 else if (r == 0)
5105 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5106 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
5107 }
5108
5109 if (arg_privileged) {
5110 assert(userns_fd < 0);
5111
5112 /* If we have no user namespace then we'll clone and create a new mount namespace right-away. */
5113
5114 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
5115 if (*pid < 0)
5116 return log_error_errno(errno, "clone() failed%s: %m",
5117 errno == EINVAL ?
5118 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
5119 } else {
5120 assert(userns_fd >= 0);
5121
5122 /* If we have a user namespace then we'll clone() first, and then join the user namespace,
5123 * and then open the mount namespace, so that it is owned by the user namespace */
5124
5125 *pid = raw_clone(SIGCHLD);
5126 if (*pid < 0)
5127 return log_error_errno(errno, "clone() failed: %m");
5128
5129 if (*pid == 0) {
5130 if (setns(userns_fd, CLONE_NEWUSER) < 0) {
5131 log_error_errno(errno, "Failed to join allocate user namespace: %m");
5132 _exit(EXIT_FAILURE);
5133 }
5134
5135 r = reset_uid_gid();
5136 if (r < 0) {
5137 log_error_errno(r, "Failed to reset UID/GID to root: %m");
5138 _exit(EXIT_FAILURE);
5139 }
5140
5141 if (unshare(CLONE_NEWNS) < 0) {
5142 log_error_errno(errno, "Failed to unshare file system namespace: %m");
5143 _exit(EXIT_FAILURE);
5144 }
5145 }
5146 }
5147
5148 if (*pid == 0) {
5149 /* The outer child only has a file system namespace. */
5150 barrier_set_role(&barrier, BARRIER_CHILD);
5151
5152 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5153 fd_outer_socket_pair[0] = safe_close(fd_outer_socket_pair[0]);
5154
5155 (void) reset_all_signal_handlers();
5156 (void) reset_signal_mask();
5157
5158 r = outer_child(&barrier,
5159 arg_directory,
5160 dissected_image,
5161 fd_outer_socket_pair[1],
5162 fd_inner_socket_pair[1],
5163 fds,
5164 child_netns_fd,
5165 unix_export_host_dir);
5166 if (r < 0)
5167 _exit(EXIT_FAILURE);
5168
5169 _exit(EXIT_SUCCESS);
5170 }
5171
5172 barrier_set_role(&barrier, BARRIER_PARENT);
5173
5174 fdset_close(fds);
5175
5176 fd_inner_socket_pair[1] = safe_close(fd_inner_socket_pair[1]);
5177 fd_outer_socket_pair[1] = safe_close(fd_outer_socket_pair[1]);
5178
5179 if (arg_userns_mode != USER_NAMESPACE_NO) {
5180 mntns_fd = receive_one_fd(fd_outer_socket_pair[0], 0);
5181 if (mntns_fd < 0)
5182 return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
5183
5184 /* The child just let us know the UID shift it might have read from the image. */
5185 l = recv(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
5186 if (l < 0)
5187 return log_error_errno(errno, "Failed to read UID shift: %m");
5188 if (l != sizeof arg_uid_shift)
5189 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
5190
5191 if (arg_userns_mode == USER_NAMESPACE_PICK) {
5192 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
5193 * image, but if that's already in use, pick a new one, and report back to the child,
5194 * which one we now picked. */
5195
5196 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
5197 if (r < 0)
5198 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
5199
5200 l = send(fd_outer_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
5201 if (l < 0)
5202 return log_error_errno(errno, "Failed to send UID shift: %m");
5203 if (l != sizeof arg_uid_shift)
5204 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
5205 }
5206
5207 n_bind_user_uid = strv_length(arg_bind_user);
5208 if (n_bind_user_uid > 0) {
5209 /* Right after the UID shift, we'll receive the list of UID mappings for the
5210 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
5211
5212 bind_user_uid = new(uid_t, n_bind_user_uid*4);
5213 if (!bind_user_uid)
5214 return log_oom();
5215
5216 for (size_t i = 0; i < n_bind_user_uid; i++) {
5217 l = recv(fd_outer_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
5218 if (l < 0)
5219 return log_error_errno(errno, "Failed to read user UID map pair: %m");
5220 if (l != sizeof(uid_t)*4)
5221 return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
5222 SYNTHETIC_ERRNO(EIO),
5223 "Short read while reading bind user UID pairs.");
5224 }
5225 }
5226 }
5227
5228 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
5229 /* The child let us know the support cgroup mode it might have read from the image. */
5230 l = recv(fd_outer_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
5231 if (l < 0)
5232 return log_error_errno(errno, "Failed to read cgroup mode: %m");
5233 if (l != sizeof(arg_unified_cgroup_hierarchy))
5234 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zi bytes).%s",
5235 l, l == 0 ? " The child is most likely dead." : "");
5236 }
5237
5238 /* Wait for the outer child. */
5239 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
5240 if (r < 0)
5241 return r;
5242 if (r != EXIT_SUCCESS)
5243 return -EIO;
5244
5245 /* And now retrieve the PID of the inner child. */
5246 l = recv(fd_outer_socket_pair[0], pid, sizeof *pid, 0);
5247 if (l < 0)
5248 return log_error_errno(errno, "Failed to read inner child PID: %m");
5249 if (l != sizeof *pid)
5250 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
5251
5252 /* We also retrieve container UUID in case it was generated by outer child */
5253 l = recv(fd_outer_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
5254 if (l < 0)
5255 return log_error_errno(errno, "Failed to read container machine ID: %m");
5256 if (l != sizeof(arg_uuid))
5257 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
5258
5259 /* We also retrieve the socket used for notifications generated by outer child */
5260 notify_socket = receive_one_fd(fd_outer_socket_pair[0], 0);
5261 if (notify_socket < 0)
5262 return log_error_errno(notify_socket,
5263 "Failed to receive notification socket from the outer child: %m");
5264
5265 log_debug("Init process invoked as PID "PID_FMT, *pid);
5266
5267 if (arg_userns_mode != USER_NAMESPACE_NO) {
5268 if (!barrier_place_and_sync(&barrier)) /* #1 */
5269 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5270
5271 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
5272 if (r < 0)
5273 return r;
5274
5275 (void) barrier_place(&barrier); /* #2 */
5276 }
5277
5278 if (arg_private_network) {
5279 if (!arg_network_namespace_path) {
5280 /* Wait until the child has unshared its network namespace. */
5281 if (!barrier_place_and_sync(&barrier)) /* #3 */
5282 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
5283
5284 /* Make sure we have an open file descriptor to the child's network namespace so it
5285 * stays alive even if the child exits. */
5286 assert(child_netns_fd < 0);
5287 child_netns_fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5288 if (child_netns_fd < 0)
5289 return log_error_errno(r, "Failed to receive child network namespace: %m");
5290 }
5291
5292 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
5293 if (r < 0)
5294 return r;
5295
5296 if (arg_network_veth) {
5297 if (arg_privileged) {
5298 r = setup_veth(arg_machine, *pid, veth_name,
5299 arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
5300 if (r < 0)
5301 return r;
5302 else if (r > 0)
5303 ifi = r;
5304 } else {
5305 _cleanup_free_ char *host_ifname = NULL;
5306
5307 r = nsresource_add_netif(userns_fd, child_netns_fd, /* namespace_ifname= */ NULL, &host_ifname, /* ret_namespace_ifname= */ NULL);
5308 if (r < 0)
5309 return log_error_errno(r, "Failed to add network interface to container: %m");
5310
5311 ifi = if_nametoindex(host_ifname);
5312 if (ifi == 0)
5313 return log_error_errno(errno, "Failed to resolve interface '%s': %m", host_ifname);
5314
5315 if (strlen(host_ifname) >= IFNAMSIZ)
5316 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Host interface name too long?");
5317
5318 strcpy(veth_name, host_ifname);
5319 }
5320
5321 if (arg_network_bridge) {
5322 /* Add the interface to a bridge */
5323 r = setup_bridge(veth_name, arg_network_bridge, false);
5324 if (r < 0)
5325 return r;
5326 if (r > 0)
5327 ifi = r;
5328 } else if (arg_network_zone) {
5329 /* Add the interface to a bridge, possibly creating it */
5330 r = setup_bridge(veth_name, arg_network_zone, true);
5331 if (r < 0)
5332 return r;
5333 if (r > 0)
5334 ifi = r;
5335 }
5336 }
5337
5338 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5339 if (r < 0)
5340 return r;
5341
5342 /* We created the primary and extra veth links now; let's remember this, so that we know to
5343 remove them later on. Note that we don't bother with removing veth links that were created
5344 here when their setup failed half-way, because in that case the kernel should be able to
5345 remove them on its own, since they cannot be referenced by anything yet. */
5346 *veth_created = true;
5347
5348 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5349 if (r < 0)
5350 return r;
5351
5352 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5353 if (r < 0)
5354 return r;
5355 }
5356
5357 if (arg_register || !arg_keep_unit) {
5358 if (arg_privileged)
5359 r = sd_bus_default_system(&bus);
5360 else
5361 r = sd_bus_default_user(&bus);
5362 if (r < 0)
5363 return log_error_errno(r, "Failed to open bus: %m");
5364
5365 r = sd_bus_set_close_on_exit(bus, false);
5366 if (r < 0)
5367 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5368 }
5369
5370 if (!arg_keep_unit) {
5371 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5372 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5373 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5374
5375 r = sd_bus_match_signal_async(
5376 bus,
5377 NULL,
5378 "org.freedesktop.systemd1",
5379 NULL,
5380 "org.freedesktop.systemd1.Scope",
5381 "RequestStop",
5382 on_request_stop, NULL, PID_TO_PTR(*pid));
5383 if (r < 0)
5384 return log_error_errno(r, "Failed to request RequestStop match: %m");
5385 }
5386
5387 if (arg_register) {
5388 r = register_machine(
5389 bus,
5390 arg_machine,
5391 *pid,
5392 arg_directory,
5393 arg_uuid,
5394 ifi,
5395 arg_slice,
5396 arg_custom_mounts, arg_n_custom_mounts,
5397 arg_kill_signal,
5398 arg_property,
5399 arg_property_message,
5400 arg_keep_unit,
5401 arg_container_service_name,
5402 arg_start_mode);
5403 if (r < 0)
5404 return r;
5405
5406 } else if (!arg_keep_unit) {
5407 r = allocate_scope(
5408 bus,
5409 arg_machine,
5410 *pid,
5411 arg_slice,
5412 arg_custom_mounts, arg_n_custom_mounts,
5413 arg_kill_signal,
5414 arg_property,
5415 arg_property_message,
5416 /* allow_pidfds= */ true,
5417 arg_start_mode);
5418 if (r < 0)
5419 return r;
5420
5421 } else if (arg_slice || arg_property)
5422 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5423
5424 r = create_subcgroup(
5425 *pid,
5426 arg_keep_unit,
5427 arg_unified_cgroup_hierarchy,
5428 arg_uid_shift,
5429 userns_fd,
5430 arg_privileged);
5431 if (r < 0)
5432 return r;
5433
5434 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5435 if (r < 0)
5436 return r;
5437
5438 /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
5439 * that the child can now hand over control to the code to run inside the container. */
5440 (void) barrier_place(&barrier); /* #4 */
5441
5442 /* Block SIGCHLD here, before notifying child.
5443 * process_pty() will handle it with the other signals. */
5444 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5445
5446 /* Reset signal to default */
5447 r = default_signals(SIGCHLD);
5448 if (r < 0)
5449 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5450
5451 r = sd_event_new(&event);
5452 if (r < 0)
5453 return log_error_errno(r, "Failed to get default event source: %m");
5454
5455 (void) sd_event_set_watchdog(event, true);
5456
5457 if (bus) {
5458 r = sd_bus_attach_event(bus, event, 0);
5459 if (r < 0)
5460 return log_error_errno(r, "Failed to attach bus to event loop: %m");
5461 }
5462
5463 r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5464 if (r < 0)
5465 return r;
5466
5467 /* Wait that the child is completely ready now, and has mounted their own copies of procfs and so on,
5468 * before we take the fully visible instances away. */
5469 if (!barrier_sync(&barrier)) /* #5.1 */
5470 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5471
5472 if (arg_userns_mode != USER_NAMESPACE_NO) {
5473 r = wipe_fully_visible_fs(mntns_fd);
5474 if (r < 0)
5475 return r;
5476 mntns_fd = safe_close(mntns_fd);
5477 }
5478
5479 /* And now let the child know that we completed removing the procfs instances, and it can start the
5480 * payload. */
5481 if (!barrier_place(&barrier)) /* #5.2 */
5482 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5483
5484 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5485 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5486 etc_passwd_lock = safe_close(etc_passwd_lock);
5487
5488 (void) sd_notifyf(false,
5489 "STATUS=Container running.\n"
5490 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5491 if (!arg_notify_ready) {
5492 r = sd_notify(false, "READY=1\n");
5493 if (r < 0)
5494 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5495 }
5496
5497 if (arg_kill_signal > 0) {
5498 /* Try to kill the init system on SIGINT or SIGTERM */
5499 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5500 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5501 } else {
5502 /* Immediately exit */
5503 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5504 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5505 }
5506
5507 (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
5508
5509 r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
5510 if (r < 0)
5511 log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
5512
5513 /* Exit when the child exits */
5514 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5515
5516 /* Retrieve the kmsg fifo allocated by inner child */
5517 fd_kmsg_fifo = receive_one_fd(fd_inner_socket_pair[0], 0);
5518 if (fd_kmsg_fifo < 0)
5519 return log_error_errno(fd_kmsg_fifo, "Failed to receive kmsg fifo from inner child: %m");
5520
5521 if (arg_expose_ports) {
5522 r = expose_port_watch_rtnl(event, fd_inner_socket_pair[0], on_address_change, expose_args, &rtnl);
5523 if (r < 0)
5524 return r;
5525
5526 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5527 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5528 }
5529
5530 if (arg_console_mode != CONSOLE_PIPE) {
5531 _cleanup_close_ int fd = -EBADF;
5532 PTYForwardFlags flags = 0;
5533
5534 /* Retrieve the master pty allocated by inner child */
5535 fd = receive_one_fd(fd_inner_socket_pair[0], 0);
5536 if (fd < 0)
5537 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5538
5539 switch (arg_console_mode) {
5540
5541 case CONSOLE_READ_ONLY:
5542 flags |= PTY_FORWARD_READ_ONLY;
5543
5544 _fallthrough_;
5545
5546 case CONSOLE_INTERACTIVE:
5547 flags |= PTY_FORWARD_IGNORE_VHANGUP;
5548
5549 r = pty_forward_new(event, fd, flags, &forward);
5550 if (r < 0)
5551 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5552
5553 if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5554 (void) pty_forward_set_width_height(
5555 forward,
5556 arg_console_width,
5557 arg_console_height);
5558
5559 if (!arg_background && shall_tint_background()) {
5560 _cleanup_free_ char *bg = NULL;
5561
5562 r = terminal_tint_color(220 /* blue */, &bg);
5563 if (r < 0)
5564 log_debug_errno(r, "Failed to determine terminal background color, not tinting.");
5565 else
5566 (void) pty_forward_set_background_color(forward, bg);
5567 } else if (!isempty(arg_background))
5568 (void) pty_forward_set_background_color(forward, arg_background);
5569
5570 set_window_title(forward);
5571 break;
5572
5573 default:
5574 assert(arg_console_mode == CONSOLE_PASSIVE);
5575 }
5576
5577 *master = TAKE_FD(fd);
5578 }
5579
5580 fd_inner_socket_pair[0] = safe_close(fd_inner_socket_pair[0]);
5581
5582 r = sd_event_loop(event);
5583 if (r < 0)
5584 return log_error_errno(r, "Failed to run event loop: %m");
5585
5586 if (forward) {
5587 char last_char = 0;
5588
5589 (void) pty_forward_get_last_char(forward, &last_char);
5590 forward = pty_forward_free(forward);
5591
5592 if (!arg_quiet && last_char != '\n')
5593 putc('\n', stdout);
5594 }
5595
5596 /* Kill if it is not dead yet anyway */
5597 if (!arg_register && !arg_keep_unit && bus)
5598 terminate_scope(bus, arg_machine);
5599
5600 /* Normally redundant, but better safe than sorry */
5601 (void) kill(*pid, SIGKILL);
5602
5603 fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
5604
5605 if (arg_private_network && arg_privileged) {
5606 r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
5607 if (r < 0)
5608 return r;
5609 }
5610
5611 r = wait_for_container(TAKE_PID(*pid), &container_status);
5612
5613 /* Tell machined that we are gone. */
5614 if (bus)
5615 (void) unregister_machine(bus, arg_machine);
5616
5617 if (r < 0)
5618 /* We failed to wait for the container, or the container exited abnormally. */
5619 return r;
5620 if (r > 0 || container_status == CONTAINER_TERMINATED) {
5621 /* r > 0 → The container exited with a non-zero status.
5622 * As a special case, we need to replace 133 with a different value,
5623 * because 133 is special-cased in the service file to reboot the container.
5624 * otherwise → The container exited with zero status and a reboot was not requested.
5625 */
5626 if (r == EXIT_FORCE_RESTART)
5627 r = EXIT_FAILURE; /* replace 133 with the general failure code */
5628 *ret = r;
5629 return 0; /* finito */
5630 }
5631
5632 /* CONTAINER_REBOOTED, loop again */
5633
5634 if (arg_keep_unit) {
5635 /* Special handling if we are running as a service: instead of simply
5636 * restarting the machine we want to restart the entire service, so let's
5637 * inform systemd about this with the special exit code 133. The service
5638 * file uses RestartForceExitStatus=133 so that this results in a full
5639 * nspawn restart. This is necessary since we might have cgroup parameters
5640 * set we want to have flushed out. */
5641 *ret = EXIT_FORCE_RESTART;
5642 return 0; /* finito */
5643 }
5644
5645 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5646 expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5647
5648 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5649 *veth_created = false;
5650 return 1; /* loop again */
5651 }
5652
5653 static int initialize_rlimits(void) {
5654 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5655 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5656 * container execution environments. */
5657
5658 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5659 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
5660 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
5661 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
5662 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
5663 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
5664 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
5665 [RLIMIT_MEMLOCK] = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5666 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
5667 [RLIMIT_NICE] = { 0, 0 },
5668 [RLIMIT_NOFILE] = { 1024, 4096 },
5669 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
5670 [RLIMIT_RTPRIO] = { 0, 0 },
5671 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
5672 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
5673
5674 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5675 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5676 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5677 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5678 * that PID 1 changes a number of other resource limits during early initialization which is why we
5679 * don't read the other limits from PID 1 but prefer the static table above. */
5680 };
5681
5682 int rl, r;
5683
5684 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5685 /* Let's only fill in what the user hasn't explicitly configured anyway */
5686 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5687 const struct rlimit *v;
5688 struct rlimit buffer;
5689
5690 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5691 /* For these two let's read the limits off PID 1. See above for an explanation. */
5692
5693 r = pid_getrlimit(1, rl, &buffer);
5694 if (r < 0)
5695 return log_error_errno(r, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5696
5697 v = &buffer;
5698 } else if (rl == RLIMIT_NOFILE) {
5699 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5700 * userspace. Given that nspawn containers are often run without our PID 1,
5701 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5702 * so that container userspace gets similar resources as host userspace
5703 * gets. */
5704 buffer = kernel_defaults[rl];
5705 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5706 v = &buffer;
5707 } else
5708 v = kernel_defaults + rl;
5709
5710 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5711 if (!arg_rlimit[rl])
5712 return log_oom();
5713 }
5714
5715 if (DEBUG_LOGGING) {
5716 _cleanup_free_ char *k = NULL;
5717
5718 (void) rlimit_format(arg_rlimit[rl], &k);
5719 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5720 }
5721 }
5722
5723 return 0;
5724 }
5725
5726 static int cant_be_in_netns(void) {
5727 _cleanup_close_ int fd = -EBADF;
5728 struct ucred ucred;
5729 int r;
5730
5731 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5732 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5733 * nice message. */
5734
5735 if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5736 return 0;
5737
5738 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5739 if (fd < 0)
5740 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5741
5742 r = connect_unix_path(fd, AT_FDCWD, "/run/udev/control");
5743 if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
5744 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5745 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5746 if (ERRNO_IS_NEG_PRIVILEGE(r)) {
5747 log_debug_errno(r, "Can't connect to udev control socket, assuming we are in same netns.");
5748 return 0;
5749 }
5750 if (r < 0)
5751 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
5752
5753 r = getpeercred(fd, &ucred);
5754 if (r < 0)
5755 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5756
5757 r = in_same_namespace(ucred.pid, 0, NAMESPACE_NET);
5758 if (r < 0)
5759 return log_error_errno(r, "Failed to determine network namespace of udev: %m");
5760 if (r == 0)
5761 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5762 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5763 return 0;
5764 }
5765
5766 static int run(int argc, char *argv[]) {
5767 bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
5768 _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
5769 _cleanup_fdset_free_ FDSet *fds = NULL;
5770 int r, n_fd_passed, ret = EXIT_SUCCESS;
5771 char veth_name[IFNAMSIZ] = "";
5772 struct ExposeArgs expose_args = {};
5773 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5774 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5775 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5776 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5777 _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5778 pid_t pid = 0;
5779
5780 log_setup();
5781
5782 arg_privileged = getuid() == 0;
5783
5784 r = parse_argv(argc, argv);
5785 if (r <= 0)
5786 goto finish;
5787
5788 r = cant_be_in_netns();
5789 if (r < 0)
5790 goto finish;
5791
5792 r = initialize_rlimits();
5793 if (r < 0)
5794 goto finish;
5795
5796 r = load_oci_bundle();
5797 if (r < 0)
5798 goto finish;
5799
5800 r = pick_paths();
5801 if (r < 0)
5802 goto finish;
5803
5804 r = determine_names();
5805 if (r < 0)
5806 goto finish;
5807
5808 r = load_settings();
5809 if (r < 0)
5810 goto finish;
5811
5812 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
5813 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
5814 * indicate that. */
5815 if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
5816 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
5817
5818 r = cg_unified(); /* initialize cache early */
5819 if (r < 0) {
5820 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5821 goto finish;
5822 }
5823
5824 r = verify_arguments();
5825 if (r < 0)
5826 goto finish;
5827
5828 r = resolve_network_interface_names(arg_network_interfaces);
5829 if (r < 0)
5830 goto finish;
5831
5832 r = verify_network_interfaces_initialized();
5833 if (r < 0)
5834 goto finish;
5835
5836 /* Reapply environment settings. */
5837 (void) detect_unified_cgroup_hierarchy_from_environment();
5838
5839 if (!arg_privileged) {
5840 r = cg_all_unified();
5841 if (r < 0) {
5842 log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
5843 goto finish;
5844 }
5845 if (r == 0)
5846 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
5847 }
5848
5849 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5850 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5851 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5852 (void) ignore_signals(SIGPIPE);
5853
5854 n_fd_passed = sd_listen_fds(false);
5855 if (n_fd_passed > 0) {
5856 r = fdset_new_listen_fds(&fds, false);
5857 if (r < 0) {
5858 log_error_errno(r, "Failed to collect file descriptors: %m");
5859 goto finish;
5860 }
5861 }
5862
5863 /* The "default" umask. This is appropriate for most file and directory
5864 * operations performed by nspawn, and is the umask that will be used for
5865 * the child. Functions like copy_devnodes() change the umask temporarily. */
5866 umask(0022);
5867
5868 if (arg_console_mode < 0)
5869 arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
5870 CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5871
5872 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5873 arg_quiet = true;
5874
5875 if (arg_directory) {
5876 assert(!arg_image);
5877
5878 if (!arg_privileged) {
5879 r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
5880 goto finish;
5881 }
5882
5883 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5884 * /var from the host will propagate into container dynamically (because bad things happen if
5885 * two systems write to the same /var). Let's allow it for the special cases where /var is
5886 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5887 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5888 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5889 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5890 goto finish;
5891 }
5892
5893 if (arg_ephemeral) {
5894 _cleanup_free_ char *np = NULL;
5895
5896 r = chase_and_update(&arg_directory, 0);
5897 if (r < 0)
5898 goto finish;
5899
5900 /* If the specified path is a mount point we generate the new snapshot immediately
5901 * inside it under a random name. However if the specified is not a mount point we
5902 * create the new snapshot in the parent directory, just next to it. */
5903 r = path_is_mount_point(arg_directory);
5904 if (r < 0) {
5905 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5906 goto finish;
5907 }
5908 if (r > 0)
5909 r = tempfn_random_child(arg_directory, "machine.", &np);
5910 else
5911 r = tempfn_random(arg_directory, "machine.", &np);
5912 if (r < 0) {
5913 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5914 goto finish;
5915 }
5916
5917 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5918 * only owned by us and no one else. */
5919 r = image_path_lock(
5920 np,
5921 LOCK_EX|LOCK_NB,
5922 arg_privileged ? &tree_global_lock : NULL,
5923 &tree_local_lock);
5924 if (r < 0) {
5925 log_error_errno(r, "Failed to lock %s: %m", np);
5926 goto finish;
5927 }
5928
5929 {
5930 BLOCK_SIGNALS(SIGINT);
5931 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_directory, AT_FDCWD, np,
5932 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5933 BTRFS_SNAPSHOT_FALLBACK_COPY |
5934 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5935 BTRFS_SNAPSHOT_RECURSIVE |
5936 BTRFS_SNAPSHOT_QUOTA |
5937 BTRFS_SNAPSHOT_SIGINT);
5938 }
5939 if (r == -EINTR) {
5940 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5941 goto finish;
5942 }
5943 if (r < 0) {
5944 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5945 goto finish;
5946 }
5947
5948 free_and_replace(arg_directory, np);
5949 remove_directory = true;
5950 } else {
5951 r = chase_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5952 if (r < 0)
5953 goto finish;
5954
5955 r = image_path_lock(
5956 arg_directory,
5957 (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
5958 arg_privileged ? &tree_global_lock : NULL,
5959 &tree_local_lock);
5960 if (r == -EBUSY) {
5961 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5962 goto finish;
5963 }
5964 if (r < 0) {
5965 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5966 goto finish;
5967 }
5968
5969 if (arg_template) {
5970 r = chase_and_update(&arg_template, 0);
5971 if (r < 0)
5972 goto finish;
5973
5974 {
5975 BLOCK_SIGNALS(SIGINT);
5976 r = btrfs_subvol_snapshot_at(AT_FDCWD, arg_template, AT_FDCWD, arg_directory,
5977 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5978 BTRFS_SNAPSHOT_FALLBACK_COPY |
5979 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5980 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5981 BTRFS_SNAPSHOT_RECURSIVE |
5982 BTRFS_SNAPSHOT_QUOTA |
5983 BTRFS_SNAPSHOT_SIGINT);
5984 }
5985 if (r == -EEXIST)
5986 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5987 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5988 else if (r == -EINTR) {
5989 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5990 goto finish;
5991 } else if (r < 0) {
5992 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5993 goto finish;
5994 } else
5995 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5996 "Populated %s from template %s.", arg_directory, arg_template);
5997 }
5998 }
5999
6000 if (arg_start_mode == START_BOOT) {
6001 _cleanup_free_ char *b = NULL;
6002 const char *p;
6003 int check_os_release, is_os_tree;
6004
6005 if (arg_pivot_root_new) {
6006 b = path_join(arg_directory, arg_pivot_root_new);
6007 if (!b) {
6008 r = log_oom();
6009 goto finish;
6010 }
6011
6012 p = b;
6013 } else
6014 p = arg_directory;
6015
6016 check_os_release = getenv_bool("SYSTEMD_NSPAWN_CHECK_OS_RELEASE");
6017 if (check_os_release < 0 && check_os_release != -ENXIO) {
6018 r = log_error_errno(check_os_release, "Failed to parse $SYSTEMD_NSPAWN_CHECK_OS_RELEASE: %m");
6019 goto finish;
6020 }
6021
6022 is_os_tree = path_is_os_tree(p);
6023 if (is_os_tree == 0 && check_os_release == 0)
6024 log_debug("Directory %s is missing an os-release file, continuing anyway.", p);
6025 else if (is_os_tree <= 0) {
6026 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6027 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
6028 goto finish;
6029 }
6030 } else {
6031 _cleanup_free_ char *p = NULL;
6032
6033 if (arg_pivot_root_new)
6034 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
6035 else
6036 p = path_join(arg_directory, "/usr/");
6037 if (!p) {
6038 r = log_oom();
6039 goto finish;
6040 }
6041
6042 if (laccess(p, F_OK) < 0) {
6043 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
6044 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
6045 goto finish;
6046 }
6047 }
6048
6049 } else {
6050 DissectImageFlags dissect_image_flags =
6051 determine_dissect_image_flags();
6052
6053 assert(arg_image);
6054 assert(!arg_template);
6055
6056
6057 r = chase_and_update(&arg_image, 0);
6058 if (r < 0)
6059 goto finish;
6060
6061 if (arg_ephemeral) {
6062 _cleanup_free_ char *np = NULL;
6063
6064 r = tempfn_random(arg_image, "machine.", &np);
6065 if (r < 0) {
6066 log_error_errno(r, "Failed to generate name for image snapshot: %m");
6067 goto finish;
6068 }
6069
6070 /* Always take an exclusive lock on our own ephemeral copy. */
6071 r = image_path_lock(
6072 np,
6073 LOCK_EX|LOCK_NB,
6074 arg_privileged ? &tree_global_lock : NULL,
6075 &tree_local_lock);
6076 if (r < 0) {
6077 log_error_errno(r, "Failed to create image lock: %m");
6078 goto finish;
6079 }
6080
6081 {
6082 BLOCK_SIGNALS(SIGINT);
6083 r = copy_file_full(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600,
6084 FS_NOCOW_FL, FS_NOCOW_FL,
6085 COPY_REFLINK|COPY_CRTIME|COPY_SIGINT,
6086 NULL, NULL);
6087 }
6088 if (r == -EINTR) {
6089 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
6090 goto finish;
6091 }
6092 if (r < 0) {
6093 r = log_error_errno(r, "Failed to copy image file: %m");
6094 goto finish;
6095 }
6096
6097 free_and_replace(arg_image, np);
6098 remove_image = true;
6099 } else {
6100 r = image_path_lock(
6101 arg_image,
6102 (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
6103 arg_privileged ? &tree_global_lock : NULL,
6104 &tree_local_lock);
6105 if (r == -EBUSY) {
6106 log_error_errno(r, "Disk image %s is currently busy.", arg_image);
6107 goto finish;
6108 }
6109 if (r < 0) {
6110 log_error_errno(r, "Failed to create image lock: %m");
6111 goto finish;
6112 }
6113
6114 r = verity_settings_load(
6115 &arg_verity_settings,
6116 arg_image, NULL, NULL);
6117 if (r < 0) {
6118 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
6119 goto finish;
6120 }
6121
6122 if (arg_verity_settings.data_path)
6123 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
6124 }
6125
6126 if (!mkdtemp(tmprootdir)) {
6127 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6128 goto finish;
6129 }
6130
6131 remove_tmprootdir = true;
6132
6133 arg_directory = strdup(tmprootdir);
6134 if (!arg_directory) {
6135 r = log_oom();
6136 goto finish;
6137 }
6138
6139 if (arg_privileged) {
6140 r = loop_device_make_by_path(
6141 arg_image,
6142 arg_read_only ? O_RDONLY : O_RDWR,
6143 /* sector_size= */ UINT32_MAX,
6144 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
6145 LOCK_SH,
6146 &loop);
6147 if (r < 0) {
6148 log_error_errno(r, "Failed to set up loopback block device: %m");
6149 goto finish;
6150 }
6151
6152 r = dissect_loop_device_and_warn(
6153 loop,
6154 &arg_verity_settings,
6155 /* mount_options=*/ NULL,
6156 arg_image_policy ?: &image_policy_container,
6157 dissect_image_flags,
6158 &dissected_image);
6159 if (r == -ENOPKG) {
6160 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
6161 log_notice("Note that the disk image needs to\n"
6162 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
6163 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
6164 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
6165 " d) or contain a file system without a partition table\n"
6166 "in order to be bootable with systemd-nspawn.");
6167 goto finish;
6168 }
6169 if (r < 0)
6170 goto finish;
6171
6172 r = dissected_image_load_verity_sig_partition(
6173 dissected_image,
6174 loop->fd,
6175 &arg_verity_settings);
6176 if (r < 0)
6177 goto finish;
6178
6179 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
6180 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
6181 "root hash signature found! Proceeding without integrity checking.", arg_image);
6182
6183 r = dissected_image_decrypt_interactively(
6184 dissected_image,
6185 NULL,
6186 &arg_verity_settings,
6187 dissect_image_flags);
6188 if (r < 0)
6189 goto finish;
6190 } else {
6191 _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
6192 if (!userns_name) {
6193 r = log_oom();
6194 goto finish;
6195 }
6196
6197 /* if we are unprivileged, let's allocate a 64K userns first */
6198 userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
6199 if (userns_fd < 0) {
6200 r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
6201 goto finish;
6202 }
6203
6204 r = mountfsd_mount_image(
6205 arg_image,
6206 userns_fd,
6207 arg_image_policy,
6208 dissect_image_flags,
6209 &dissected_image);
6210 if (r < 0)
6211 goto finish;
6212 }
6213
6214 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
6215 if (remove_image && unlink(arg_image) >= 0)
6216 remove_image = false;
6217
6218 if (arg_architecture < 0)
6219 arg_architecture = dissected_image_architecture(dissected_image);
6220 }
6221
6222 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
6223 if (r < 0)
6224 goto finish;
6225
6226 if (!arg_quiet) {
6227 const char *t = arg_image ?: arg_directory;
6228 _cleanup_free_ char *u = NULL;
6229 (void) terminal_urlify_path(t, t, &u);
6230
6231 log_info("%s %sSpawning container %s on %s.%s",
6232 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), arg_machine, u ?: t, ansi_normal());
6233
6234 if (arg_console_mode == CONSOLE_INTERACTIVE)
6235 log_info("%s %sPress %sCtrl-]%s three times within 1s to kill container.%s",
6236 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal());
6237 }
6238
6239 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, SIGRTMIN+18) >= 0);
6240
6241 r = make_reaper_process(true);
6242 if (r < 0) {
6243 log_error_errno(r, "Failed to become subreaper: %m");
6244 goto finish;
6245 }
6246
6247 if (arg_expose_ports) {
6248 r = fw_ctx_new(&fw_ctx);
6249 if (r < 0) {
6250 log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
6251 goto finish;
6252 }
6253 expose_args.fw_ctx = fw_ctx;
6254 }
6255 for (;;) {
6256 r = run_container(
6257 dissected_image,
6258 userns_fd,
6259 fds,
6260 veth_name, &veth_created,
6261 &expose_args, &master,
6262 &pid, &ret);
6263 if (r <= 0)
6264 break;
6265 }
6266
6267 finish:
6268 (void) sd_notify(false,
6269 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
6270 "STOPPING=1\nSTATUS=Terminating...");
6271
6272 if (pid > 0)
6273 (void) kill(pid, SIGKILL);
6274
6275 /* Try to flush whatever is still queued in the pty */
6276 if (master >= 0) {
6277 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
6278 master = safe_close(master);
6279 }
6280
6281 if (pid > 0)
6282 (void) wait_for_terminate(pid, NULL);
6283
6284 pager_close();
6285
6286 if (remove_directory && arg_directory) {
6287 int k;
6288
6289 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
6290 if (k < 0)
6291 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
6292 }
6293
6294 if (remove_image && arg_image) {
6295 if (unlink(arg_image) < 0)
6296 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
6297 }
6298
6299 if (remove_tmprootdir) {
6300 if (rmdir(tmprootdir) < 0)
6301 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
6302 }
6303
6304 if (arg_machine && arg_privileged) {
6305 const char *p;
6306
6307 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
6308 (void) rm_rf(p, REMOVE_ROOT);
6309
6310 p = strjoina("/run/systemd/nspawn/unix-export/", arg_machine);
6311 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6312 (void) rmdir(p);
6313 }
6314
6315 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
6316 expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
6317
6318 if (arg_privileged) {
6319 if (veth_created)
6320 (void) remove_veth_links(veth_name, arg_network_veth_extra);
6321 (void) remove_bridge(arg_network_zone);
6322 }
6323
6324 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
6325 expose_port_free_all(arg_expose_ports);
6326 rlimit_free_all(arg_rlimit);
6327 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
6328
6329 if (r < 0)
6330 return r;
6331
6332 return ret;
6333 }
6334
/* Hooks run() up as the program's entry point. NOTE(review): presumably the macro maps a negative return of
 * run() to a failure exit code and passes a non-negative return through as the exit status — confirm
 * against the macro's definition. */
DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);