]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tree-wide: drop sched.h when missing_sched.h is included
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_BLKID
4 #include <blkid.h>
5 #endif
6 #include <errno.h>
7 #include <getopt.h>
8 #include <grp.h>
9 #include <linux/fs.h>
10 #include <linux/loop.h>
11 #include <pwd.h>
12 #if HAVE_SELINUX
13 #include <selinux/selinux.h>
14 #endif
15 #include <signal.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <sys/file.h>
19 #include <sys/personality.h>
20 #include <sys/prctl.h>
21 #include <sys/types.h>
22 #include <sys/wait.h>
23 #include <unistd.h>
24
25 #include "sd-bus.h"
26 #include "sd-daemon.h"
27 #include "sd-id128.h"
28
29 #include "alloc-util.h"
30 #include "barrier.h"
31 #include "base-filesystem.h"
32 #include "blkid-util.h"
33 #include "btrfs-util.h"
34 #include "bus-error.h"
35 #include "bus-util.h"
36 #include "cap-list.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
39 #include "copy.h"
40 #include "cpu-set-util.h"
41 #include "dev-setup.h"
42 #include "dissect-image.h"
43 #include "env-util.h"
44 #include "fd-util.h"
45 #include "fdset.h"
46 #include "fileio.h"
47 #include "format-util.h"
48 #include "fs-util.h"
49 #include "gpt.h"
50 #include "hexdecoct.h"
51 #include "hostname-util.h"
52 #include "id128-util.h"
53 #include "log.h"
54 #include "loop-util.h"
55 #include "loopback-setup.h"
56 #include "machine-image.h"
57 #include "macro.h"
58 #include "main-func.h"
59 #include "missing_sched.h"
60 #include "mkdir.h"
61 #include "mount-util.h"
62 #include "mountpoint-util.h"
63 #include "namespace-util.h"
64 #include "netlink-util.h"
65 #include "nspawn-cgroup.h"
66 #include "nspawn-def.h"
67 #include "nspawn-expose-ports.h"
68 #include "nspawn-mount.h"
69 #include "nspawn-network.h"
70 #include "nspawn-oci.h"
71 #include "nspawn-patch-uid.h"
72 #include "nspawn-register.h"
73 #include "nspawn-seccomp.h"
74 #include "nspawn-settings.h"
75 #include "nspawn-setuid.h"
76 #include "nspawn-stub-pid1.h"
77 #include "nulstr-util.h"
78 #include "os-util.h"
79 #include "pager.h"
80 #include "parse-util.h"
81 #include "path-util.h"
82 #include "pretty-print.h"
83 #include "process-util.h"
84 #include "ptyfwd.h"
85 #include "random-util.h"
86 #include "raw-clone.h"
87 #include "rlimit-util.h"
88 #include "rm-rf.h"
89 #if HAVE_SECCOMP
90 #include "seccomp-util.h"
91 #endif
92 #include "selinux-util.h"
93 #include "signal-util.h"
94 #include "socket-util.h"
95 #include "stat-util.h"
96 #include "stdio-util.h"
97 #include "string-table.h"
98 #include "string-util.h"
99 #include "strv.h"
100 #include "sysctl-util.h"
101 #include "terminal-util.h"
102 #include "tmpfile-util.h"
103 #include "umask-util.h"
104 #include "unit-name.h"
105 #include "user-util.h"
106 #include "util.h"
107
108 #if HAVE_SPLIT_USR
109 #define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
110 #else
111 #define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
112 #endif
113
/* nspawn listens on the socket at the path given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
 * The init process in the container can send messages to nspawn over it, following the sd_notify(3) protocol. */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
118
119 #define EXIT_FORCE_RESTART 133
120
/* Container outcome codes.
 * NOTE(review): judging by the names only, these distinguish a container that terminated from one that
 * asked to be rebooted; the code consuming them is outside this view — confirm there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
125
/* Configuration of this nspawn invocation, kept in file-scope statics. The initializers below are the
 * defaults; parse_environment() and parse_argv() in this file overwrite them. */
126 static char *arg_directory = NULL;
127 static char *arg_template = NULL;
128 static char *arg_chdir = NULL;
129 static char *arg_pivot_root_new = NULL;
130 static char *arg_pivot_root_old = NULL;
131 static char *arg_user = NULL;
132 static uid_t arg_uid = UID_INVALID;
133 static gid_t arg_gid = GID_INVALID;
134 static gid_t* arg_supplementary_gids = NULL;
135 static size_t arg_n_supplementary_gids = 0;
136 static sd_id128_t arg_uuid = {};
137 static char *arg_machine = NULL; /* The name used by the host to refer to this */
138 static char *arg_hostname = NULL; /* The name the payload sees by default */
/* The two SELinux contexts are assigned straight from optarg in parse_argv(), i.e. they are borrowed
 * pointers rather than owned copies (hence const). */
139 static const char *arg_selinux_context = NULL;
140 static const char *arg_selinux_apifs_context = NULL;
141 static char *arg_slice = NULL;
142 static bool arg_private_network = false;
143 static bool arg_read_only = false;
144 static StartMode arg_start_mode = START_PID1;
145 static bool arg_ephemeral = false;
146 static LinkJournal arg_link_journal = LINK_AUTO;
147 static bool arg_link_journal_try = false;
/* Capability bitmask, one bit per CAP_* number; the value below is the default set. --capability= and
 * --drop-capability= are collected into separate "plus"/"minus" masks inside parse_argv(). */
148 static uint64_t arg_caps_retain =
149 (1ULL << CAP_AUDIT_CONTROL) |
150 (1ULL << CAP_AUDIT_WRITE) |
151 (1ULL << CAP_CHOWN) |
152 (1ULL << CAP_DAC_OVERRIDE) |
153 (1ULL << CAP_DAC_READ_SEARCH) |
154 (1ULL << CAP_FOWNER) |
155 (1ULL << CAP_FSETID) |
156 (1ULL << CAP_IPC_OWNER) |
157 (1ULL << CAP_KILL) |
158 (1ULL << CAP_LEASE) |
159 (1ULL << CAP_LINUX_IMMUTABLE) |
160 (1ULL << CAP_MKNOD) |
161 (1ULL << CAP_NET_BIND_SERVICE) |
162 (1ULL << CAP_NET_BROADCAST) |
163 (1ULL << CAP_NET_RAW) |
164 (1ULL << CAP_SETFCAP) |
165 (1ULL << CAP_SETGID) |
166 (1ULL << CAP_SETPCAP) |
167 (1ULL << CAP_SETUID) |
168 (1ULL << CAP_SYS_ADMIN) |
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_SYS_CHROOT) |
171 (1ULL << CAP_SYS_NICE) |
172 (1ULL << CAP_SYS_PTRACE) |
173 (1ULL << CAP_SYS_RESOURCE) |
174 (1ULL << CAP_SYS_TTY_CONFIG);
175 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
176 static CustomMount *arg_custom_mounts = NULL;
177 static size_t arg_n_custom_mounts = 0;
178 static char **arg_setenv = NULL;
179 static bool arg_quiet = false;
180 static bool arg_register = true;
181 static bool arg_keep_unit = false;
182 static char **arg_network_interfaces = NULL;
183 static char **arg_network_macvlan = NULL;
184 static char **arg_network_ipvlan = NULL;
185 static bool arg_network_veth = false;
186 static char **arg_network_veth_extra = NULL;
187 static char *arg_network_bridge = NULL;
188 static char *arg_network_zone = NULL;
189 static char *arg_network_namespace_path = NULL;
190 static PagerFlags arg_pager_flags = 0;
191 static unsigned long arg_personality = PERSONALITY_INVALID;
192 static char *arg_image = NULL;
193 static char *arg_oci_bundle = NULL;
194 static VolatileMode arg_volatile_mode = VOLATILE_NO;
195 static ExposePort *arg_expose_ports = NULL;
196 static char **arg_property = NULL;
197 static sd_bus_message *arg_property_message = NULL;
198 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
/* The default range size 0x10000 is 65536. */
199 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
200 static bool arg_userns_chown = false;
201 static int arg_kill_signal = 0;
202 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bitmask of SETTING_* flags; parse_argv() and parse_environment() set a flag whenever the corresponding
 * setting was given explicitly. */
203 static SettingsMask arg_settings_mask = 0;
/* NOTE(review): -1 presumably means "not decided yet" (tristate) — confirm against the users of this
 * variable. */
204 static int arg_settings_trusted = -1;
205 static char **arg_parameters = NULL;
/* May be replaced by the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment(); in that case
 * it points at the string returned by getenv(). */
206 static const char *arg_container_service_name = "systemd-nspawn";
207 static bool arg_notify_ready = false;
208 static bool arg_use_cgns = true;
/* CLONE_NEW* flags; individual bits are cleared or set again by parse_share_ns_env() according to the
 * $SYSTEMD_NSPAWN_SHARE_* environment variables. */
209 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
210 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
211 static void *arg_root_hash = NULL;
212 static size_t arg_root_hash_size = 0;
213 static char **arg_syscall_whitelist = NULL;
214 static char **arg_syscall_blacklist = NULL;
215 #if HAVE_SECCOMP
216 static scmp_filter_ctx arg_seccomp = NULL;
217 #endif
218 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
219 static bool arg_no_new_privileges = false;
220 static int arg_oom_score_adjust = 0;
221 static bool arg_oom_score_adjust_set = false;
222 static CPUSet arg_cpu_set = {};
223 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
224 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
/* NOTE(review): (unsigned) -1 presumably marks "width/height not set" — confirm where these are read. */
225 static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
226 static DeviceNode* arg_extra_nodes = NULL;
227 static size_t arg_n_extra_nodes = 0;
228 static char **arg_sysctl = NULL;
229 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
230
/* Cleanup registrations: each entry pairs one of the arg_* globals of this file with the function used to
 * release it (freep for single allocations, strv_freep for string vectors, and type-specific helpers
 * otherwise).
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER() is defined outside this file; presumably the registered
 * functions are run when the program exits — confirm against the macro's definition. */
231 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
232 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
233 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
234 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
235 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
236 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
237 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
238 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
239 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
240 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
248 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
252 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
253 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
254 STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
255 STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
256 STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
/* arg_seccomp only exists when built with seccomp support, so its registration is guarded the same way. */
257 #if HAVE_SECCOMP
258 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
259 #endif
260 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
261 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
262
263 static int handle_arg_console(const char *arg) {
264 if (streq(arg, "help")) {
265 puts("interactive\n"
266 "read-only\n"
267 "passive\n"
268 "pipe");
269 return 0;
270 }
271
272 if (streq(arg, "interactive"))
273 arg_console_mode = CONSOLE_INTERACTIVE;
274 else if (streq(arg, "read-only"))
275 arg_console_mode = CONSOLE_READ_ONLY;
276 else if (streq(arg, "passive"))
277 arg_console_mode = CONSOLE_PASSIVE;
278 else if (streq(arg, "pipe"))
279 arg_console_mode = CONSOLE_PIPE;
280 else
281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
282
283 arg_settings_mask |= SETTING_CONSOLE_MODE;
284 return 1;
285 }
286
287 static int help(void) {
288 _cleanup_free_ char *link = NULL;
289 int r;
290
291 (void) pager_open(arg_pager_flags);
292
293 r = terminal_urlify_man("systemd-nspawn", "1", &link);
294 if (r < 0)
295 return log_oom();
296
297 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
298 "Spawn a command or OS in a light-weight container.\n\n"
299 " -h --help Show this help\n"
300 " --version Print version string\n"
301 " -q --quiet Do not show status information\n"
302 " --no-pager Do not pipe output into a pager\n"
303 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
304 "%3$sImage:%4$s\n"
305 " -D --directory=PATH Root directory for the container\n"
306 " --template=PATH Initialize root directory from template directory,\n"
307 " if missing\n"
308 " -x --ephemeral Run container with snapshot of root directory, and\n"
309 " remove it after exit\n"
310 " -i --image=PATH Root file system disk image (or device node) for\n"
311 " the container\n"
312 " --oci-bundle=PATH OCI bundle directory\n"
313 " --read-only Mount the root directory read-only\n"
314 " --volatile[=MODE] Run the system in volatile mode\n"
315 " --root-hash=HASH Specify verity root hash for root disk image\n"
316 " --pivot-root=PATH[:PATH]\n"
317 " Pivot root to given directory in the container\n\n"
318 "%3$sExecution:%4$s\n"
319 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
320 " -b --boot Boot up full system (i.e. invoke init)\n"
321 " --chdir=PATH Set working directory in the container\n"
322 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
323 " -u --user=USER Run the command under specified user or UID\n"
324 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
325 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
326 "%3$sSystem Identity:%4$s\n"
327 " -M --machine=NAME Set the machine name for the container\n"
328 " --hostname=NAME Override the hostname for the container\n"
329 " --uuid=UUID Set a specific machine UUID for the container\n\n"
330 "%3$sProperties:%4$s\n"
331 " -S --slice=SLICE Place the container in the specified slice\n"
332 " --property=NAME=VALUE Set scope unit property\n"
333 " --register=BOOLEAN Register container as machine\n"
334 " --keep-unit Do not register a scope for the machine, reuse\n"
335 " the service unit nspawn is running in\n\n"
336 "%3$sUser Namespacing:%4$s\n"
337 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
338 " --private-users[=UIDBASE[:NUIDS]]\n"
339 " Similar, but with user configured UID/GID range\n"
340 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
341 "%3$sNetworking:%4$s\n"
342 " --private-network Disable network in container\n"
343 " --network-interface=INTERFACE\n"
344 " Assign an existing network interface to the\n"
345 " container\n"
346 " --network-macvlan=INTERFACE\n"
347 " Create a macvlan network interface based on an\n"
348 " existing network interface to the container\n"
349 " --network-ipvlan=INTERFACE\n"
350 " Create a ipvlan network interface based on an\n"
351 " existing network interface to the container\n"
352 " -n --network-veth Add a virtual Ethernet connection between host\n"
353 " and container\n"
354 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
355 " Add an additional virtual Ethernet link between\n"
356 " host and container\n"
357 " --network-bridge=INTERFACE\n"
358 " Add a virtual Ethernet connection to the container\n"
359 " and attach it to an existing bridge on the host\n"
360 " --network-zone=NAME Similar, but attach the new interface to an\n"
361 " an automatically managed bridge interface\n"
362 " --network-namespace-path=PATH\n"
363 " Set network namespace to the one represented by\n"
364 " the specified kernel namespace file node\n"
365 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
366 " Expose a container IP port on the host\n\n"
367 "%3$sSecurity:%4$s\n"
368 " --capability=CAP In addition to the default, retain specified\n"
369 " capability\n"
370 " --drop-capability=CAP Drop the specified capability from the default set\n"
371 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
372 " --system-call-filter=LIST|~LIST\n"
373 " Permit/prohibit specific system calls\n"
374 " -Z --selinux-context=SECLABEL\n"
375 " Set the SELinux security context to be used by\n"
376 " processes in the container\n"
377 " -L --selinux-apifs-context=SECLABEL\n"
378 " Set the SELinux security context to be used by\n"
379 " API/tmpfs file systems in the container\n\n"
380 "%3$sResources:%4$s\n"
381 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
382 " --oom-score-adjust=VALUE\n"
383 " Adjust the OOM score value for the payload\n"
384 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
385 " --personality=ARCH Pick personality for this container\n\n"
386 "%3$sIntegration:%4$s\n"
387 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
388 " --timezone=MODE Select mode of /etc/localtime initialization\n"
389 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
390 " host, try-guest, try-host\n"
391 " -j Equivalent to --link-journal=try-guest\n\n"
392 "%3$sMounts:%4$s\n"
393 " --bind=PATH[:PATH[:OPTIONS]]\n"
394 " Bind mount a file or directory from the host into\n"
395 " the container\n"
396 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
397 " Similar, but creates a read-only bind mount\n"
398 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
399 " it\n"
400 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
401 " --overlay=PATH[:PATH...]:PATH\n"
402 " Create an overlay mount from the host to \n"
403 " the container\n"
404 " --overlay-ro=PATH[:PATH...]:PATH\n"
405 " Similar, but creates a read-only overlay mount\n\n"
406 "%3$sInput/Output:%4$s\n"
407 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
408 " set up for the container.\n"
409 " -P --pipe Equivalent to --console=pipe\n"
410 "\nSee the %2$s for details.\n"
411 , program_invocation_short_name
412 , link
413 , ansi_underline(), ansi_normal());
414
415 return 0;
416 }
417
418 static int custom_mount_check_all(void) {
419 size_t i;
420
421 for (i = 0; i < arg_n_custom_mounts; i++) {
422 CustomMount *m = &arg_custom_mounts[i];
423
424 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
425 if (arg_userns_chown)
426 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
427 "--private-users-chown may not be combined with custom root mounts.");
428 else if (arg_uid_shift == UID_INVALID)
429 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
430 "--private-users with automatic UID shift may not be combined with custom root mounts.");
431 }
432 }
433
434 return 0;
435 }
436
437 static int detect_unified_cgroup_hierarchy_from_environment(void) {
438 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
439 int r;
440
441 /* Allow the user to control whether the unified hierarchy is used */
442
443 e = getenv(var);
444 if (!e) {
445 static bool warned = false;
446
447 var = "UNIFIED_CGROUP_HIERARCHY";
448 e = getenv(var);
449 if (e && !warned) {
450 log_info("$UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY.");
451 warned = true;
452 }
453 }
454
455 if (!isempty(e)) {
456 r = parse_boolean(e);
457 if (r < 0)
458 return log_error_errno(r, "Failed to parse $%s: %m", var);
459 if (r > 0)
460 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
461 else
462 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
463 }
464
465 return 0;
466 }
467
468 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
469 int r;
470
471 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
472 * in the image actually supports. */
473 r = cg_all_unified();
474 if (r < 0)
475 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
476 if (r > 0) {
477 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
478 * routine only detects 231, so we'll have a false negative here for 230. */
479 r = systemd_installation_has_version(directory, 230);
480 if (r < 0)
481 return log_error_errno(r, "Failed to determine systemd version in container: %m");
482 if (r > 0)
483 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
484 else
485 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
486 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
487 /* Mixed cgroup hierarchy support was added in 233 */
488 r = systemd_installation_has_version(directory, 233);
489 if (r < 0)
490 return log_error_errno(r, "Failed to determine systemd version in container: %m");
491 if (r > 0)
492 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
493 else
494 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
495 } else
496 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
497
498 log_debug("Using %s hierarchy for container.",
499 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
500 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
501
502 return 0;
503 }
504
505 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
506 int r;
507
508 r = getenv_bool(name);
509 if (r == -ENXIO)
510 return 0;
511 if (r < 0)
512 return log_error_errno(r, "Failed to parse $%s: %m", name);
513
514 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
515 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
516 return 0;
517 }
518
519 static int parse_mount_settings_env(void) {
520 const char *e;
521 int r;
522
523 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
524 if (r < 0 && r != -ENXIO)
525 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
526 if (r >= 0)
527 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
528
529 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
530 if (streq_ptr(e, "network"))
531 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
532
533 else if (e) {
534 r = parse_boolean(e);
535 if (r < 0)
536 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
537
538 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
539 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
540 }
541
542 return 0;
543 }
544
545 static int parse_environment(void) {
546 const char *e;
547 int r;
548
549 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
550 if (r < 0)
551 return r;
552 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
553 if (r < 0)
554 return r;
555 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
556 if (r < 0)
557 return r;
558 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
559 if (r < 0)
560 return r;
561
562 r = parse_mount_settings_env();
563 if (r < 0)
564 return r;
565
566 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
567 * even if it is supported. If not supported, it has no effect. */
568 if (!cg_ns_supported())
569 arg_use_cgns = false;
570 else {
571 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
572 if (r < 0) {
573 if (r != -ENXIO)
574 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
575
576 arg_use_cgns = true;
577 } else {
578 arg_use_cgns = r > 0;
579 arg_settings_mask |= SETTING_USE_CGNS;
580 }
581 }
582
583 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
584 if (e)
585 arg_container_service_name = e;
586
587 return detect_unified_cgroup_hierarchy_from_environment();
588 }
589
590 static int parse_argv(int argc, char *argv[]) {
591 enum {
592 ARG_VERSION = 0x100,
593 ARG_PRIVATE_NETWORK,
594 ARG_UUID,
595 ARG_READ_ONLY,
596 ARG_CAPABILITY,
597 ARG_DROP_CAPABILITY,
598 ARG_LINK_JOURNAL,
599 ARG_BIND,
600 ARG_BIND_RO,
601 ARG_TMPFS,
602 ARG_OVERLAY,
603 ARG_OVERLAY_RO,
604 ARG_INACCESSIBLE,
605 ARG_SHARE_SYSTEM,
606 ARG_REGISTER,
607 ARG_KEEP_UNIT,
608 ARG_NETWORK_INTERFACE,
609 ARG_NETWORK_MACVLAN,
610 ARG_NETWORK_IPVLAN,
611 ARG_NETWORK_BRIDGE,
612 ARG_NETWORK_ZONE,
613 ARG_NETWORK_VETH_EXTRA,
614 ARG_NETWORK_NAMESPACE_PATH,
615 ARG_PERSONALITY,
616 ARG_VOLATILE,
617 ARG_TEMPLATE,
618 ARG_PROPERTY,
619 ARG_PRIVATE_USERS,
620 ARG_KILL_SIGNAL,
621 ARG_SETTINGS,
622 ARG_CHDIR,
623 ARG_PIVOT_ROOT,
624 ARG_PRIVATE_USERS_CHOWN,
625 ARG_NOTIFY_READY,
626 ARG_ROOT_HASH,
627 ARG_SYSTEM_CALL_FILTER,
628 ARG_RLIMIT,
629 ARG_HOSTNAME,
630 ARG_NO_NEW_PRIVILEGES,
631 ARG_OOM_SCORE_ADJUST,
632 ARG_CPU_AFFINITY,
633 ARG_RESOLV_CONF,
634 ARG_TIMEZONE,
635 ARG_CONSOLE,
636 ARG_PIPE,
637 ARG_OCI_BUNDLE,
638 ARG_NO_PAGER,
639 };
640
641 static const struct option options[] = {
642 { "help", no_argument, NULL, 'h' },
643 { "version", no_argument, NULL, ARG_VERSION },
644 { "directory", required_argument, NULL, 'D' },
645 { "template", required_argument, NULL, ARG_TEMPLATE },
646 { "ephemeral", no_argument, NULL, 'x' },
647 { "user", required_argument, NULL, 'u' },
648 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
649 { "as-pid2", no_argument, NULL, 'a' },
650 { "boot", no_argument, NULL, 'b' },
651 { "uuid", required_argument, NULL, ARG_UUID },
652 { "read-only", no_argument, NULL, ARG_READ_ONLY },
653 { "capability", required_argument, NULL, ARG_CAPABILITY },
654 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
655 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
656 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
657 { "bind", required_argument, NULL, ARG_BIND },
658 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
659 { "tmpfs", required_argument, NULL, ARG_TMPFS },
660 { "overlay", required_argument, NULL, ARG_OVERLAY },
661 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
662 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
663 { "machine", required_argument, NULL, 'M' },
664 { "hostname", required_argument, NULL, ARG_HOSTNAME },
665 { "slice", required_argument, NULL, 'S' },
666 { "setenv", required_argument, NULL, 'E' },
667 { "selinux-context", required_argument, NULL, 'Z' },
668 { "selinux-apifs-context", required_argument, NULL, 'L' },
669 { "quiet", no_argument, NULL, 'q' },
670 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
671 { "register", required_argument, NULL, ARG_REGISTER },
672 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
673 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
674 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
675 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
676 { "network-veth", no_argument, NULL, 'n' },
677 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
678 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
679 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
680 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
681 { "personality", required_argument, NULL, ARG_PERSONALITY },
682 { "image", required_argument, NULL, 'i' },
683 { "volatile", optional_argument, NULL, ARG_VOLATILE },
684 { "port", required_argument, NULL, 'p' },
685 { "property", required_argument, NULL, ARG_PROPERTY },
686 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
687 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
688 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
689 { "settings", required_argument, NULL, ARG_SETTINGS },
690 { "chdir", required_argument, NULL, ARG_CHDIR },
691 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
692 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
693 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
694 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
695 { "rlimit", required_argument, NULL, ARG_RLIMIT },
696 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
697 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
698 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
699 { "timezone", required_argument, NULL, ARG_TIMEZONE },
700 { "console", required_argument, NULL, ARG_CONSOLE },
701 { "pipe", no_argument, NULL, ARG_PIPE },
702 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
703 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
704 {}
705 };
706
707 int c, r;
708 const char *p;
709 uint64_t plus = 0, minus = 0;
710 bool mask_all_settings = false, mask_no_settings = false;
711
712 assert(argc >= 0);
713 assert(argv);
714
715 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
716 switch (c) {
717
718 case 'h':
719 return help();
720
721 case ARG_VERSION:
722 return version();
723
724 case 'D':
725 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
726 if (r < 0)
727 return r;
728
729 arg_settings_mask |= SETTING_DIRECTORY;
730 break;
731
732 case ARG_TEMPLATE:
733 r = parse_path_argument_and_warn(optarg, false, &arg_template);
734 if (r < 0)
735 return r;
736
737 arg_settings_mask |= SETTING_DIRECTORY;
738 break;
739
740 case 'i':
741 r = parse_path_argument_and_warn(optarg, false, &arg_image);
742 if (r < 0)
743 return r;
744
745 arg_settings_mask |= SETTING_DIRECTORY;
746 break;
747
748 case ARG_OCI_BUNDLE:
749 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
750 if (r < 0)
751 return r;
752
753 break;
754
755 case 'x':
756 arg_ephemeral = true;
757 arg_settings_mask |= SETTING_EPHEMERAL;
758 break;
759
760 case 'u':
761 r = free_and_strdup(&arg_user, optarg);
762 if (r < 0)
763 return log_oom();
764
765 arg_settings_mask |= SETTING_USER;
766 break;
767
768 case ARG_NETWORK_ZONE: {
769 char *j;
770
771 j = strjoin("vz-", optarg);
772 if (!j)
773 return log_oom();
774
775 if (!ifname_valid(j)) {
776 log_error("Network zone name not valid: %s", j);
777 free(j);
778 return -EINVAL;
779 }
780
781 free_and_replace(arg_network_zone, j);
782
783 arg_network_veth = true;
784 arg_private_network = true;
785 arg_settings_mask |= SETTING_NETWORK;
786 break;
787 }
788
789 case ARG_NETWORK_BRIDGE:
790
791 if (!ifname_valid(optarg))
792 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
793 "Bridge interface name not valid: %s", optarg);
794
795 r = free_and_strdup(&arg_network_bridge, optarg);
796 if (r < 0)
797 return log_oom();
798
799 _fallthrough_;
800 case 'n':
801 arg_network_veth = true;
802 arg_private_network = true;
803 arg_settings_mask |= SETTING_NETWORK;
804 break;
805
806 case ARG_NETWORK_VETH_EXTRA:
807 r = veth_extra_parse(&arg_network_veth_extra, optarg);
808 if (r < 0)
809 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
810
811 arg_private_network = true;
812 arg_settings_mask |= SETTING_NETWORK;
813 break;
814
815 case ARG_NETWORK_INTERFACE:
816 if (!ifname_valid(optarg))
817 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
818 "Network interface name not valid: %s", optarg);
819
820 if (strv_extend(&arg_network_interfaces, optarg) < 0)
821 return log_oom();
822
823 arg_private_network = true;
824 arg_settings_mask |= SETTING_NETWORK;
825 break;
826
827 case ARG_NETWORK_MACVLAN:
828
829 if (!ifname_valid(optarg))
830 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
831 "MACVLAN network interface name not valid: %s", optarg);
832
833 if (strv_extend(&arg_network_macvlan, optarg) < 0)
834 return log_oom();
835
836 arg_private_network = true;
837 arg_settings_mask |= SETTING_NETWORK;
838 break;
839
840 case ARG_NETWORK_IPVLAN:
841
842 if (!ifname_valid(optarg))
843 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
844 "IPVLAN network interface name not valid: %s", optarg);
845
846 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
847 return log_oom();
848
849 _fallthrough_;
850 case ARG_PRIVATE_NETWORK:
851 arg_private_network = true;
852 arg_settings_mask |= SETTING_NETWORK;
853 break;
854
855 case ARG_NETWORK_NAMESPACE_PATH:
856 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
857 if (r < 0)
858 return r;
859
860 arg_settings_mask |= SETTING_NETWORK;
861 break;
862
863 case 'b':
864 if (arg_start_mode == START_PID2)
865 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
866 "--boot and --as-pid2 may not be combined.");
867
868 arg_start_mode = START_BOOT;
869 arg_settings_mask |= SETTING_START_MODE;
870 break;
871
872 case 'a':
873 if (arg_start_mode == START_BOOT)
874 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
875 "--boot and --as-pid2 may not be combined.");
876
877 arg_start_mode = START_PID2;
878 arg_settings_mask |= SETTING_START_MODE;
879 break;
880
881 case ARG_UUID:
882 r = sd_id128_from_string(optarg, &arg_uuid);
883 if (r < 0)
884 return log_error_errno(r, "Invalid UUID: %s", optarg);
885
886 if (sd_id128_is_null(arg_uuid))
887 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
888 "Machine UUID may not be all zeroes.");
889
890 arg_settings_mask |= SETTING_MACHINE_ID;
891 break;
892
893 case 'S': {
894 _cleanup_free_ char *mangled = NULL;
895
896 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
897 if (r < 0)
898 return log_oom();
899
900 free_and_replace(arg_slice, mangled);
901 arg_settings_mask |= SETTING_SLICE;
902 break;
903 }
904
905 case 'M':
906 if (isempty(optarg))
907 arg_machine = mfree(arg_machine);
908 else {
909 if (!machine_name_is_valid(optarg))
910 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
911 "Invalid machine name: %s", optarg);
912
913 r = free_and_strdup(&arg_machine, optarg);
914 if (r < 0)
915 return log_oom();
916 }
917 break;
918
919 case ARG_HOSTNAME:
920 if (isempty(optarg))
921 arg_hostname = mfree(arg_hostname);
922 else {
923 if (!hostname_is_valid(optarg, false))
924 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
925 "Invalid hostname: %s", optarg);
926
927 r = free_and_strdup(&arg_hostname, optarg);
928 if (r < 0)
929 return log_oom();
930 }
931
932 arg_settings_mask |= SETTING_HOSTNAME;
933 break;
934
935 case 'Z':
936 arg_selinux_context = optarg;
937 break;
938
939 case 'L':
940 arg_selinux_apifs_context = optarg;
941 break;
942
943 case ARG_READ_ONLY:
944 arg_read_only = true;
945 arg_settings_mask |= SETTING_READ_ONLY;
946 break;
947
948 case ARG_CAPABILITY:
949 case ARG_DROP_CAPABILITY: {
950 p = optarg;
951 for (;;) {
952 _cleanup_free_ char *t = NULL;
953
954 r = extract_first_word(&p, &t, ",", 0);
955 if (r < 0)
956 return log_error_errno(r, "Failed to parse capability %s.", t);
957 if (r == 0)
958 break;
959
960 if (streq(t, "all")) {
961 if (c == ARG_CAPABILITY)
962 plus = (uint64_t) -1;
963 else
964 minus = (uint64_t) -1;
965 } else {
966 r = capability_from_name(t);
967 if (r < 0)
968 return log_error_errno(r, "Failed to parse capability %s.", t);
969
970 if (c == ARG_CAPABILITY)
971 plus |= 1ULL << r;
972 else
973 minus |= 1ULL << r;
974 }
975 }
976
977 arg_settings_mask |= SETTING_CAPABILITY;
978 break;
979 }
980
981 case ARG_NO_NEW_PRIVILEGES:
982 r = parse_boolean(optarg);
983 if (r < 0)
984 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
985
986 arg_no_new_privileges = r;
987 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
988 break;
989
990 case 'j':
991 arg_link_journal = LINK_GUEST;
992 arg_link_journal_try = true;
993 arg_settings_mask |= SETTING_LINK_JOURNAL;
994 break;
995
996 case ARG_LINK_JOURNAL:
997 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
998 if (r < 0)
999 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1000
1001 arg_settings_mask |= SETTING_LINK_JOURNAL;
1002 break;
1003
1004 case ARG_BIND:
1005 case ARG_BIND_RO:
1006 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1007 if (r < 0)
1008 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1009
1010 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1011 break;
1012
1013 case ARG_TMPFS:
1014 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1015 if (r < 0)
1016 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1017
1018 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1019 break;
1020
1021 case ARG_OVERLAY:
1022 case ARG_OVERLAY_RO:
1023 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1024 if (r == -EADDRNOTAVAIL)
1025 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1026 if (r < 0)
1027 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1028
1029 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1030 break;
1031
1032 case ARG_INACCESSIBLE:
1033 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1034 if (r < 0)
1035 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1036
1037 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1038 break;
1039
1040 case 'E': {
1041 char **n;
1042
1043 if (!env_assignment_is_valid(optarg))
1044 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1045 "Environment variable assignment '%s' is not valid.", optarg);
1046
1047 n = strv_env_set(arg_setenv, optarg);
1048 if (!n)
1049 return log_oom();
1050
1051 strv_free_and_replace(arg_setenv, n);
1052 arg_settings_mask |= SETTING_ENVIRONMENT;
1053 break;
1054 }
1055
1056 case 'q':
1057 arg_quiet = true;
1058 break;
1059
1060 case ARG_SHARE_SYSTEM:
1061 /* We don't officially support this anymore, except for compat reasons. People should use the
1062 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1063 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1064 arg_clone_ns_flags = 0;
1065 break;
1066
1067 case ARG_REGISTER:
1068 r = parse_boolean(optarg);
1069 if (r < 0) {
1070 log_error("Failed to parse --register= argument: %s", optarg);
1071 return r;
1072 }
1073
1074 arg_register = r;
1075 break;
1076
1077 case ARG_KEEP_UNIT:
1078 arg_keep_unit = true;
1079 break;
1080
1081 case ARG_PERSONALITY:
1082
1083 arg_personality = personality_from_string(optarg);
1084 if (arg_personality == PERSONALITY_INVALID)
1085 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1086 "Unknown or unsupported personality '%s'.", optarg);
1087
1088 arg_settings_mask |= SETTING_PERSONALITY;
1089 break;
1090
1091 case ARG_VOLATILE:
1092
1093 if (!optarg)
1094 arg_volatile_mode = VOLATILE_YES;
1095 else if (streq(optarg, "help")) {
1096 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1097 return 0;
1098 } else {
1099 VolatileMode m;
1100
1101 m = volatile_mode_from_string(optarg);
1102 if (m < 0)
1103 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1104 "Failed to parse --volatile= argument: %s", optarg);
1105 else
1106 arg_volatile_mode = m;
1107 }
1108
1109 arg_settings_mask |= SETTING_VOLATILE_MODE;
1110 break;
1111
1112 case 'p':
1113 r = expose_port_parse(&arg_expose_ports, optarg);
1114 if (r == -EEXIST)
1115 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1116 if (r < 0)
1117 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1118
1119 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1120 break;
1121
1122 case ARG_PROPERTY:
1123 if (strv_extend(&arg_property, optarg) < 0)
1124 return log_oom();
1125
1126 break;
1127
1128 case ARG_PRIVATE_USERS: {
1129 int boolean = -1;
1130
1131 if (!optarg)
1132 boolean = true;
1133 else if (!in_charset(optarg, DIGITS))
1134 /* do *not* parse numbers as booleans */
1135 boolean = parse_boolean(optarg);
1136
1137 if (boolean == false) {
1138 /* no: User namespacing off */
1139 arg_userns_mode = USER_NAMESPACE_NO;
1140 arg_uid_shift = UID_INVALID;
1141 arg_uid_range = UINT32_C(0x10000);
1142 } else if (boolean == true) {
1143 /* yes: User namespacing on, UID range is read from root dir */
1144 arg_userns_mode = USER_NAMESPACE_FIXED;
1145 arg_uid_shift = UID_INVALID;
1146 arg_uid_range = UINT32_C(0x10000);
1147 } else if (streq(optarg, "pick")) {
1148 /* pick: User namespacing on, UID range is picked randomly */
1149 arg_userns_mode = USER_NAMESPACE_PICK;
1150 arg_uid_shift = UID_INVALID;
1151 arg_uid_range = UINT32_C(0x10000);
1152 } else {
1153 _cleanup_free_ char *buffer = NULL;
1154 const char *range, *shift;
1155
1156 /* anything else: User namespacing on, UID range is explicitly configured */
1157
1158 range = strchr(optarg, ':');
1159 if (range) {
1160 buffer = strndup(optarg, range - optarg);
1161 if (!buffer)
1162 return log_oom();
1163 shift = buffer;
1164
1165 range++;
1166 r = safe_atou32(range, &arg_uid_range);
1167 if (r < 0)
1168 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1169 } else
1170 shift = optarg;
1171
1172 r = parse_uid(shift, &arg_uid_shift);
1173 if (r < 0)
1174 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1175
1176 arg_userns_mode = USER_NAMESPACE_FIXED;
1177 }
1178
1179 if (arg_uid_range <= 0)
1180 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1181 "UID range cannot be 0.");
1182
1183 arg_settings_mask |= SETTING_USERNS;
1184 break;
1185 }
1186
1187 case 'U':
1188 if (userns_supported()) {
1189 arg_userns_mode = USER_NAMESPACE_PICK;
1190 arg_uid_shift = UID_INVALID;
1191 arg_uid_range = UINT32_C(0x10000);
1192
1193 arg_settings_mask |= SETTING_USERNS;
1194 }
1195
1196 break;
1197
1198 case ARG_PRIVATE_USERS_CHOWN:
1199 arg_userns_chown = true;
1200
1201 arg_settings_mask |= SETTING_USERNS;
1202 break;
1203
1204 case ARG_KILL_SIGNAL:
1205 if (streq(optarg, "help")) {
1206 DUMP_STRING_TABLE(signal, int, _NSIG);
1207 return 0;
1208 }
1209
1210 arg_kill_signal = signal_from_string(optarg);
1211 if (arg_kill_signal < 0)
1212 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1213 "Cannot parse signal: %s", optarg);
1214
1215 arg_settings_mask |= SETTING_KILL_SIGNAL;
1216 break;
1217
1218 case ARG_SETTINGS:
1219
1220 /* no → do not read files
1221 * yes → read files, do not override cmdline, trust only subset
1222 * override → read files, override cmdline, trust only subset
1223 * trusted → read files, do not override cmdline, trust all
1224 */
1225
1226 r = parse_boolean(optarg);
1227 if (r < 0) {
1228 if (streq(optarg, "trusted")) {
1229 mask_all_settings = false;
1230 mask_no_settings = false;
1231 arg_settings_trusted = true;
1232
1233 } else if (streq(optarg, "override")) {
1234 mask_all_settings = false;
1235 mask_no_settings = true;
1236 arg_settings_trusted = -1;
1237 } else
1238 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1239 } else if (r > 0) {
1240 /* yes */
1241 mask_all_settings = false;
1242 mask_no_settings = false;
1243 arg_settings_trusted = -1;
1244 } else {
1245 /* no */
1246 mask_all_settings = true;
1247 mask_no_settings = false;
1248 arg_settings_trusted = false;
1249 }
1250
1251 break;
1252
1253 case ARG_CHDIR:
1254 if (!path_is_absolute(optarg))
1255 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1256 "Working directory %s is not an absolute path.", optarg);
1257
1258 r = free_and_strdup(&arg_chdir, optarg);
1259 if (r < 0)
1260 return log_oom();
1261
1262 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1263 break;
1264
1265 case ARG_PIVOT_ROOT:
1266 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1267 if (r < 0)
1268 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1269
1270 arg_settings_mask |= SETTING_PIVOT_ROOT;
1271 break;
1272
1273 case ARG_NOTIFY_READY:
1274 r = parse_boolean(optarg);
1275 if (r < 0)
1276 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1277 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1278 arg_notify_ready = r;
1279 arg_settings_mask |= SETTING_NOTIFY_READY;
1280 break;
1281
1282 case ARG_ROOT_HASH: {
1283 void *k;
1284 size_t l;
1285
1286 r = unhexmem(optarg, strlen(optarg), &k, &l);
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1289 if (l < sizeof(sd_id128_t)) {
1290 free(k);
1291 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1292 }
1293
1294 free(arg_root_hash);
1295 arg_root_hash = k;
1296 arg_root_hash_size = l;
1297 break;
1298 }
1299
1300 case ARG_SYSTEM_CALL_FILTER: {
1301 bool negative;
1302 const char *items;
1303
1304 negative = optarg[0] == '~';
1305 items = negative ? optarg + 1 : optarg;
1306
1307 for (;;) {
1308 _cleanup_free_ char *word = NULL;
1309
1310 r = extract_first_word(&items, &word, NULL, 0);
1311 if (r == 0)
1312 break;
1313 if (r == -ENOMEM)
1314 return log_oom();
1315 if (r < 0)
1316 return log_error_errno(r, "Failed to parse system call filter: %m");
1317
1318 if (negative)
1319 r = strv_extend(&arg_syscall_blacklist, word);
1320 else
1321 r = strv_extend(&arg_syscall_whitelist, word);
1322 if (r < 0)
1323 return log_oom();
1324 }
1325
1326 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1327 break;
1328 }
1329
1330 case ARG_RLIMIT: {
1331 const char *eq;
1332 _cleanup_free_ char *name = NULL;
1333 int rl;
1334
1335 if (streq(optarg, "help")) {
1336 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1337 return 0;
1338 }
1339
1340 eq = strchr(optarg, '=');
1341 if (!eq)
1342 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1343 "--rlimit= expects an '=' assignment.");
1344
1345 name = strndup(optarg, eq - optarg);
1346 if (!name)
1347 return log_oom();
1348
1349 rl = rlimit_from_string_harder(name);
1350 if (rl < 0)
1351 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1352 "Unknown resource limit: %s", name);
1353
1354 if (!arg_rlimit[rl]) {
1355 arg_rlimit[rl] = new0(struct rlimit, 1);
1356 if (!arg_rlimit[rl])
1357 return log_oom();
1358 }
1359
1360 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1363
1364 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1365 break;
1366 }
1367
1368 case ARG_OOM_SCORE_ADJUST:
1369 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1370 if (r < 0)
1371 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1372
1373 arg_oom_score_adjust_set = true;
1374 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1375 break;
1376
1377 case ARG_CPU_AFFINITY: {
1378 CPUSet cpuset;
1379
1380 r = parse_cpu_set(optarg, &cpuset);
1381 if (r < 0)
1382 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1383
1384 cpu_set_reset(&arg_cpu_set);
1385 arg_cpu_set = cpuset;
1386 arg_settings_mask |= SETTING_CPU_AFFINITY;
1387 break;
1388 }
1389
1390 case ARG_RESOLV_CONF:
1391 if (streq(optarg, "help")) {
1392 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1393 return 0;
1394 }
1395
1396 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1397 if (arg_resolv_conf < 0)
1398 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1399 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1400
1401 arg_settings_mask |= SETTING_RESOLV_CONF;
1402 break;
1403
1404 case ARG_TIMEZONE:
1405 if (streq(optarg, "help")) {
1406 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1407 return 0;
1408 }
1409
1410 arg_timezone = timezone_mode_from_string(optarg);
1411 if (arg_timezone < 0)
1412 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1413 "Failed to parse /etc/localtime mode: %s", optarg);
1414
1415 arg_settings_mask |= SETTING_TIMEZONE;
1416 break;
1417
1418 case ARG_CONSOLE:
1419 r = handle_arg_console(optarg);
1420 if (r <= 0)
1421 return r;
1422 break;
1423
1424 case 'P':
1425 case ARG_PIPE:
1426 r = handle_arg_console("pipe");
1427 if (r <= 0)
1428 return r;
1429 break;
1430
1431 case ARG_NO_PAGER:
1432 arg_pager_flags |= PAGER_DISABLE;
1433 break;
1434
1435 case '?':
1436 return -EINVAL;
1437
1438 default:
1439 assert_not_reached("Unhandled option");
1440 }
1441
1442 if (argc > optind) {
1443 strv_free(arg_parameters);
1444 arg_parameters = strv_copy(argv + optind);
1445 if (!arg_parameters)
1446 return log_oom();
1447
1448 arg_settings_mask |= SETTING_START_MODE;
1449 }
1450
1451 if (arg_ephemeral && arg_template && !arg_directory)
1452 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1453 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1454 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1455 * --directory=". */
1456 arg_directory = TAKE_PTR(arg_template);
1457
1458 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1459
1460 /* Make sure to parse environment before we reset the settings mask below */
1461 r = parse_environment();
1462 if (r < 0)
1463 return r;
1464
1465 /* Load all settings from .nspawn files */
1466 if (mask_no_settings)
1467 arg_settings_mask = 0;
1468
1469 /* Don't load any settings from .nspawn files */
1470 if (mask_all_settings)
1471 arg_settings_mask = _SETTINGS_MASK_ALL;
1472
1473 return 1;
1474 }
1475
1476 static int verify_arguments(void) {
1477 int r;
1478
1479 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1480 /* If we are running the stub init in the container, we don't need to look at what the init
1481 * in the container supports, because we are not using it. Let's immediately pick the right
1482 * setting based on the host system configuration.
1483 *
1484 * We only do this, if the user didn't use an environment variable to override the detection.
1485 */
1486
1487 r = cg_all_unified();
1488 if (r < 0)
1489 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1490 if (r > 0)
1491 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1492 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1493 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1494 else
1495 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1496 }
1497
1498 if (arg_userns_mode != USER_NAMESPACE_NO)
1499 arg_mount_settings |= MOUNT_USE_USERNS;
1500
1501 if (arg_private_network)
1502 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1503
1504 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1505 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1506 arg_register = false;
1507 if (arg_start_mode != START_PID1)
1508 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1509 }
1510
1511 if (arg_userns_mode == USER_NAMESPACE_PICK)
1512 arg_userns_chown = true;
1513
1514 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1515 arg_kill_signal = SIGRTMIN+3;
1516
1517 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1518 arg_read_only = true;
1519
1520 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1521 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1522 * The latter is not technically a user session, but we don't need to labour the point. */
1523 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1524
1525 if (arg_directory && arg_image)
1526 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1527
1528 if (arg_template && arg_image)
1529 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1530
1531 if (arg_template && !(arg_directory || arg_machine))
1532 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1533
1534 if (arg_ephemeral && arg_template)
1535 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1536
1537 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1538 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1539
1540 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1541 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1542
1543 if (arg_userns_chown && arg_read_only)
1544 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1545 "--read-only and --private-users-chown may not be combined.");
1546
1547 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1548 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1549 * copy-up (in case of overlay) making the entire exercise pointless. */
1550 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1551 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1552
1553 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1554 * to avoid conflicts between different network options. */
1555 if (arg_network_namespace_path &&
1556 (arg_network_interfaces || arg_network_macvlan ||
1557 arg_network_ipvlan || arg_network_veth_extra ||
1558 arg_network_bridge || arg_network_zone ||
1559 arg_network_veth || arg_private_network))
1560 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1561
1562 if (arg_network_bridge && arg_network_zone)
1563 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1564 "--network-bridge= and --network-zone= may not be combined.");
1565
1566 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1567 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1568
1569 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1570 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1571
1572 if (arg_expose_ports && !arg_private_network)
1573 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1574
1575 #if ! HAVE_LIBIPTC
1576 if (arg_expose_ports)
1577 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1578 #endif
1579
1580 r = custom_mount_check_all();
1581 if (r < 0)
1582 return r;
1583
1584 return 0;
1585 }
1586
1587 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1588 assert(p);
1589
1590 if (arg_userns_mode == USER_NAMESPACE_NO)
1591 return 0;
1592
1593 if (uid == UID_INVALID && gid == GID_INVALID)
1594 return 0;
1595
1596 if (uid != UID_INVALID) {
1597 uid += arg_uid_shift;
1598
1599 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1600 return -EOVERFLOW;
1601 }
1602
1603 if (gid != GID_INVALID) {
1604 gid += (gid_t) arg_uid_shift;
1605
1606 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1607 return -EOVERFLOW;
1608 }
1609
1610 if (lchown(p, uid, gid) < 0)
1611 return -errno;
1612
1613 return 0;
1614 }
1615
/* Creates directory 'path' below 'root' with the given mode, and hands a newly created directory to
 * userns_lchown() with the given UID/GID.
 *
 * A directory that already exists is not an error and is left untouched (no chown is attempted); 0 is
 * returned in that case. Any other mkdir failure is returned as negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k >= 0)
                return userns_lchown(full, uid, gid);

        return k == -EEXIST ? 0 : k;
}
1629
/* Matches 'path' against the two zoneinfo directory prefixes (relative and absolute form) and returns
 * whatever PATH_STARTSWITH_SET() yields for it. Callers in this file treat the result as the timezone
 * name following the prefix, and NULL as "does not point into the zoneinfo directory". */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1636
1637 static bool etc_writable(void) {
1638 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1639 }
1640
1641 static int setup_timezone(const char *dest) {
1642 _cleanup_free_ char *p = NULL, *etc = NULL;
1643 const char *where, *check;
1644 TimezoneMode m;
1645 int r;
1646
1647 assert(dest);
1648
1649 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1650 r = readlink_malloc("/etc/localtime", &p);
1651 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1652 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1653 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1654 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1655 else if (r < 0) {
1656 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1657 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1658 * file.
1659 *
1660 * Example:
1661 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1662 */
1663 return 0;
1664 } else if (arg_timezone == TIMEZONE_AUTO)
1665 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1666 else
1667 m = arg_timezone;
1668 } else
1669 m = arg_timezone;
1670
1671 if (m == TIMEZONE_OFF)
1672 return 0;
1673
1674 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1675 if (r < 0) {
1676 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1677 return 0;
1678 }
1679
1680 where = strjoina(etc, "/localtime");
1681
1682 switch (m) {
1683
1684 case TIMEZONE_DELETE:
1685 if (unlink(where) < 0)
1686 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1687
1688 return 0;
1689
1690 case TIMEZONE_SYMLINK: {
1691 _cleanup_free_ char *q = NULL;
1692 const char *z, *what;
1693
1694 z = timezone_from_path(p);
1695 if (!z) {
1696 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1697 return 0;
1698 }
1699
1700 r = readlink_malloc(where, &q);
1701 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1702 return 0; /* Already pointing to the right place? Then do nothing .. */
1703
1704 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1705 r = chase_symlinks(check, dest, 0, NULL, NULL);
1706 if (r < 0)
1707 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1708 else {
1709 if (unlink(where) < 0 && errno != ENOENT) {
1710 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1711 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1712 return 0;
1713 }
1714
1715 what = strjoina("../usr/share/zoneinfo/", z);
1716 if (symlink(what, where) < 0) {
1717 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1718 errno, "Failed to correct timezone of container, ignoring: %m");
1719 return 0;
1720 }
1721
1722 break;
1723 }
1724
1725 _fallthrough_;
1726 }
1727
1728 case TIMEZONE_BIND: {
1729 _cleanup_free_ char *resolved = NULL;
1730 int found;
1731
1732 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1733 if (found < 0) {
1734 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1735 return 0;
1736 }
1737
1738 if (found == 0) /* missing? */
1739 (void) touch(resolved);
1740
1741 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1742 if (r >= 0)
1743 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1744
1745 _fallthrough_;
1746 }
1747
1748 case TIMEZONE_COPY:
1749 /* If mounting failed, try to copy */
1750 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1751 if (r < 0) {
1752 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1753 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1754 return 0;
1755 }
1756
1757 break;
1758
1759 default:
1760 assert_not_reached("unexpected mode");
1761 }
1762
1763 /* Fix permissions of the symlink or file copy we just created */
1764 r = userns_lchown(where, 0, 0);
1765 if (r < 0)
1766 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1767
1768 return 0;
1769 }
1770
/* Checks via access(F_OK) whether 'path' exists.
 *
 * Returns 1 if it does, 0 if access() fails with ENOENT, and a negative errno-style value (logged at
 * debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1783
1784 static int resolved_listening(void) {
1785 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1786 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1787 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1788 int r;
1789
1790 /* Check if resolved is listening */
1791
1792 r = sd_bus_open_system(&bus);
1793 if (r < 0)
1794 return log_debug_errno(r, "Failed to open system bus: %m");
1795
1796 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1797 if (r < 0)
1798 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1799 if (r == 0)
1800 return 0;
1801
1802 r = sd_bus_get_property_string(bus,
1803 "org.freedesktop.resolve1",
1804 "/org/freedesktop/resolve1",
1805 "org.freedesktop.resolve1.Manager",
1806 "DNSStubListener",
1807 &error,
1808 &dns_stub_listener_mode);
1809 if (r < 0)
1810 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1811
1812 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1813 }
1814
1815 static int setup_resolv_conf(const char *dest) {
1816 _cleanup_free_ char *etc = NULL;
1817 const char *where, *what;
1818 ResolvConfMode m;
1819 int r;
1820
1821 assert(dest);
1822
1823 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1824 if (arg_private_network)
1825 m = RESOLV_CONF_OFF;
1826 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1827 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
1828 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1829 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
1830 else
1831 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
1832 } else
1833 m = arg_resolv_conf;
1834
1835 if (m == RESOLV_CONF_OFF)
1836 return 0;
1837
1838 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1839 if (r < 0) {
1840 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1841 return 0;
1842 }
1843
1844 where = strjoina(etc, "/resolv.conf");
1845
1846 if (m == RESOLV_CONF_DELETE) {
1847 if (unlink(where) < 0)
1848 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1849
1850 return 0;
1851 }
1852
1853 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1854 what = STATIC_RESOLV_CONF;
1855 else
1856 what = "/etc/resolv.conf";
1857
1858 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1859 _cleanup_free_ char *resolved = NULL;
1860 int found;
1861
1862 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1863 if (found < 0) {
1864 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1865 return 0;
1866 }
1867
1868 if (found == 0) /* missing? */
1869 (void) touch(resolved);
1870
1871 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1872 if (r >= 0)
1873 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1874 }
1875
1876 /* If that didn't work, let's copy the file */
1877 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
1878 if (r < 0) {
1879 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1880 * resolved or something similar runs inside and the symlink points there.
1881 *
1882 * If the disk image is read-only, there's also no point in complaining.
1883 */
1884 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1885 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1886 return 0;
1887 }
1888
1889 r = userns_lchown(where, 0, 0);
1890 if (r < 0)
1891 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1892
1893 return 0;
1894 }
1895
1896 static int setup_boot_id(void) {
1897 _cleanup_(unlink_and_freep) char *from = NULL;
1898 _cleanup_free_ char *path = NULL;
1899 sd_id128_t rnd = SD_ID128_NULL;
1900 const char *to;
1901 int r;
1902
1903 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
1904
1905 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
1906 if (r < 0)
1907 return log_error_errno(r, "Failed to generate random boot ID path: %m");
1908
1909 r = sd_id128_randomize(&rnd);
1910 if (r < 0)
1911 return log_error_errno(r, "Failed to generate random boot id: %m");
1912
1913 r = id128_write(path, ID128_UUID, rnd, false);
1914 if (r < 0)
1915 return log_error_errno(r, "Failed to write boot id: %m");
1916
1917 from = TAKE_PTR(path);
1918 to = "/proc/sys/kernel/random/boot_id";
1919
1920 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1921 if (r < 0)
1922 return r;
1923
1924 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1925 }
1926
1927 static int copy_devnodes(const char *dest) {
1928 static const char devnodes[] =
1929 "null\0"
1930 "zero\0"
1931 "full\0"
1932 "random\0"
1933 "urandom\0"
1934 "tty\0"
1935 "net/tun\0";
1936
1937 _cleanup_umask_ mode_t u;
1938 const char *d;
1939 int r = 0;
1940
1941 assert(dest);
1942
1943 u = umask(0000);
1944
1945 /* Create /dev/net, so that we can create /dev/net/tun in it */
1946 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1947 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1948
1949 NULSTR_FOREACH(d, devnodes) {
1950 _cleanup_free_ char *from = NULL, *to = NULL;
1951 struct stat st;
1952
1953 from = path_join("/dev/", d);
1954 if (!from)
1955 return log_oom();
1956
1957 to = path_join(dest, from);
1958 if (!to)
1959 return log_oom();
1960
1961 if (stat(from, &st) < 0) {
1962
1963 if (errno != ENOENT)
1964 return log_error_errno(errno, "Failed to stat %s: %m", from);
1965
1966 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1967 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1968 "%s is not a char or block device, cannot copy.", from);
1969 else {
1970 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1971
1972 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1973 /* Explicitly warn the user when /dev is already populated. */
1974 if (errno == EEXIST)
1975 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1976 if (errno != EPERM)
1977 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1978
1979 /* Some systems abusively restrict mknod but allow bind mounts. */
1980 r = touch(to);
1981 if (r < 0)
1982 return log_error_errno(r, "touch (%s) failed: %m", to);
1983 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1984 if (r < 0)
1985 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1986 }
1987
1988 r = userns_lchown(to, 0, 0);
1989 if (r < 0)
1990 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1991
1992 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
1993 if (!dn)
1994 return log_oom();
1995
1996 r = userns_mkdir(dest, dn, 0755, 0, 0);
1997 if (r < 0)
1998 return log_error_errno(r, "Failed to create '%s': %m", dn);
1999
2000 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2001 return log_oom();
2002
2003 prefixed = path_join(dest, sl);
2004 if (!prefixed)
2005 return log_oom();
2006
2007 t = path_join("..", d);
2008 if (!t)
2009 return log_oom();
2010
2011 if (symlink(t, prefixed) < 0)
2012 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2013 }
2014 }
2015
2016 return r;
2017 }
2018
2019 static int make_extra_nodes(const char *dest) {
2020 _cleanup_umask_ mode_t u;
2021 size_t i;
2022 int r;
2023
2024 u = umask(0000);
2025
2026 for (i = 0; i < arg_n_extra_nodes; i++) {
2027 _cleanup_free_ char *path = NULL;
2028 DeviceNode *n = arg_extra_nodes + i;
2029
2030 path = path_join(dest, n->path);
2031 if (!path)
2032 return log_oom();
2033
2034 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2035 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2036
2037 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2040 }
2041
2042 return 0;
2043 }
2044
2045 static int setup_pts(const char *dest) {
2046 _cleanup_free_ char *options = NULL;
2047 const char *p;
2048 int r;
2049
2050 #if HAVE_SELINUX
2051 if (arg_selinux_apifs_context)
2052 (void) asprintf(&options,
2053 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2054 arg_uid_shift + TTY_GID,
2055 arg_selinux_apifs_context);
2056 else
2057 #endif
2058 (void) asprintf(&options,
2059 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2060 arg_uid_shift + TTY_GID);
2061
2062 if (!options)
2063 return log_oom();
2064
2065 /* Mount /dev/pts itself */
2066 p = prefix_roota(dest, "/dev/pts");
2067 r = mkdir_errno_wrapper(p, 0755);
2068 if (r < 0)
2069 return log_error_errno(r, "Failed to create /dev/pts: %m");
2070
2071 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2072 if (r < 0)
2073 return r;
2074 r = userns_lchown(p, 0, 0);
2075 if (r < 0)
2076 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2077
2078 /* Create /dev/ptmx symlink */
2079 p = prefix_roota(dest, "/dev/ptmx");
2080 if (symlink("pts/ptmx", p) < 0)
2081 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2082 r = userns_lchown(p, 0, 0);
2083 if (r < 0)
2084 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2085
2086 /* And fix /dev/pts/ptmx ownership */
2087 p = prefix_roota(dest, "/dev/pts/ptmx");
2088 r = userns_lchown(p, 0, 0);
2089 if (r < 0)
2090 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2091
2092 return 0;
2093 }
2094
/* Opens /dev/console and makes it stdin, stdout and stderr of the calling process. Before doing so the
 * logging console is duplicated, so that log output keeps going to the original stderr.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_stdio_as_dev_console(void) {
        int terminal;
        int r;

        terminal = open_terminal("/dev/console", O_RDWR);
        if (terminal < 0)
                return log_error_errno(terminal, "Failed to open console: %m");

        /* Make sure we can continue logging to the original stderr, even if
         * stderr points elsewhere now */
        r = log_dup_console();
        if (r < 0) {
                /* Nobody has taken over 'terminal' yet, hence close it ourselves instead of leaking it */
                terminal = safe_close(terminal);
                return log_error_errno(r, "Failed to duplicate stderr: %m");
        }

        /* invalidates 'terminal' on success and failure */
        r = rearrange_stdio(terminal, terminal, terminal);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
2116
2117 static int setup_dev_console(const char *console) {
2118 _cleanup_free_ char *p = NULL;
2119 int r;
2120
2121 /* Create /dev/console symlink */
2122 r = path_make_relative("/dev", console, &p);
2123 if (r < 0)
2124 return log_error_errno(r, "Failed to create relative path: %m");
2125
2126 if (symlink(p, "/dev/console") < 0)
2127 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2128
2129 return 0;
2130 }
2131
2132 static int setup_keyring(void) {
2133 key_serial_t keyring;
2134
2135 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2136 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2137 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2138 * these system calls let's make sure we don't leak anything into the container. */
2139
2140 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2141 if (keyring == -1) {
2142 if (errno == ENOSYS)
2143 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2144 else if (IN_SET(errno, EACCES, EPERM))
2145 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2146 else
2147 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2148 }
2149
2150 return 0;
2151 }
2152
2153 static int setup_kmsg(int kmsg_socket) {
2154 _cleanup_(unlink_and_freep) char *from = NULL;
2155 _cleanup_free_ char *fifo = NULL;
2156 _cleanup_close_ int fd = -1;
2157 _cleanup_umask_ mode_t u;
2158 int r;
2159
2160 assert(kmsg_socket >= 0);
2161
2162 u = umask(0000);
2163
2164 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2165 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2166 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2167 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2168
2169 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2170 if (r < 0)
2171 return log_error_errno(r, "Failed to generate kmsg path: %m");
2172
2173 if (mkfifo(fifo, 0600) < 0)
2174 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2175
2176 from = TAKE_PTR(fifo);
2177
2178 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2179 if (r < 0)
2180 return r;
2181
2182 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2183 if (fd < 0)
2184 return log_error_errno(errno, "Failed to open fifo: %m");
2185
2186 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2187 r = send_one_fd(kmsg_socket, fd, 0);
2188 if (r < 0)
2189 return log_error_errno(r, "Failed to send FIFO fd: %m");
2190
2191 return 0;
2192 }
2193
2194 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2195 union in_addr_union *exposed = userdata;
2196
2197 assert(rtnl);
2198 assert(m);
2199 assert(exposed);
2200
2201 expose_port_execute(rtnl, arg_expose_ports, exposed);
2202 return 0;
2203 }
2204
2205 static int setup_hostname(void) {
2206 int r;
2207
2208 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2209 return 0;
2210
2211 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to set hostname: %m");
2214
2215 return 0;
2216 }
2217
2218 static int setup_journal(const char *directory) {
2219 _cleanup_free_ char *d = NULL;
2220 const char *dirname, *p, *q;
2221 sd_id128_t this_id;
2222 char id[33];
2223 bool try;
2224 int r;
2225
2226 /* Don't link journals in ephemeral mode */
2227 if (arg_ephemeral)
2228 return 0;
2229
2230 if (arg_link_journal == LINK_NO)
2231 return 0;
2232
2233 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2234
2235 r = sd_id128_get_machine(&this_id);
2236 if (r < 0)
2237 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2238
2239 if (sd_id128_equal(arg_uuid, this_id)) {
2240 log_full(try ? LOG_WARNING : LOG_ERR,
2241 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2242 if (try)
2243 return 0;
2244 return -EEXIST;
2245 }
2246
2247 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2248 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2249 if (r < 0) {
2250 bool ignore = r == -EROFS && try;
2251 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2252 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2253 return ignore ? 0 : r;
2254 }
2255 }
2256
2257 (void) sd_id128_to_string(arg_uuid, id);
2258
2259 p = strjoina("/var/log/journal/", id);
2260 q = prefix_roota(directory, p);
2261
2262 if (path_is_mount_point(p, NULL, 0) > 0) {
2263 if (try)
2264 return 0;
2265
2266 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2267 "%s: already a mount point, refusing to use for journal", p);
2268 }
2269
2270 if (path_is_mount_point(q, NULL, 0) > 0) {
2271 if (try)
2272 return 0;
2273
2274 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2275 "%s: already a mount point, refusing to use for journal", q);
2276 }
2277
2278 r = readlink_and_make_absolute(p, &d);
2279 if (r >= 0) {
2280 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2281 path_equal(d, q)) {
2282
2283 r = userns_mkdir(directory, p, 0755, 0, 0);
2284 if (r < 0)
2285 log_warning_errno(r, "Failed to create directory %s: %m", q);
2286 return 0;
2287 }
2288
2289 if (unlink(p) < 0)
2290 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2291 } else if (r == -EINVAL) {
2292
2293 if (arg_link_journal == LINK_GUEST &&
2294 rmdir(p) < 0) {
2295
2296 if (errno == ENOTDIR) {
2297 log_error("%s already exists and is neither a symlink nor a directory", p);
2298 return r;
2299 } else
2300 return log_error_errno(errno, "Failed to remove %s: %m", p);
2301 }
2302 } else if (r != -ENOENT)
2303 return log_error_errno(r, "readlink(%s) failed: %m", p);
2304
2305 if (arg_link_journal == LINK_GUEST) {
2306
2307 if (symlink(q, p) < 0) {
2308 if (try) {
2309 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2310 return 0;
2311 } else
2312 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2313 }
2314
2315 r = userns_mkdir(directory, p, 0755, 0, 0);
2316 if (r < 0)
2317 log_warning_errno(r, "Failed to create directory %s: %m", q);
2318 return 0;
2319 }
2320
2321 if (arg_link_journal == LINK_HOST) {
2322 /* don't create parents here — if the host doesn't have
2323 * permanent journal set up, don't force it here */
2324
2325 r = mkdir_errno_wrapper(p, 0755);
2326 if (r < 0 && r != -EEXIST) {
2327 if (try) {
2328 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2329 return 0;
2330 } else
2331 return log_error_errno(r, "Failed to create %s: %m", p);
2332 }
2333
2334 } else if (access(p, F_OK) < 0)
2335 return 0;
2336
2337 if (dir_is_empty(q) == 0)
2338 log_warning("%s is not empty, proceeding anyway.", q);
2339
2340 r = userns_mkdir(directory, p, 0755, 0, 0);
2341 if (r < 0)
2342 return log_error_errno(r, "Failed to create %s: %m", q);
2343
2344 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2345 if (r < 0)
2346 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2347
2348 return 0;
2349 }
2350
2351 static int drop_capabilities(uid_t uid) {
2352 CapabilityQuintet q;
2353
2354 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2355 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2356 * arg_caps_retain. */
2357
2358 if (capability_quintet_is_set(&arg_full_capabilities)) {
2359 q = arg_full_capabilities;
2360
2361 if (q.bounding == (uint64_t) -1)
2362 q.bounding = uid == 0 ? arg_caps_retain : 0;
2363
2364 if (q.effective == (uint64_t) -1)
2365 q.effective = uid == 0 ? q.bounding : 0;
2366
2367 if (q.inheritable == (uint64_t) -1)
2368 q.inheritable = uid == 0 ? q.bounding : 0;
2369
2370 if (q.permitted == (uint64_t) -1)
2371 q.permitted = uid == 0 ? q.bounding : 0;
2372
2373 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2374 q.ambient = 0;
2375
2376 if (capability_quintet_mangle(&q))
2377 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2378
2379 } else {
2380 q = (CapabilityQuintet) {
2381 .bounding = arg_caps_retain,
2382 .effective = uid == 0 ? arg_caps_retain : 0,
2383 .inheritable = uid == 0 ? arg_caps_retain : 0,
2384 .permitted = uid == 0 ? arg_caps_retain : 0,
2385 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2386 };
2387
2388 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2389 * in order to maintain the same behavior as systemd < 242. */
2390 if (capability_quintet_mangle(&q))
2391 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2392 "Some capabilities will not be set because they are not in the current bounding set.");
2393
2394 }
2395
2396 return capability_quintet_enforce(&q);
2397 }
2398
2399 static int reset_audit_loginuid(void) {
2400 _cleanup_free_ char *p = NULL;
2401 int r;
2402
2403 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2404 return 0;
2405
2406 r = read_one_line_file("/proc/self/loginuid", &p);
2407 if (r == -ENOENT)
2408 return 0;
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2411
2412 /* Already reset? */
2413 if (streq(p, "4294967295"))
2414 return 0;
2415
2416 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2417 if (r < 0) {
2418 log_error_errno(r,
2419 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2420 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2421 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2422 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2423 "using systemd-nspawn. Sleeping for 5s... (%m)");
2424
2425 sleep(5);
2426 }
2427
2428 return 0;
2429 }
2430
2431 static int setup_propagate(const char *root) {
2432 const char *p, *q;
2433 int r;
2434
2435 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2436 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2437 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2438 (void) mkdir_p(p, 0600);
2439
2440 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2441 if (r < 0)
2442 return log_error_errno(r, "Failed to create /run/systemd: %m");
2443
2444 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2445 if (r < 0)
2446 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2447
2448 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2449 if (r < 0)
2450 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2451
2452 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
2453 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2454 if (r < 0)
2455 return r;
2456
2457 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2458 if (r < 0)
2459 return r;
2460
2461 /* machined will MS_MOVE into that directory, and that's only
2462 * supported for non-shared mounts. */
2463 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2464 }
2465
2466 static int setup_machine_id(const char *directory) {
2467 const char *etc_machine_id;
2468 sd_id128_t id;
2469 int r;
2470
2471 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2472 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2473 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2474 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2475 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2476 * container behaves nicely). */
2477
2478 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2479
2480 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2481 if (r < 0) {
2482 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2483 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2484
2485 if (sd_id128_is_null(arg_uuid)) {
2486 r = sd_id128_randomize(&arg_uuid);
2487 if (r < 0)
2488 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2489 }
2490 } else {
2491 if (sd_id128_is_null(id))
2492 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2493 "Machine ID in container image is zero, refusing.");
2494
2495 arg_uuid = id;
2496 }
2497
2498 return 0;
2499 }
2500
2501 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2502 int r;
2503
2504 assert(directory);
2505
2506 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2507 return 0;
2508
2509 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2510 if (r == -EOPNOTSUPP)
2511 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2512 if (r == -EBADE)
2513 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2514 if (r < 0)
2515 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2516 if (r == 0)
2517 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2518 else
2519 log_debug("Patched directory tree to match UID/GID range.");
2520
2521 return r;
2522 }
2523
2524 /*
2525 * Return values:
2526 * < 0 : wait_for_terminate() failed to get the state of the
2527 * container, the container was terminated by a signal, or
2528 * failed for an unknown reason. No change is made to the
2529 * container argument.
2530 * > 0 : The program executed in the container terminated with an
2531 * error. The exit code of the program executed in the
2532 * container is returned. The container argument has been set
2533 * to CONTAINER_TERMINATED.
2534 * 0 : The container is being rebooted, has been shut down or exited
2535 * successfully. The container argument has been set to either
2536 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2537 *
2538 * That is, success is indicated by a return value of zero, and an
2539 * error is indicated by a non-zero value.
2540 */
2541 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2542 siginfo_t status;
2543 int r;
2544
2545 r = wait_for_terminate(pid, &status);
2546 if (r < 0)
2547 return log_warning_errno(r, "Failed to wait for container: %m");
2548
2549 switch (status.si_code) {
2550
2551 case CLD_EXITED:
2552 if (status.si_status == 0)
2553 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2554 else
2555 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2556
2557 *container = CONTAINER_TERMINATED;
2558 return status.si_status;
2559
2560 case CLD_KILLED:
2561 if (status.si_status == SIGINT) {
2562 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2563 *container = CONTAINER_TERMINATED;
2564 return 0;
2565
2566 } else if (status.si_status == SIGHUP) {
2567 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2568 *container = CONTAINER_REBOOTED;
2569 return 0;
2570 }
2571
2572 _fallthrough_;
2573 case CLD_DUMPED:
2574 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2575 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2576
2577 default:
2578 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2579 "Container %s failed due to unknown reason.", arg_machine);
2580 }
2581 }
2582
2583 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2584 pid_t pid;
2585
2586 pid = PTR_TO_PID(userdata);
2587 if (pid > 0) {
2588 if (kill(pid, arg_kill_signal) >= 0) {
2589 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2590 sd_event_source_set_userdata(s, NULL);
2591 return 0;
2592 }
2593 }
2594
2595 sd_event_exit(sd_event_source_get_event(s), 0);
2596 return 0;
2597 }
2598
2599 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2600 pid_t pid;
2601
2602 assert(s);
2603 assert(ssi);
2604
2605 pid = PTR_TO_PID(userdata);
2606
2607 for (;;) {
2608 siginfo_t si = {};
2609
2610 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2611 return log_error_errno(errno, "Failed to waitid(): %m");
2612 if (si.si_pid == 0) /* No pending children. */
2613 break;
2614 if (si.si_pid == pid) {
2615 /* The main process we care for has exited. Return from
2616 * signal handler but leave the zombie. */
2617 sd_event_exit(sd_event_source_get_event(s), 0);
2618 break;
2619 }
2620
2621 /* Reap all other children. */
2622 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2623 }
2624
2625 return 0;
2626 }
2627
2628 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2629 pid_t pid;
2630
2631 assert(m);
2632
2633 pid = PTR_TO_PID(userdata);
2634
2635 if (arg_kill_signal > 0) {
2636 log_info("Container termination requested. Attempting to halt container.");
2637 (void) kill(pid, arg_kill_signal);
2638 } else {
2639 log_info("Container termination requested. Exiting.");
2640 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2641 }
2642
2643 return 0;
2644 }
2645
2646 static int determine_names(void) {
2647 int r;
2648
2649 if (arg_template && !arg_directory && arg_machine) {
2650
2651 /* If --template= was specified then we should not
2652 * search for a machine, but instead create a new one
2653 * in /var/lib/machine. */
2654
2655 arg_directory = path_join("/var/lib/machines", arg_machine);
2656 if (!arg_directory)
2657 return log_oom();
2658 }
2659
2660 if (!arg_image && !arg_directory) {
2661 if (arg_machine) {
2662 _cleanup_(image_unrefp) Image *i = NULL;
2663
2664 r = image_find(IMAGE_MACHINE, arg_machine, &i);
2665 if (r == -ENOENT)
2666 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2667 if (r < 0)
2668 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2669
2670 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2671 r = free_and_strdup(&arg_image, i->path);
2672 else
2673 r = free_and_strdup(&arg_directory, i->path);
2674 if (r < 0)
2675 return log_oom();
2676
2677 if (!arg_ephemeral)
2678 arg_read_only = arg_read_only || i->read_only;
2679 } else {
2680 r = safe_getcwd(&arg_directory);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to determine current directory: %m");
2683 }
2684
2685 if (!arg_directory && !arg_image)
2686 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
2687 }
2688
2689 if (!arg_machine) {
2690 if (arg_directory && path_equal(arg_directory, "/"))
2691 arg_machine = gethostname_malloc();
2692 else {
2693 if (arg_image) {
2694 char *e;
2695
2696 arg_machine = strdup(basename(arg_image));
2697
2698 /* Truncate suffix if there is one */
2699 e = endswith(arg_machine, ".raw");
2700 if (e)
2701 *e = 0;
2702 } else
2703 arg_machine = strdup(basename(arg_directory));
2704 }
2705 if (!arg_machine)
2706 return log_oom();
2707
2708 hostname_cleanup(arg_machine);
2709 if (!machine_name_is_valid(arg_machine))
2710 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
2711
2712 if (arg_ephemeral) {
2713 char *b;
2714
2715 /* Add a random suffix when this is an
2716 * ephemeral machine, so that we can run many
2717 * instances at once without manually having
2718 * to specify -M each time. */
2719
2720 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2721 return log_oom();
2722
2723 free(arg_machine);
2724 arg_machine = b;
2725 }
2726 }
2727
2728 return 0;
2729 }
2730
/* Resolves the path stored in *p with chase_symlinks() (using 'flags') and replaces *p by the result.
 * A NULL *p is left untouched. Returns a negative errno-style value if the path cannot be resolved,
 * otherwise the result of free_and_replace(). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve */
        if (!*p)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
2746
2747 static int determine_uid_shift(const char *directory) {
2748 int r;
2749
2750 if (arg_userns_mode == USER_NAMESPACE_NO) {
2751 arg_uid_shift = 0;
2752 return 0;
2753 }
2754
2755 if (arg_uid_shift == UID_INVALID) {
2756 struct stat st;
2757
2758 r = stat(directory, &st);
2759 if (r < 0)
2760 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2761
2762 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2763
2764 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2765 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2766 "UID and GID base of %s don't match.", directory);
2767
2768 arg_uid_range = UINT32_C(0x10000);
2769 }
2770
2771 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2772 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2773 "UID base too high for UID range.");
2774
2775 return 0;
2776 }
2777
2778 static unsigned long effective_clone_ns_flags(void) {
2779 unsigned long flags = arg_clone_ns_flags;
2780
2781 if (arg_private_network)
2782 flags |= CLONE_NEWNET;
2783 if (arg_use_cgns)
2784 flags |= CLONE_NEWCGROUP;
2785 if (arg_userns_mode != USER_NAMESPACE_NO)
2786 flags |= CLONE_NEWUSER;
2787
2788 return flags;
2789 }
2790
2791 static int patch_sysctl(void) {
2792
2793 /* This table is inspired by runc's sysctl() function */
2794 static const struct {
2795 const char *key;
2796 bool prefix;
2797 unsigned long clone_flags;
2798 } safe_sysctl[] = {
2799 { "kernel.hostname", false, CLONE_NEWUTS },
2800 { "kernel.domainname", false, CLONE_NEWUTS },
2801 { "kernel.msgmax", false, CLONE_NEWIPC },
2802 { "kernel.msgmnb", false, CLONE_NEWIPC },
2803 { "kernel.msgmni", false, CLONE_NEWIPC },
2804 { "kernel.sem", false, CLONE_NEWIPC },
2805 { "kernel.shmall", false, CLONE_NEWIPC },
2806 { "kernel.shmmax", false, CLONE_NEWIPC },
2807 { "kernel.shmmni", false, CLONE_NEWIPC },
2808 { "fs.mqueue.", true, CLONE_NEWIPC },
2809 { "net.", true, CLONE_NEWNET },
2810 };
2811
2812 unsigned long flags;
2813 char **k, **v;
2814 int r;
2815
2816 flags = effective_clone_ns_flags();
2817
2818 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2819 bool good = false;
2820 size_t i;
2821
2822 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2823
2824 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2825 continue;
2826
2827 if (safe_sysctl[i].prefix)
2828 good = startswith(*k, safe_sysctl[i].key);
2829 else
2830 good = streq(*k, safe_sysctl[i].key);
2831
2832 if (good)
2833 break;
2834 }
2835
2836 if (!good)
2837 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2838
2839 r = sysctl_write(*k, *v);
2840 if (r < 0)
2841 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2842 }
2843
2844 return 0;
2845 }
2846
2847 static int inner_child(
2848 Barrier *barrier,
2849 const char *directory,
2850 bool secondary,
2851 int kmsg_socket,
2852 int rtnl_socket,
2853 int master_pty_socket,
2854 FDSet *fds) {
2855
2856 _cleanup_free_ char *home = NULL;
2857 char as_uuid[37];
2858 size_t n_env = 1;
2859 const char *envp[] = {
2860 "PATH=" DEFAULT_PATH_COMPAT,
2861 NULL, /* container */
2862 NULL, /* TERM */
2863 NULL, /* HOME */
2864 NULL, /* USER */
2865 NULL, /* LOGNAME */
2866 NULL, /* container_uuid */
2867 NULL, /* LISTEN_FDS */
2868 NULL, /* LISTEN_PID */
2869 NULL, /* NOTIFY_SOCKET */
2870 NULL
2871 };
2872 const char *exec_target;
2873 _cleanup_strv_free_ char **env_use = NULL;
2874 int r, which_failed;
2875
2876 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2877 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2878 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2879 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2880 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2881 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2882 * namespace.
2883 *
2884 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2885 * unshare(). See below. */
2886
2887 assert(barrier);
2888 assert(directory);
2889 assert(kmsg_socket >= 0);
2890
2891 log_debug("Inner child is initializing.");
2892
2893 if (arg_userns_mode != USER_NAMESPACE_NO) {
2894 /* Tell the parent, that it now can write the UID map. */
2895 (void) barrier_place(barrier); /* #1 */
2896
2897 /* Wait until the parent wrote the UID map */
2898 if (!barrier_place_and_sync(barrier)) /* #2 */
2899 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2900 "Parent died too early");
2901 }
2902
2903 r = reset_uid_gid();
2904 if (r < 0)
2905 return log_error_errno(r, "Couldn't become new root: %m");
2906
2907 r = mount_all(NULL,
2908 arg_mount_settings | MOUNT_IN_USERNS,
2909 arg_uid_shift,
2910 arg_selinux_apifs_context);
2911 if (r < 0)
2912 return r;
2913
2914 if (!arg_network_namespace_path && arg_private_network) {
2915 r = unshare(CLONE_NEWNET);
2916 if (r < 0)
2917 return log_error_errno(errno, "Failed to unshare network namespace: %m");
2918
2919 /* Tell the parent that it can setup network interfaces. */
2920 (void) barrier_place(barrier); /* #3 */
2921 }
2922
2923 r = mount_sysfs(NULL, arg_mount_settings);
2924 if (r < 0)
2925 return r;
2926
2927 /* Wait until we are cgroup-ified, so that we
2928 * can mount the right cgroup path writable */
2929 if (!barrier_place_and_sync(barrier)) /* #4 */
2930 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2931 "Parent died too early");
2932
2933 if (arg_use_cgns) {
2934 r = unshare(CLONE_NEWCGROUP);
2935 if (r < 0)
2936 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2937 r = mount_cgroups(
2938 "",
2939 arg_unified_cgroup_hierarchy,
2940 arg_userns_mode != USER_NAMESPACE_NO,
2941 arg_uid_shift,
2942 arg_uid_range,
2943 arg_selinux_apifs_context,
2944 true);
2945 if (r < 0)
2946 return r;
2947 } else {
2948 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2949 if (r < 0)
2950 return r;
2951 }
2952
2953 r = setup_boot_id();
2954 if (r < 0)
2955 return r;
2956
2957 r = setup_kmsg(kmsg_socket);
2958 if (r < 0)
2959 return r;
2960 kmsg_socket = safe_close(kmsg_socket);
2961
2962 r = mount_custom(
2963 "/",
2964 arg_custom_mounts,
2965 arg_n_custom_mounts,
2966 false,
2967 0,
2968 0,
2969 arg_selinux_apifs_context,
2970 true);
2971 if (r < 0)
2972 return r;
2973
2974 if (setsid() < 0)
2975 return log_error_errno(errno, "setsid() failed: %m");
2976
2977 if (arg_private_network)
2978 loopback_setup();
2979
2980 if (arg_expose_ports) {
2981 r = expose_port_send_rtnl(rtnl_socket);
2982 if (r < 0)
2983 return r;
2984 rtnl_socket = safe_close(rtnl_socket);
2985 }
2986
2987 if (arg_console_mode != CONSOLE_PIPE) {
2988 _cleanup_close_ int master = -1;
2989 _cleanup_free_ char *console = NULL;
2990
2991 /* Allocate a pty and make it available as /dev/console. */
2992 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
2993 if (master < 0)
2994 return log_error_errno(master, "Failed to allocate a pty: %m");
2995
2996 r = setup_dev_console(console);
2997 if (r < 0)
2998 return log_error_errno(r, "Failed to setup /dev/console: %m");
2999
3000 r = send_one_fd(master_pty_socket, master, 0);
3001 if (r < 0)
3002 return log_error_errno(r, "Failed to send master fd: %m");
3003 master_pty_socket = safe_close(master_pty_socket);
3004
3005 r = setup_stdio_as_dev_console();
3006 if (r < 0)
3007 return r;
3008 }
3009
3010 r = patch_sysctl();
3011 if (r < 0)
3012 return r;
3013
3014 if (arg_oom_score_adjust_set) {
3015 r = set_oom_score_adjust(arg_oom_score_adjust);
3016 if (r < 0)
3017 return log_error_errno(r, "Failed to adjust OOM score: %m");
3018 }
3019
3020 if (arg_cpu_set.set)
3021 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3022 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3023
3024 (void) setup_hostname();
3025
3026 if (arg_personality != PERSONALITY_INVALID) {
3027 r = safe_personality(arg_personality);
3028 if (r < 0)
3029 return log_error_errno(r, "personality() failed: %m");
3030 } else if (secondary) {
3031 r = safe_personality(PER_LINUX32);
3032 if (r < 0)
3033 return log_error_errno(r, "personality() failed: %m");
3034 }
3035
3036 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3037 if (r < 0)
3038 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3039
3040 #if HAVE_SECCOMP
3041 if (arg_seccomp) {
3042
3043 if (is_seccomp_available()) {
3044
3045 r = seccomp_load(arg_seccomp);
3046 if (ERRNO_IS_SECCOMP_FATAL(r))
3047 return log_error_errno(r, "Failed to install seccomp filter: %m");
3048 if (r < 0)
3049 log_debug_errno(r, "Failed to install seccomp filter: %m");
3050 }
3051 } else
3052 #endif
3053 {
3054 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3055 if (r < 0)
3056 return r;
3057 }
3058
3059 #if HAVE_SELINUX
3060 if (arg_selinux_context)
3061 if (setexeccon(arg_selinux_context) < 0)
3062 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3063 #endif
3064
3065 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3066 * if we need to later on. */
3067 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3068 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3069
3070 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3071 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3072 else
3073 r = change_uid_gid(arg_user, &home);
3074 if (r < 0)
3075 return r;
3076
3077 r = drop_capabilities(getuid());
3078 if (r < 0)
3079 return log_error_errno(r, "Dropping capabilities failed: %m");
3080
3081 if (arg_no_new_privileges)
3082 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3083 return log_error_errno(errno, "Failed to disable new privileges: %m");
3084
3085 /* LXC sets container=lxc, so follow the scheme here */
3086 envp[n_env++] = strjoina("container=", arg_container_service_name);
3087
3088 envp[n_env] = strv_find_prefix(environ, "TERM=");
3089 if (envp[n_env])
3090 n_env++;
3091
3092 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3093 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3094 return log_oom();
3095
3096 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3097 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3098 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3099 return log_oom();
3100
3101 assert(!sd_id128_is_null(arg_uuid));
3102
3103 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3104 return log_oom();
3105
3106 if (fdset_size(fds) > 0) {
3107 r = fdset_cloexec(fds, false);
3108 if (r < 0)
3109 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3110
3111 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3112 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3113 return log_oom();
3114 }
3115 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3116 return log_oom();
3117
3118 env_use = strv_env_merge(2, envp, arg_setenv);
3119 if (!env_use)
3120 return log_oom();
3121
3122 /* Let the parent know that we are ready and
3123 * wait until the parent is ready with the
3124 * setup, too... */
3125 if (!barrier_place_and_sync(barrier)) /* #5 */
3126 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3127 "Parent died too early");
3128
3129 if (arg_chdir)
3130 if (chdir(arg_chdir) < 0)
3131 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3132
3133 if (arg_start_mode == START_PID2) {
3134 r = stub_pid1(arg_uuid);
3135 if (r < 0)
3136 return r;
3137 }
3138
3139 log_debug("Inner child completed, invoking payload.");
3140
3141 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3142 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3143 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3144 log_close();
3145 log_set_open_when_needed(true);
3146
3147 (void) fdset_close_others(fds);
3148
3149 if (arg_start_mode == START_BOOT) {
3150 char **a;
3151 size_t m;
3152
3153 /* Automatically search for the init system */
3154
3155 m = strv_length(arg_parameters);
3156 a = newa(char*, m + 2);
3157 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3158 a[1 + m] = NULL;
3159
3160 a[0] = (char*) "/usr/lib/systemd/systemd";
3161 execve(a[0], a, env_use);
3162
3163 a[0] = (char*) "/lib/systemd/systemd";
3164 execve(a[0], a, env_use);
3165
3166 a[0] = (char*) "/sbin/init";
3167 execve(a[0], a, env_use);
3168
3169 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3170 } else if (!strv_isempty(arg_parameters)) {
3171 const char *dollar_path;
3172
3173 exec_target = arg_parameters[0];
3174
3175 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3176 * binary. */
3177 dollar_path = strv_env_get(env_use, "PATH");
3178 if (dollar_path) {
3179 if (putenv((char*) dollar_path) != 0)
3180 return log_error_errno(errno, "Failed to update $PATH: %m");
3181 }
3182
3183 execvpe(arg_parameters[0], arg_parameters, env_use);
3184 } else {
3185 if (!arg_chdir)
3186 /* If we cannot change the directory, we'll end up in /, that is expected. */
3187 (void) chdir(home ?: "/root");
3188
3189 execle("/bin/bash", "-bash", NULL, env_use);
3190 execle("/bin/sh", "-sh", NULL, env_use);
3191
3192 exec_target = "/bin/bash, /bin/sh";
3193 }
3194
3195 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3196 }
3197
3198 static int setup_sd_notify_child(void) {
3199 _cleanup_close_ int fd = -1;
3200 union sockaddr_union sa = {
3201 .un.sun_family = AF_UNIX,
3202 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3203 };
3204 int r;
3205
3206 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3207 if (fd < 0)
3208 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3209
3210 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3211 (void) sockaddr_un_unlink(&sa.un);
3212
3213 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3214 if (r < 0)
3215 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3216
3217 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3218 if (r < 0)
3219 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3220
3221 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3222 if (r < 0)
3223 return log_error_errno(r, "SO_PASSCRED failed: %m");
3224
3225 return TAKE_FD(fd);
3226 }
3227
3228 static int outer_child(
3229 Barrier *barrier,
3230 const char *directory,
3231 DissectedImage *dissected_image,
3232 bool secondary,
3233 int pid_socket,
3234 int uuid_socket,
3235 int notify_socket,
3236 int kmsg_socket,
3237 int rtnl_socket,
3238 int uid_shift_socket,
3239 int master_pty_socket,
3240 int unified_cgroup_hierarchy_socket,
3241 FDSet *fds,
3242 int netns_fd) {
3243
3244 _cleanup_close_ int fd = -1;
3245 pid_t pid;
3246 ssize_t l;
3247 int r;
3248
3249 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3250 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3251 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3252 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3253
3254 assert(barrier);
3255 assert(directory);
3256 assert(pid_socket >= 0);
3257 assert(uuid_socket >= 0);
3258 assert(notify_socket >= 0);
3259 assert(master_pty_socket >= 0);
3260 assert(kmsg_socket >= 0);
3261
3262 log_debug("Outer child is initializing.");
3263
3264 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3265 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3266
3267 r = reset_audit_loginuid();
3268 if (r < 0)
3269 return r;
3270
3271 /* Mark everything as slave, so that we still
3272 * receive mounts from the real root, but don't
3273 * propagate mounts to the real root. */
3274 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3275 if (r < 0)
3276 return r;
3277
3278 if (dissected_image) {
3279 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3280 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3281 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3282 * makes sure ESP partitions and userns are compatible. */
3283
3284 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3285 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3286 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3287 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3288 if (r < 0)
3289 return r;
3290 }
3291
3292 r = determine_uid_shift(directory);
3293 if (r < 0)
3294 return r;
3295
3296 if (arg_userns_mode != USER_NAMESPACE_NO) {
3297 /* Let the parent know which UID shift we read from the image */
3298 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3299 if (l < 0)
3300 return log_error_errno(errno, "Failed to send UID shift: %m");
3301 if (l != sizeof(arg_uid_shift))
3302 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3303 "Short write while sending UID shift.");
3304
3305 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3306 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3307 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3308 * not it will pick a different one, and send it back to us. */
3309
3310 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3311 if (l < 0)
3312 return log_error_errno(errno, "Failed to recv UID shift: %m");
3313 if (l != sizeof(arg_uid_shift))
3314 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3315 "Short read while receiving UID shift.");
3316 }
3317
3318 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3319 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3320 }
3321
3322 if (path_equal(directory, "/")) {
3323 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3324 * place, so that we can make changes to its mount structure (for example, to implement
3325 * --volatile=) without this interfering with our ability to access files such as
3326 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3327 * (instead of a temporary directory, since we are living in our own mount namspace here
3328 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3329 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3330
3331 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3332 if (r < 0)
3333 return r;
3334
3335 directory = "/run/systemd/nspawn-root";
3336
3337 } else if (!dissected_image) {
3338 /* Turn directory into bind mount (we need that so that we can move the bind mount to root
3339 * later on). */
3340 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3341 if (r < 0)
3342 return r;
3343 }
3344
3345 r = setup_pivot_root(
3346 directory,
3347 arg_pivot_root_new,
3348 arg_pivot_root_old);
3349 if (r < 0)
3350 return r;
3351
3352 r = setup_volatile_mode(
3353 directory,
3354 arg_volatile_mode,
3355 arg_userns_mode != USER_NAMESPACE_NO,
3356 arg_uid_shift,
3357 arg_uid_range,
3358 arg_selinux_apifs_context);
3359 if (r < 0)
3360 return r;
3361
3362 if (dissected_image) {
3363 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3364 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3365 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3366 if (r < 0)
3367 return r;
3368 }
3369
3370 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3371 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3372
3373 r = detect_unified_cgroup_hierarchy_from_image(directory);
3374 if (r < 0)
3375 return r;
3376
3377 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3378 if (l < 0)
3379 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3380 if (l != sizeof(arg_unified_cgroup_hierarchy))
3381 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3382 "Short write while sending cgroup mode.");
3383
3384 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3385 }
3386
3387 /* Mark everything as shared so our mounts get propagated down. This is
3388 * required to make new bind mounts available in systemd services
3389 * inside the container that create a new mount namespace.
3390 * See https://github.com/systemd/systemd/issues/3860
3391 * Further submounts (such as /dev) done after this will inherit the
3392 * shared propagation mode. */
3393 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3394 if (r < 0)
3395 return r;
3396
3397 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3398 if (r < 0)
3399 return r;
3400
3401 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3402 if (r < 0)
3403 return r;
3404
3405 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
3406 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3407 if (r < 0)
3408 return log_error_errno(r, "Failed to make tree read-only: %m");
3409 }
3410
3411 r = mount_all(directory,
3412 arg_mount_settings,
3413 arg_uid_shift,
3414 arg_selinux_apifs_context);
3415 if (r < 0)
3416 return r;
3417
3418 r = copy_devnodes(directory);
3419 if (r < 0)
3420 return r;
3421
3422 r = make_extra_nodes(directory);
3423 if (r < 0)
3424 return r;
3425
3426 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3427 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
3428
3429 r = setup_pts(directory);
3430 if (r < 0)
3431 return r;
3432
3433 r = setup_propagate(directory);
3434 if (r < 0)
3435 return r;
3436
3437 r = setup_keyring();
3438 if (r < 0)
3439 return r;
3440
3441 r = setup_timezone(directory);
3442 if (r < 0)
3443 return r;
3444
3445 r = setup_resolv_conf(directory);
3446 if (r < 0)
3447 return r;
3448
3449 r = setup_machine_id(directory);
3450 if (r < 0)
3451 return r;
3452
3453 r = setup_journal(directory);
3454 if (r < 0)
3455 return r;
3456
3457 r = mount_custom(
3458 directory,
3459 arg_custom_mounts,
3460 arg_n_custom_mounts,
3461 arg_userns_mode != USER_NAMESPACE_NO,
3462 arg_uid_shift,
3463 arg_uid_range,
3464 arg_selinux_apifs_context,
3465 false);
3466 if (r < 0)
3467 return r;
3468
3469 if (!arg_use_cgns) {
3470 r = mount_cgroups(
3471 directory,
3472 arg_unified_cgroup_hierarchy,
3473 arg_userns_mode != USER_NAMESPACE_NO,
3474 arg_uid_shift,
3475 arg_uid_range,
3476 arg_selinux_apifs_context,
3477 false);
3478 if (r < 0)
3479 return r;
3480 }
3481
3482 r = mount_move_root(directory);
3483 if (r < 0)
3484 return log_error_errno(r, "Failed to move root directory: %m");
3485
3486 fd = setup_sd_notify_child();
3487 if (fd < 0)
3488 return fd;
3489
3490 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3491 arg_clone_ns_flags |
3492 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3493 if (pid < 0)
3494 return log_error_errno(errno, "Failed to fork inner child: %m");
3495 if (pid == 0) {
3496 pid_socket = safe_close(pid_socket);
3497 uuid_socket = safe_close(uuid_socket);
3498 notify_socket = safe_close(notify_socket);
3499 uid_shift_socket = safe_close(uid_shift_socket);
3500
3501 /* The inner child has all namespaces that are
3502 * requested, so that we all are owned by the user if
3503 * user namespaces are turned on. */
3504
3505 if (arg_network_namespace_path) {
3506 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3507 if (r < 0)
3508 return log_error_errno(r, "Failed to join network namespace: %m");
3509 }
3510
3511 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
3512 if (r < 0)
3513 _exit(EXIT_FAILURE);
3514
3515 _exit(EXIT_SUCCESS);
3516 }
3517
3518 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3519 if (l < 0)
3520 return log_error_errno(errno, "Failed to send PID: %m");
3521 if (l != sizeof(pid))
3522 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3523 "Short write while sending PID.");
3524
3525 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3526 if (l < 0)
3527 return log_error_errno(errno, "Failed to send machine ID: %m");
3528 if (l != sizeof(arg_uuid))
3529 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3530 "Short write while sending machine ID.");
3531
3532 l = send_one_fd(notify_socket, fd, 0);
3533 if (l < 0)
3534 return log_error_errno(l, "Failed to send notify fd: %m");
3535
3536 pid_socket = safe_close(pid_socket);
3537 uuid_socket = safe_close(uuid_socket);
3538 notify_socket = safe_close(notify_socket);
3539 master_pty_socket = safe_close(master_pty_socket);
3540 kmsg_socket = safe_close(kmsg_socket);
3541 rtnl_socket = safe_close(rtnl_socket);
3542 netns_fd = safe_close(netns_fd);
3543
3544 return 0;
3545 }
3546
3547 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3548 bool tried_hashed = false;
3549 unsigned n_tries = 100;
3550 uid_t candidate;
3551 int r;
3552
3553 assert(shift);
3554 assert(ret_lock_file);
3555 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3556 assert(arg_uid_range == 0x10000U);
3557
3558 candidate = *shift;
3559
3560 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3561
3562 for (;;) {
3563 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3564 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3565
3566 if (--n_tries <= 0)
3567 return -EBUSY;
3568
3569 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3570 goto next;
3571 if ((candidate & UINT32_C(0xFFFF)) != 0)
3572 goto next;
3573
3574 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3575 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3576 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3577 goto next;
3578 if (r < 0)
3579 return r;
3580
3581 /* Make some superficial checks whether the range is currently known in the user database */
3582 if (getpwuid(candidate))
3583 goto next;
3584 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3585 goto next;
3586 if (getgrgid(candidate))
3587 goto next;
3588 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3589 goto next;
3590
3591 *ret_lock_file = lf;
3592 lf = (struct LockFile) LOCK_FILE_INIT;
3593 *shift = candidate;
3594 return 0;
3595
3596 next:
3597 if (arg_machine && !tried_hashed) {
3598 /* Try to hash the base from the container name */
3599
3600 static const uint8_t hash_key[] = {
3601 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3602 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3603 };
3604
3605 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3606
3607 tried_hashed = true;
3608 } else
3609 random_bytes(&candidate, sizeof(candidate));
3610
3611 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3612 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3613 }
3614 }
3615
3616 static int setup_uid_map(pid_t pid) {
3617 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3618 int r;
3619
3620 assert(pid > 1);
3621
3622 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3623 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3624 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3625 if (r < 0)
3626 return log_error_errno(r, "Failed to write UID map: %m");
3627
3628 /* We always assign the same UID and GID ranges */
3629 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3630 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3631 if (r < 0)
3632 return log_error_errno(r, "Failed to write GID map: %m");
3633
3634 return 0;
3635 }
3636
3637 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3638 char buf[NOTIFY_BUFFER_MAX+1];
3639 char *p = NULL;
3640 struct iovec iovec = {
3641 .iov_base = buf,
3642 .iov_len = sizeof(buf)-1,
3643 };
3644 union {
3645 struct cmsghdr cmsghdr;
3646 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3647 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3648 } control = {};
3649 struct msghdr msghdr = {
3650 .msg_iov = &iovec,
3651 .msg_iovlen = 1,
3652 .msg_control = &control,
3653 .msg_controllen = sizeof(control),
3654 };
3655 struct cmsghdr *cmsg;
3656 struct ucred *ucred = NULL;
3657 ssize_t n;
3658 pid_t inner_child_pid;
3659 _cleanup_strv_free_ char **tags = NULL;
3660
3661 assert(userdata);
3662
3663 inner_child_pid = PTR_TO_PID(userdata);
3664
3665 if (revents != EPOLLIN) {
3666 log_warning("Got unexpected poll event for notify fd.");
3667 return 0;
3668 }
3669
3670 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3671 if (n < 0) {
3672 if (IN_SET(errno, EAGAIN, EINTR))
3673 return 0;
3674
3675 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3676 }
3677 cmsg_close_all(&msghdr);
3678
3679 CMSG_FOREACH(cmsg, &msghdr) {
3680 if (cmsg->cmsg_level == SOL_SOCKET &&
3681 cmsg->cmsg_type == SCM_CREDENTIALS &&
3682 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3683
3684 ucred = (struct ucred*) CMSG_DATA(cmsg);
3685 }
3686 }
3687
3688 if (!ucred || ucred->pid != inner_child_pid) {
3689 log_debug("Received notify message without valid credentials. Ignoring.");
3690 return 0;
3691 }
3692
3693 if ((size_t) n >= sizeof(buf)) {
3694 log_warning("Received notify message exceeded maximum size. Ignoring.");
3695 return 0;
3696 }
3697
3698 buf[n] = 0;
3699 tags = strv_split(buf, "\n\r");
3700 if (!tags)
3701 return log_oom();
3702
3703 if (strv_find(tags, "READY=1"))
3704 (void) sd_notifyf(false, "READY=1\n");
3705
3706 p = strv_find_startswith(tags, "STATUS=");
3707 if (p)
3708 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
3709
3710 return 0;
3711 }
3712
3713 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3714 int r;
3715
3716 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3717 if (r < 0)
3718 return log_error_errno(r, "Failed to allocate notify event source: %m");
3719
3720 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3721
3722 return 0;
3723 }
3724
3725 static int merge_settings(Settings *settings, const char *path) {
3726 int rl;
3727
3728 assert(settings);
3729 assert(path);
3730
3731 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3732 * that this steals the fields of the Settings* structure, and hence modifies it. */
3733
3734 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3735 settings->start_mode >= 0) {
3736 arg_start_mode = settings->start_mode;
3737 strv_free_and_replace(arg_parameters, settings->parameters);
3738 }
3739
3740 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3741 arg_ephemeral = settings->ephemeral;
3742
3743 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3744 settings->root) {
3745
3746 if (!arg_settings_trusted)
3747 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3748 else
3749 free_and_replace(arg_directory, settings->root);
3750 }
3751
3752 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3753 settings->pivot_root_new) {
3754 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3755 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3756 }
3757
3758 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3759 settings->working_directory)
3760 free_and_replace(arg_chdir, settings->working_directory);
3761
3762 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3763 settings->environment)
3764 strv_free_and_replace(arg_setenv, settings->environment);
3765
3766 if ((arg_settings_mask & SETTING_USER) == 0) {
3767
3768 if (settings->user)
3769 free_and_replace(arg_user, settings->user);
3770
3771 if (uid_is_valid(settings->uid))
3772 arg_uid = settings->uid;
3773 if (gid_is_valid(settings->gid))
3774 arg_gid = settings->gid;
3775 if (settings->n_supplementary_gids > 0) {
3776 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3777 arg_n_supplementary_gids = settings->n_supplementary_gids;
3778 }
3779 }
3780
3781 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3782 uint64_t plus, minus;
3783
3784 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3785 * Settings structure */
3786
3787 plus = settings->capability;
3788 minus = settings->drop_capability;
3789
3790 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3791 if (settings_private_network(settings))
3792 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3793 else
3794 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3795 }
3796
3797 if (!arg_settings_trusted && plus != 0) {
3798 if (settings->capability != 0)
3799 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3800 } else
3801 arg_caps_retain |= plus;
3802
3803 arg_caps_retain &= ~minus;
3804
3805 /* Copy the full capabilities over too */
3806 if (capability_quintet_is_set(&settings->full_capabilities)) {
3807 if (!arg_settings_trusted)
3808 log_warning("Ignoring capability settings, file %s is not trusted.", path);
3809 else
3810 arg_full_capabilities = settings->full_capabilities;
3811 }
3812 }
3813
3814 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3815 settings->kill_signal > 0)
3816 arg_kill_signal = settings->kill_signal;
3817
3818 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3819 settings->personality != PERSONALITY_INVALID)
3820 arg_personality = settings->personality;
3821
3822 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3823 !sd_id128_is_null(settings->machine_id)) {
3824
3825 if (!arg_settings_trusted)
3826 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3827 else
3828 arg_uuid = settings->machine_id;
3829 }
3830
3831 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3832 settings->read_only >= 0)
3833 arg_read_only = settings->read_only;
3834
3835 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3836 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3837 arg_volatile_mode = settings->volatile_mode;
3838
3839 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3840 settings->n_custom_mounts > 0) {
3841
3842 if (!arg_settings_trusted)
3843 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3844 else {
3845 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3846 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3847 arg_n_custom_mounts = settings->n_custom_mounts;
3848 settings->n_custom_mounts = 0;
3849 }
3850 }
3851
3852 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3853 (settings->private_network >= 0 ||
3854 settings->network_veth >= 0 ||
3855 settings->network_bridge ||
3856 settings->network_zone ||
3857 settings->network_interfaces ||
3858 settings->network_macvlan ||
3859 settings->network_ipvlan ||
3860 settings->network_veth_extra ||
3861 settings->network_namespace_path)) {
3862
3863 if (!arg_settings_trusted)
3864 log_warning("Ignoring network settings, file %s is not trusted.", path);
3865 else {
3866 arg_network_veth = settings_network_veth(settings);
3867 arg_private_network = settings_private_network(settings);
3868
3869 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3870 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3871 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3872 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3873
3874 free_and_replace(arg_network_bridge, settings->network_bridge);
3875 free_and_replace(arg_network_zone, settings->network_zone);
3876
3877 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
3878 }
3879 }
3880
3881 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3882 settings->expose_ports) {
3883
3884 if (!arg_settings_trusted)
3885 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3886 else {
3887 expose_port_free_all(arg_expose_ports);
3888 arg_expose_ports = TAKE_PTR(settings->expose_ports);
3889 }
3890 }
3891
3892 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3893 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3894
3895 if (!arg_settings_trusted)
3896 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3897 else {
3898 arg_userns_mode = settings->userns_mode;
3899 arg_uid_shift = settings->uid_shift;
3900 arg_uid_range = settings->uid_range;
3901 arg_userns_chown = settings->userns_chown;
3902 }
3903 }
3904
3905 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3906 arg_notify_ready = settings->notify_ready;
3907
3908 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3909
3910 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
3911 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3912 else {
3913 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3914 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3915 }
3916
3917 #if HAVE_SECCOMP
3918 if (!arg_settings_trusted && settings->seccomp)
3919 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3920 else {
3921 seccomp_release(arg_seccomp);
3922 arg_seccomp = TAKE_PTR(settings->seccomp);
3923 }
3924 #endif
3925 }
3926
3927 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3928 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3929 continue;
3930
3931 if (!settings->rlimit[rl])
3932 continue;
3933
3934 if (!arg_settings_trusted) {
3935 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3936 continue;
3937 }
3938
3939 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3940 }
3941
3942 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3943 settings->hostname)
3944 free_and_replace(arg_hostname, settings->hostname);
3945
3946 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3947 settings->no_new_privileges >= 0)
3948 arg_no_new_privileges = settings->no_new_privileges;
3949
3950 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3951 settings->oom_score_adjust_set) {
3952
3953 if (!arg_settings_trusted)
3954 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3955 else {
3956 arg_oom_score_adjust = settings->oom_score_adjust;
3957 arg_oom_score_adjust_set = true;
3958 }
3959 }
3960
3961 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3962 settings->cpu_set.set) {
3963
3964 if (!arg_settings_trusted)
3965 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3966 else {
3967 cpu_set_reset(&arg_cpu_set);
3968 arg_cpu_set = settings->cpu_set;
3969 settings->cpu_set = (CPUSet) {};
3970 }
3971 }
3972
3973 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3974 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3975 arg_resolv_conf = settings->resolv_conf;
3976
3977 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3978 settings->link_journal != _LINK_JOURNAL_INVALID) {
3979
3980 if (!arg_settings_trusted)
3981 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3982 else {
3983 arg_link_journal = settings->link_journal;
3984 arg_link_journal_try = settings->link_journal_try;
3985 }
3986 }
3987
3988 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3989 settings->timezone != _TIMEZONE_MODE_INVALID)
3990 arg_timezone = settings->timezone;
3991
3992 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3993 settings->slice) {
3994
3995 if (!arg_settings_trusted)
3996 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3997 else
3998 free_and_replace(arg_slice, settings->slice);
3999 }
4000
4001 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4002 settings->use_cgns >= 0) {
4003
4004 if (!arg_settings_trusted)
4005 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4006 else
4007 arg_use_cgns = settings->use_cgns;
4008 }
4009
4010 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4011 settings->clone_ns_flags != (unsigned long) -1) {
4012
4013 if (!arg_settings_trusted)
4014 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4015 else
4016 arg_clone_ns_flags = settings->clone_ns_flags;
4017 }
4018
4019 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4020 settings->console_mode >= 0) {
4021
4022 if (!arg_settings_trusted)
4023 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4024 else
4025 arg_console_mode = settings->console_mode;
4026 }
4027
4028 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4029 * don't consult arg_settings_mask for them. */
4030
4031 sd_bus_message_unref(arg_property_message);
4032 arg_property_message = TAKE_PTR(settings->properties);
4033
4034 arg_console_width = settings->console_width;
4035 arg_console_height = settings->console_height;
4036
4037 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4038 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4039 arg_n_extra_nodes = settings->n_extra_nodes;
4040
4041 return 0;
4042 }
4043
4044 static int load_settings(void) {
4045 _cleanup_(settings_freep) Settings *settings = NULL;
4046 _cleanup_fclose_ FILE *f = NULL;
4047 _cleanup_free_ char *p = NULL;
4048 const char *fn, *i;
4049 int r;
4050
4051 if (arg_oci_bundle)
4052 return 0;
4053
4054 /* If all settings are masked, there's no point in looking for
4055 * the settings file */
4056 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4057 return 0;
4058
4059 fn = strjoina(arg_machine, ".nspawn");
4060
4061 /* We first look in the admin's directories in /etc and /run */
4062 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4063 _cleanup_free_ char *j = NULL;
4064
4065 j = path_join(i, fn);
4066 if (!j)
4067 return log_oom();
4068
4069 f = fopen(j, "re");
4070 if (f) {
4071 p = TAKE_PTR(j);
4072
4073 /* By default, we trust configuration from /etc and /run */
4074 if (arg_settings_trusted < 0)
4075 arg_settings_trusted = true;
4076
4077 break;
4078 }
4079
4080 if (errno != ENOENT)
4081 return log_error_errno(errno, "Failed to open %s: %m", j);
4082 }
4083
4084 if (!f) {
4085 /* After that, let's look for a file next to the
4086 * actual image we shall boot. */
4087
4088 if (arg_image) {
4089 p = file_in_same_dir(arg_image, fn);
4090 if (!p)
4091 return log_oom();
4092 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4093 p = file_in_same_dir(arg_directory, fn);
4094 if (!p)
4095 return log_oom();
4096 }
4097
4098 if (p) {
4099 f = fopen(p, "re");
4100 if (!f && errno != ENOENT)
4101 return log_error_errno(errno, "Failed to open %s: %m", p);
4102
4103 /* By default, we do not trust configuration from /var/lib/machines */
4104 if (arg_settings_trusted < 0)
4105 arg_settings_trusted = false;
4106 }
4107 }
4108
4109 if (!f)
4110 return 0;
4111
4112 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4113
4114 r = settings_load(f, p, &settings);
4115 if (r < 0)
4116 return r;
4117
4118 return merge_settings(settings, p);
4119 }
4120
4121 static int load_oci_bundle(void) {
4122 _cleanup_(settings_freep) Settings *settings = NULL;
4123 int r;
4124
4125 if (!arg_oci_bundle)
4126 return 0;
4127
4128 /* By default let's trust OCI bundles */
4129 if (arg_settings_trusted < 0)
4130 arg_settings_trusted = true;
4131
4132 r = oci_load(NULL, arg_oci_bundle, &settings);
4133 if (r < 0)
4134 return r;
4135
4136 return merge_settings(settings, arg_oci_bundle);
4137 }
4138
4139 static int run_container(
4140 DissectedImage *dissected_image,
4141 bool secondary,
4142 FDSet *fds,
4143 char veth_name[IFNAMSIZ], bool *veth_created,
4144 union in_addr_union *exposed,
4145 int *master, pid_t *pid, int *ret) {
4146
4147 static const struct sigaction sa = {
4148 .sa_handler = nop_signal_handler,
4149 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4150 };
4151
4152 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4153 _cleanup_close_ int etc_passwd_lock = -1;
4154 _cleanup_close_pair_ int
4155 kmsg_socket_pair[2] = { -1, -1 },
4156 rtnl_socket_pair[2] = { -1, -1 },
4157 pid_socket_pair[2] = { -1, -1 },
4158 uuid_socket_pair[2] = { -1, -1 },
4159 notify_socket_pair[2] = { -1, -1 },
4160 uid_shift_socket_pair[2] = { -1, -1 },
4161 master_pty_socket_pair[2] = { -1, -1 },
4162 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4163
4164 _cleanup_close_ int notify_socket = -1;
4165 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4166 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4167 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4168 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4169 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4170 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4171 ContainerStatus container_status = 0;
4172 int ifi = 0, r;
4173 ssize_t l;
4174 sigset_t mask_chld;
4175 _cleanup_close_ int netns_fd = -1;
4176
4177 assert_se(sigemptyset(&mask_chld) == 0);
4178 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4179
4180 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4181 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4182 * check with getpwuid() if the specific user already exists. Note that /etc might be
4183 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4184 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4185 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4186 * really ours. */
4187
4188 etc_passwd_lock = take_etc_passwd_lock(NULL);
4189 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4190 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4191 }
4192
4193 r = barrier_create(&barrier);
4194 if (r < 0)
4195 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4196
4197 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4198 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4199
4200 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4201 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4202
4203 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4204 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4205
4206 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4207 return log_error_errno(errno, "Failed to create id socket pair: %m");
4208
4209 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4210 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4211
4212 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4213 return log_error_errno(errno, "Failed to create console socket pair: %m");
4214
4215 if (arg_userns_mode != USER_NAMESPACE_NO)
4216 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4217 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4218
4219 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4220 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4221 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4222
4223 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4224 * parent's blocking calls and give it a chance to call wait() and terminate. */
4225 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4226 if (r < 0)
4227 return log_error_errno(errno, "Failed to change the signal mask: %m");
4228
4229 r = sigaction(SIGCHLD, &sa, NULL);
4230 if (r < 0)
4231 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4232
4233 if (arg_network_namespace_path) {
4234 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4235 if (netns_fd < 0)
4236 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4237
4238 r = fd_is_network_ns(netns_fd);
4239 if (r == -EUCLEAN)
4240 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4241 else if (r < 0)
4242 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4243 else if (r == 0)
4244 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4245 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4246 }
4247
4248 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4249 if (*pid < 0)
4250 return log_error_errno(errno, "clone() failed%s: %m",
4251 errno == EINVAL ?
4252 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4253
4254 if (*pid == 0) {
4255 /* The outer child only has a file system namespace. */
4256 barrier_set_role(&barrier, BARRIER_CHILD);
4257
4258 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4259 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4260 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4261 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4262 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4263 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4264 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4265 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4266
4267 (void) reset_all_signal_handlers();
4268 (void) reset_signal_mask();
4269
4270 r = outer_child(&barrier,
4271 arg_directory,
4272 dissected_image,
4273 secondary,
4274 pid_socket_pair[1],
4275 uuid_socket_pair[1],
4276 notify_socket_pair[1],
4277 kmsg_socket_pair[1],
4278 rtnl_socket_pair[1],
4279 uid_shift_socket_pair[1],
4280 master_pty_socket_pair[1],
4281 unified_cgroup_hierarchy_socket_pair[1],
4282 fds,
4283 netns_fd);
4284 if (r < 0)
4285 _exit(EXIT_FAILURE);
4286
4287 _exit(EXIT_SUCCESS);
4288 }
4289
4290 barrier_set_role(&barrier, BARRIER_PARENT);
4291
4292 fdset_close(fds);
4293
4294 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4295 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4296 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4297 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4298 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4299 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4300 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4301 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4302
4303 if (arg_userns_mode != USER_NAMESPACE_NO) {
4304 /* The child just let us know the UID shift it might have read from the image. */
4305 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4306 if (l < 0)
4307 return log_error_errno(errno, "Failed to read UID shift: %m");
4308 if (l != sizeof arg_uid_shift)
4309 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4310
4311 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4312 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4313 * image, but if that's already in use, pick a new one, and report back to the child,
4314 * which one we now picked. */
4315
4316 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4317 if (r < 0)
4318 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4319
4320 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4321 if (l < 0)
4322 return log_error_errno(errno, "Failed to send UID shift: %m");
4323 if (l != sizeof arg_uid_shift)
4324 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4325 }
4326 }
4327
4328 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4329 /* The child let us know the support cgroup mode it might have read from the image. */
4330 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4331 if (l < 0)
4332 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4333 if (l != sizeof(arg_unified_cgroup_hierarchy))
4334 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4335 l, l == 0 ? " The child is most likely dead." : "");
4336 }
4337
4338 /* Wait for the outer child. */
4339 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4340 if (r < 0)
4341 return r;
4342 if (r != EXIT_SUCCESS)
4343 return -EIO;
4344
4345 /* And now retrieve the PID of the inner child. */
4346 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4347 if (l < 0)
4348 return log_error_errno(errno, "Failed to read inner child PID: %m");
4349 if (l != sizeof *pid)
4350 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4351
4352 /* We also retrieve container UUID in case it was generated by outer child */
4353 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4354 if (l < 0)
4355 return log_error_errno(errno, "Failed to read container machine ID: %m");
4356 if (l != sizeof(arg_uuid))
4357 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4358
4359 /* We also retrieve the socket used for notifications generated by outer child */
4360 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4361 if (notify_socket < 0)
4362 return log_error_errno(notify_socket,
4363 "Failed to receive notification socket from the outer child: %m");
4364
4365 log_debug("Init process invoked as PID "PID_FMT, *pid);
4366
4367 if (arg_userns_mode != USER_NAMESPACE_NO) {
4368 if (!barrier_place_and_sync(&barrier)) /* #1 */
4369 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4370
4371 r = setup_uid_map(*pid);
4372 if (r < 0)
4373 return r;
4374
4375 (void) barrier_place(&barrier); /* #2 */
4376 }
4377
4378 if (arg_private_network) {
4379 if (!arg_network_namespace_path) {
4380 /* Wait until the child has unshared its network namespace. */
4381 if (!barrier_place_and_sync(&barrier)) /* #3 */
4382 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4383 }
4384
4385 r = move_network_interfaces(*pid, arg_network_interfaces);
4386 if (r < 0)
4387 return r;
4388
4389 if (arg_network_veth) {
4390 r = setup_veth(arg_machine, *pid, veth_name,
4391 arg_network_bridge || arg_network_zone);
4392 if (r < 0)
4393 return r;
4394 else if (r > 0)
4395 ifi = r;
4396
4397 if (arg_network_bridge) {
4398 /* Add the interface to a bridge */
4399 r = setup_bridge(veth_name, arg_network_bridge, false);
4400 if (r < 0)
4401 return r;
4402 if (r > 0)
4403 ifi = r;
4404 } else if (arg_network_zone) {
4405 /* Add the interface to a bridge, possibly creating it */
4406 r = setup_bridge(veth_name, arg_network_zone, true);
4407 if (r < 0)
4408 return r;
4409 if (r > 0)
4410 ifi = r;
4411 }
4412 }
4413
4414 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4415 if (r < 0)
4416 return r;
4417
4418 /* We created the primary and extra veth links now; let's remember this, so that we know to
4419 remove them later on. Note that we don't bother with removing veth links that were created
4420 here when their setup failed half-way, because in that case the kernel should be able to
4421 remove them on its own, since they cannot be referenced by anything yet. */
4422 *veth_created = true;
4423
4424 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4425 if (r < 0)
4426 return r;
4427
4428 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4429 if (r < 0)
4430 return r;
4431 }
4432
4433 if (arg_register || !arg_keep_unit) {
4434 r = sd_bus_default_system(&bus);
4435 if (r < 0)
4436 return log_error_errno(r, "Failed to open system bus: %m");
4437
4438 r = sd_bus_set_close_on_exit(bus, false);
4439 if (r < 0)
4440 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4441 }
4442
4443 if (!arg_keep_unit) {
4444 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4445 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4446 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4447
4448 r = sd_bus_match_signal_async(
4449 bus,
4450 NULL,
4451 "org.freedesktop.systemd1",
4452 NULL,
4453 "org.freedesktop.systemd1.Scope",
4454 "RequestStop",
4455 on_request_stop, NULL, PID_TO_PTR(*pid));
4456 if (r < 0)
4457 return log_error_errno(r, "Failed to request RequestStop match: %m");
4458 }
4459
4460 if (arg_register) {
4461 r = register_machine(
4462 bus,
4463 arg_machine,
4464 *pid,
4465 arg_directory,
4466 arg_uuid,
4467 ifi,
4468 arg_slice,
4469 arg_custom_mounts, arg_n_custom_mounts,
4470 arg_kill_signal,
4471 arg_property,
4472 arg_property_message,
4473 arg_keep_unit,
4474 arg_container_service_name);
4475 if (r < 0)
4476 return r;
4477
4478 } else if (!arg_keep_unit) {
4479 r = allocate_scope(
4480 bus,
4481 arg_machine,
4482 *pid,
4483 arg_slice,
4484 arg_custom_mounts, arg_n_custom_mounts,
4485 arg_kill_signal,
4486 arg_property,
4487 arg_property_message);
4488 if (r < 0)
4489 return r;
4490
4491 } else if (arg_slice || arg_property)
4492 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
4493
4494 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
4495 if (r < 0)
4496 return r;
4497
4498 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4499 if (r < 0)
4500 return r;
4501
4502 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4503 if (r < 0)
4504 return r;
4505
4506 /* Notify the child that the parent is ready with all
4507 * its setup (including cgroup-ification), and that
4508 * the child can now hand over control to the code to
4509 * run inside the container. */
4510 (void) barrier_place(&barrier); /* #4 */
4511
4512 /* Block SIGCHLD here, before notifying child.
4513 * process_pty() will handle it with the other signals. */
4514 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4515
4516 /* Reset signal to default */
4517 r = default_signals(SIGCHLD, -1);
4518 if (r < 0)
4519 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4520
4521 r = sd_event_new(&event);
4522 if (r < 0)
4523 return log_error_errno(r, "Failed to get default event source: %m");
4524
4525 (void) sd_event_set_watchdog(event, true);
4526
4527 if (bus) {
4528 r = sd_bus_attach_event(bus, event, 0);
4529 if (r < 0)
4530 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4531 }
4532
4533 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
4534 if (r < 0)
4535 return r;
4536
4537 /* Let the child know that we are ready and wait that the child is completely ready now. */
4538 if (!barrier_place_and_sync(&barrier)) /* #5 */
4539 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4540
4541 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4542 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4543 etc_passwd_lock = safe_close(etc_passwd_lock);
4544
4545 (void) sd_notifyf(false,
4546 "STATUS=Container running.\n"
4547 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4548 if (!arg_notify_ready)
4549 (void) sd_notify(false, "READY=1\n");
4550
4551 if (arg_kill_signal > 0) {
4552 /* Try to kill the init system on SIGINT or SIGTERM */
4553 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4554 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4555 } else {
4556 /* Immediately exit */
4557 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4558 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4559 }
4560
4561 /* Exit when the child exits */
4562 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4563
4564 if (arg_expose_ports) {
4565 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4566 if (r < 0)
4567 return r;
4568
4569 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4570 }
4571
4572 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4573
4574 if (arg_console_mode != CONSOLE_PIPE) {
4575 _cleanup_close_ int fd = -1;
4576 PTYForwardFlags flags = 0;
4577
4578 /* Retrieve the master pty allocated by inner child */
4579 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4580 if (fd < 0)
4581 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4582
4583 switch (arg_console_mode) {
4584
4585 case CONSOLE_READ_ONLY:
4586 flags |= PTY_FORWARD_READ_ONLY;
4587
4588 _fallthrough_;
4589
4590 case CONSOLE_INTERACTIVE:
4591 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4592
4593 r = pty_forward_new(event, fd, flags, &forward);
4594 if (r < 0)
4595 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4596
4597 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4598 (void) pty_forward_set_width_height(forward,
4599 arg_console_width,
4600 arg_console_height);
4601 break;
4602
4603 default:
4604 assert(arg_console_mode == CONSOLE_PASSIVE);
4605 }
4606
4607 *master = TAKE_FD(fd);
4608 }
4609
4610 r = sd_event_loop(event);
4611 if (r < 0)
4612 return log_error_errno(r, "Failed to run event loop: %m");
4613
4614 if (forward) {
4615 char last_char = 0;
4616
4617 (void) pty_forward_get_last_char(forward, &last_char);
4618 forward = pty_forward_free(forward);
4619
4620 if (!arg_quiet && last_char != '\n')
4621 putc('\n', stdout);
4622 }
4623
4624 /* Kill if it is not dead yet anyway */
4625 if (!arg_register && !arg_keep_unit && bus)
4626 terminate_scope(bus, arg_machine);
4627
4628 /* Normally redundant, but better safe than sorry */
4629 (void) kill(*pid, SIGKILL);
4630
4631 r = wait_for_container(*pid, &container_status);
4632 *pid = 0;
4633
4634 /* Tell machined that we are gone. */
4635 if (bus)
4636 (void) unregister_machine(bus, arg_machine);
4637
4638 if (r < 0)
4639 /* We failed to wait for the container, or the container exited abnormally. */
4640 return r;
4641 if (r > 0 || container_status == CONTAINER_TERMINATED) {
4642 /* r > 0 → The container exited with a non-zero status.
4643 * As a special case, we need to replace 133 with a different value,
4644 * because 133 is special-cased in the service file to reboot the container.
4645 * otherwise → The container exited with zero status and a reboot was not requested.
4646 */
4647 if (r == EXIT_FORCE_RESTART)
4648 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4649 *ret = r;
4650 return 0; /* finito */
4651 }
4652
4653 /* CONTAINER_REBOOTED, loop again */
4654
4655 if (arg_keep_unit) {
4656 /* Special handling if we are running as a service: instead of simply
4657 * restarting the machine we want to restart the entire service, so let's
4658 * inform systemd about this with the special exit code 133. The service
4659 * file uses RestartForceExitStatus=133 so that this results in a full
4660 * nspawn restart. This is necessary since we might have cgroup parameters
4661 * set we want to have flushed out. */
4662 *ret = EXIT_FORCE_RESTART;
4663 return 0; /* finito */
4664 }
4665
4666 expose_port_flush(arg_expose_ports, exposed);
4667
4668 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4669 *veth_created = false;
4670 return 1; /* loop again */
4671 }
4672
4673 static int initialize_rlimits(void) {
4674 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4675 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4676 * container execution environments. */
4677
4678 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4679 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4680 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4681 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4682 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4683 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4684 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4685 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4686 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4687 [RLIMIT_NICE] = { 0, 0 },
4688 [RLIMIT_NOFILE] = { 1024, 4096 },
4689 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4690 [RLIMIT_RTPRIO] = { 0, 0 },
4691 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4692 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4693
4694 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4695 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4696 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4697 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4698 * that PID 1 changes a number of other resource limits during early initialization which is why we
4699 * don't read the other limits from PID 1 but prefer the static table above. */
4700 };
4701
4702 int rl;
4703
4704 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4705 /* Let's only fill in what the user hasn't explicitly configured anyway */
4706 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4707 const struct rlimit *v;
4708 struct rlimit buffer;
4709
4710 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4711 /* For these two let's read the limits off PID 1. See above for an explanation. */
4712
4713 if (prlimit(1, rl, NULL, &buffer) < 0)
4714 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4715
4716 v = &buffer;
4717 } else
4718 v = kernel_defaults + rl;
4719
4720 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4721 if (!arg_rlimit[rl])
4722 return log_oom();
4723 }
4724
4725 if (DEBUG_LOGGING) {
4726 _cleanup_free_ char *k = NULL;
4727
4728 (void) rlimit_format(arg_rlimit[rl], &k);
4729 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4730 }
4731 }
4732
4733 return 0;
4734 }
4735
/* Main program logic, handed to DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() at the end of the file.
 *
 * Steps, in order:
 *   1. Parse the command line, check privileges, set up resource limits, load the OCI bundle and the
 *      settings, and verify the resulting arguments.
 *   2. Prepare the container root: either a directory tree (optionally snapshotted for ephemeral or
 *      template use), or a disk image (optionally copied for ephemeral use) that is attached to a
 *      loopback device, dissected and decrypted.
 *   3. Call run_container() in a loop until it returns <= 0.
 *   4. At the "finish" label, which every error path jumps to: kill and reap the child recorded in
 *      "pid", drain the pty, remove ephemeral copies and the temporary root directory, flush exposed
 *      ports, remove network links, and free the argument arrays.
 *
 * Returns r if it is negative; otherwise ret, which starts out as EXIT_SUCCESS and is passed to
 * run_container() by pointer. */
4736 static int run(int argc, char *argv[]) {
/* The remove_* flags and veth_created record which cleanup steps the "finish" section has to perform.
 * NOTE(review): "secondary" is never modified in this function, so false is passed to run_container()
 * on every loop iteration -- confirm whether that is intended. */
4737 bool secondary = false, remove_directory = false, remove_image = false,
4738 veth_created = false, remove_tmprootdir = false;
4739 _cleanup_close_ int master = -1;
4740 _cleanup_fdset_free_ FDSet *fds = NULL;
4741 int r, n_fd_passed, ret = EXIT_SUCCESS;
4742 char veth_name[IFNAMSIZ] = "";
4743 union in_addr_union exposed = {};
4744 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
/* Template that mkdtemp() fills in below; only used when running from a disk image. */
4745 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4746 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4747 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4748 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4749 pid_t pid = 0;
4750
4751 log_parse_environment();
4752 log_open();
4753
/* r <= 0 goes straight to cleanup: a negative value is returned as the error, while zero ends up
 * returning ret, which is still EXIT_SUCCESS at this point. */
4754 r = parse_argv(argc, argv);
4755 if (r <= 0)
4756 goto finish;
4757
4758 r = must_be_root();
4759 if (r < 0)
4760 goto finish;
4761
4762 r = initialize_rlimits();
4763 if (r < 0)
4764 goto finish;
4765
4766 r = load_oci_bundle();
4767 if (r < 0)
4768 goto finish;
4769
4770 r = determine_names();
4771 if (r < 0)
4772 goto finish;
4773
4774 r = load_settings();
4775 if (r < 0)
4776 goto finish;
4777
4778 r = cg_unified();
4779 if (r < 0) {
4780 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4781 goto finish;
4782 }
4783
4784 r = verify_arguments();
4785 if (r < 0)
4786 goto finish;
4787
4788 /* Reapply environment settings. */
4789 (void) detect_unified_cgroup_hierarchy_from_environment();
4790
4791 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4792 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4793 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4794 (void) ignore_signals(SIGPIPE, -1);
4795
/* Only collect an fd set if at least one file descriptor was passed in; a zero or negative result
 * from sd_listen_fds() leaves "fds" as NULL. */
4796 n_fd_passed = sd_listen_fds(false);
4797 if (n_fd_passed > 0) {
4798 r = fdset_new_listen_fds(&fds, false);
4799 if (r < 0) {
4800 log_error_errno(r, "Failed to collect file descriptors: %m");
4801 goto finish;
4802 }
4803 }
4804
4805 /* The "default" umask. This is appropriate for most file and directory
4806 * operations performed by nspawn, and is the umask that will be used for
4807 * the child. Functions like copy_devnodes() change the umask temporarily. */
4808 umask(0022);
4809
/* Case 1: the container root is a directory tree. */
4810 if (arg_directory) {
4811 assert(!arg_image);
4812
4813 /* Safety precaution: let's not allow running images from the live host OS image, as long as
4814 * /var from the host will propagate into container dynamically (because bad things happen if
4815 * two systems write to the same /var). Let's allow it for the special cases where /var is
4816 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
4817 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
4818 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
4819 r = -EINVAL;
4820 goto finish;
4821 }
4822
4823 if (arg_ephemeral) {
/* Name of the snapshot to create; takes over the role of arg_directory once the snapshot exists. */
4824 _cleanup_free_ char *np = NULL;
4825
4826 r = chase_symlinks_and_update(&arg_directory, 0);
4827 if (r < 0)
4828 goto finish;
4829
4830 /* If the specified path is a mount point we generate the new snapshot immediately
4831 * inside it under a random name. However if the specified is not a mount point we
4832 * create the new snapshot in the parent directory, just next to it. */
4833 r = path_is_mount_point(arg_directory, NULL, 0);
4834 if (r < 0) {
4835 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4836 goto finish;
4837 }
4838 if (r > 0)
4839 r = tempfn_random_child(arg_directory, "machine.", &np);
4840 else
4841 r = tempfn_random(arg_directory, "machine.", &np);
4842 if (r < 0) {
4843 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4844 goto finish;
4845 }
4846
4847 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
4848 * only owned by us and no one else. */
4849 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4850 if (r < 0) {
4851 log_error_errno(r, "Failed to lock %s: %m", np);
4852 goto finish;
4853 }
4854
/* The extra braces presumably limit how long BLOCK_SIGNALS(SIGINT) stays in effect to the snapshot
 * call; BTRFS_SNAPSHOT_SIGINT is passed along and an -EINTR result is handled right after the
 * block. TODO confirm the exact semantics against the macro and btrfs-util. */
4855 {
4856 BLOCK_SIGNALS(SIGINT);
4857 r = btrfs_subvol_snapshot(arg_directory, np,
4858 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4859 BTRFS_SNAPSHOT_FALLBACK_COPY |
4860 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4861 BTRFS_SNAPSHOT_RECURSIVE |
4862 BTRFS_SNAPSHOT_QUOTA |
4863 BTRFS_SNAPSHOT_SIGINT);
4864 }
4865 if (r == -EINTR) {
4866 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
4867 goto finish;
4868 }
4869 if (r < 0) {
4870 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4871 goto finish;
4872 }
4873
/* From here on the container runs from the snapshot, which the "finish" section removes again. */
4874 free_and_replace(arg_directory, np);
4875 remove_directory = true;
4876 } else {
/* Presumably CHASE_NONEXISTENT tolerates a not-yet-existing target directory, which is about to be
 * populated from the template -- TODO confirm. */
4877 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4878 if (r < 0)
4879 goto finish;
4880
/* Shared lock for read-only use, exclusive lock otherwise, always with LOCK_NB; -EBUSY gets its own
 * message below. */
4881 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4882 if (r == -EBUSY) {
4883 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4884 goto finish;
4885 }
4886 if (r < 0) {
4887 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4888 goto finish;
4889 }
4890
4891 if (arg_template) {
4892 r = chase_symlinks_and_update(&arg_template, 0);
4893 if (r < 0)
4894 goto finish;
4895
4896 {
4897 BLOCK_SIGNALS(SIGINT);
4898 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4899 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4900 BTRFS_SNAPSHOT_FALLBACK_COPY |
4901 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4902 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4903 BTRFS_SNAPSHOT_RECURSIVE |
4904 BTRFS_SNAPSHOT_QUOTA |
4905 BTRFS_SNAPSHOT_SIGINT);
4906 }
/* -EEXIST is not an error here: the existing directory is used as-is and only a message is logged. */
4907 if (r == -EEXIST)
4908 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4909 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4910 else if (r == -EINTR) {
4911 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
4912 goto finish;
4913 } else if (r < 0) {
4914 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4915 goto finish;
4916 } else
4917 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4918 "Populated %s from template %s.", arg_directory, arg_template);
4919 }
4920 }
4921
/* Sanity-check the tree before starting anything in it. If a new pivot root is configured, the check
 * is applied to that path below arg_directory rather than to arg_directory itself. */
4922 if (arg_start_mode == START_BOOT) {
4923 const char *p;
4924
4925 if (arg_pivot_root_new)
4926 p = prefix_roota(arg_directory, arg_pivot_root_new);
4927 else
4928 p = arg_directory;
4929
/* Note that a negative (error) result from path_is_os_tree() is reported with the same message as a
 * plain "no". */
4930 if (path_is_os_tree(p) <= 0) {
4931 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4932 r = -EINVAL;
4933 goto finish;
4934 }
4935 } else {
4936 const char *p, *q;
4937
4938 if (arg_pivot_root_new)
4939 p = prefix_roota(arg_directory, arg_pivot_root_new);
4940 else
4941 p = arg_directory;
4942
/* In the other start modes it is enough that "usr/" is accessible below the root. */
4943 q = strjoina(p, "/usr/");
4944
4945 if (laccess(q, F_OK) < 0) {
4946 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4947 r = -EINVAL;
4948 goto finish;
4949 }
4950 }
4951
4952 } else {
/* Case 2: the container root is a disk image. arg_directory is unset on entry to this branch and is
 * pointed at a freshly created temporary directory further down. */
4953 assert(arg_image);
4954 assert(!arg_template);
4955
4956 r = chase_symlinks_and_update(&arg_image, 0);
4957 if (r < 0)
4958 goto finish;
4959
4960 if (arg_ephemeral) {
/* Name of the private image copy; takes over the role of arg_image once the copy exists. */
4961 _cleanup_free_ char *np = NULL;
4962
4963 r = tempfn_random(arg_image, "machine.", &np);
4964 if (r < 0) {
4965 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4966 goto finish;
4967 }
4968
4969 /* Always take an exclusive lock on our own ephemeral copy. */
4970 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4971 if (r < 0) {
4972 r = log_error_errno(r, "Failed to create image lock: %m");
4973 goto finish;
4974 }
4975
/* The copy is created with O_EXCL and with mode 0400 for read-only use, 0600 otherwise. */
4976 {
4977 BLOCK_SIGNALS(SIGINT);
4978 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
4979 }
4980 if (r == -EINTR) {
4981 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
4982 goto finish;
4983 }
4984 if (r < 0) {
4985 r = log_error_errno(r, "Failed to copy image file: %m");
4986 goto finish;
4987 }
4988
4989 free_and_replace(arg_image, np);
4990 remove_image = true;
4991 } else {
4992 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4993 if (r == -EBUSY) {
4994 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4995 goto finish;
4996 }
4997 if (r < 0) {
4998 r = log_error_errno(r, "Failed to create image lock: %m");
4999 goto finish;
5000 }
5001
/* No root hash set so far: try to load one for this image. Note that this only happens in the
 * non-ephemeral branch. */
5002 if (!arg_root_hash) {
5003 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
5004 if (r < 0) {
5005 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
5006 goto finish;
5007 }
5008 }
5009 }
5010
/* Create the temporary root directory; remove_tmprootdir is set right away so that the "finish"
 * section removes it again even if a later step fails. */
5011 if (!mkdtemp(tmprootdir)) {
5012 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5013 goto finish;
5014 }
5015
5016 remove_tmprootdir = true;
5017
5018 arg_directory = strdup(tmprootdir);
5019 if (!arg_directory) {
5020 r = log_oom();
5021 goto finish;
5022 }
5023
/* The image is opened read-only or read-write to match arg_read_only. */
5024 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
5025 if (r < 0) {
5026 log_error_errno(r, "Failed to set up loopback block device: %m");
5027 goto finish;
5028 }
5029
5030 r = dissect_image_and_warn(
5031 loop->fd,
5032 arg_image,
5033 arg_root_hash, arg_root_hash_size,
5034 DISSECT_IMAGE_REQUIRE_ROOT,
5035 &dissected_image);
5036 if (r == -ENOPKG) {
5037 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
5038 log_notice("Note that the disk image needs to\n"
5039 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5040 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5041 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
5042 " d) or contain a file system without a partition table\n"
5043 "in order to be bootable with systemd-nspawn.");
5044 goto finish;
5045 }
5046 if (r < 0)
5047 goto finish;
5048
5049 if (!arg_root_hash && dissected_image->can_verity)
5050 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5051
5052 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
5053 if (r < 0)
5054 goto finish;
5055
5056 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
/* If this early unlink() fails, remove_image stays set and the "finish" section tries again. */
5057 if (remove_image && unlink(arg_image) >= 0)
5058 remove_image = false;
5059 }
5060
5061 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5062 if (r < 0)
5063 goto finish;
5064
/* A negative console mode (presumably: none configured) is resolved here: interactive only if both
 * stdin and stdout are terminals, read-only otherwise. */
5065 if (arg_console_mode < 0)
5066 arg_console_mode =
5067 isatty(STDIN_FILENO) > 0 &&
5068 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5069
5070 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5071 arg_quiet = true;
5072
5073 if (!arg_quiet)
5074 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5075 arg_machine, arg_image ?: arg_directory);
5076
5077 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5078
5079 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5080 r = log_error_errno(errno, "Failed to become subreaper: %m");
5081 goto finish;
5082 }
5083
/* Keep calling run_container() for as long as it returns a positive value (presumably a request to
 * start the container again -- TODO confirm); zero or an error ends the loop. The veth, exposed-port,
 * pty master, pid and exit-status state is handed in by pointer so that the cleanup below can act
 * on it. */
5084 for (;;) {
5085 r = run_container(dissected_image,
5086 secondary,
5087 fds,
5088 veth_name, &veth_created,
5089 &exposed, &master,
5090 &pid, &ret);
5091 if (r <= 0)
5092 break;
5093 }
5094
/* Common exit path, reached both on success and from every error "goto" above. */
5095 finish:
/* The "Restarting..." status is sent only if r == 0 and ret == EXIT_FORCE_RESTART; every other
 * combination is reported as "Terminating...". */
5096 (void) sd_notify(false,
5097 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5098 "STOPPING=1\nSTATUS=Terminating...");
5099
/* Kill the child first, but reap it only after the pty has been drained below. */
5100 if (pid > 0)
5101 (void) kill(pid, SIGKILL);
5102
5103 /* Try to flush whatever is still queued in the pty */
5104 if (master >= 0) {
5105 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
5106 master = safe_close(master);
5107 }
5108
5109 if (pid > 0)
5110 (void) wait_for_terminate(pid, NULL);
5111
5112 pager_close();
5113
/* Removal failures below are only logged and do not change the return value. */
5114 if (remove_directory && arg_directory) {
5115 int k;
5116
5117 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5118 if (k < 0)
5119 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5120 }
5121
5122 if (remove_image && arg_image) {
5123 if (unlink(arg_image) < 0)
5124 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5125 }
5126
5127 if (remove_tmprootdir) {
5128 if (rmdir(tmprootdir) < 0)
5129 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5130 }
5131
/* Remove this machine's directory below /run/systemd/nspawn/propagate/, ignoring errors. */
5132 if (arg_machine) {
5133 const char *p;
5134
5135 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5136 (void) rm_rf(p, REMOVE_ROOT);
5137 }
5138
5139 expose_port_flush(arg_expose_ports, &exposed);
5140
/* The veth links are only removed if veth_created was set; remove_bridge() is called unconditionally. */
5141 if (veth_created)
5142 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5143 (void) remove_bridge(arg_network_zone);
5144
5145 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5146 expose_port_free_all(arg_expose_ports);
5147 rlimit_free_all(arg_rlimit);
5148 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5149
/* A negative r takes precedence over ret. */
5150 if (r < 0)
5151 return r;
5152
5153 return ret;
5154 }
5155
/* Hooks run() up as the program's main logic. run() returns either a negative error value or "ret",
 * which starts out as EXIT_SUCCESS. NOTE(review): judging by the macro's name, positive return values are
 * presumably treated as failure exit codes as well -- confirm against the macro definition in main-func.h. */
5156 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);