]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #13994 from keszybz/bpf-refactor
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/personality.h>
15 #include <sys/prctl.h>
16 #include <sys/types.h>
17 #include <sys/wait.h>
18 #include <unistd.h>
19
20 #include "sd-bus.h"
21 #include "sd-daemon.h"
22 #include "sd-id128.h"
23
24 #include "alloc-util.h"
25 #include "barrier.h"
26 #include "base-filesystem.h"
27 #include "blkid-util.h"
28 #include "btrfs-util.h"
29 #include "bus-error.h"
30 #include "bus-util.h"
31 #include "cap-list.h"
32 #include "capability-util.h"
33 #include "cgroup-util.h"
34 #include "copy.h"
35 #include "cpu-set-util.h"
36 #include "dev-setup.h"
37 #include "dissect-image.h"
38 #include "env-util.h"
39 #include "fd-util.h"
40 #include "fdset.h"
41 #include "fileio.h"
42 #include "format-util.h"
43 #include "fs-util.h"
44 #include "gpt.h"
45 #include "hexdecoct.h"
46 #include "hostname-util.h"
47 #include "id128-util.h"
48 #include "log.h"
49 #include "loop-util.h"
50 #include "loopback-setup.h"
51 #include "machine-image.h"
52 #include "macro.h"
53 #include "main-func.h"
54 #include "missing_sched.h"
55 #include "mkdir.h"
56 #include "mount-util.h"
57 #include "mountpoint-util.h"
58 #include "namespace-util.h"
59 #include "netlink-util.h"
60 #include "nspawn-cgroup.h"
61 #include "nspawn-def.h"
62 #include "nspawn-expose-ports.h"
63 #include "nspawn-mount.h"
64 #include "nspawn-network.h"
65 #include "nspawn-oci.h"
66 #include "nspawn-patch-uid.h"
67 #include "nspawn-register.h"
68 #include "nspawn-seccomp.h"
69 #include "nspawn-settings.h"
70 #include "nspawn-setuid.h"
71 #include "nspawn-stub-pid1.h"
72 #include "nulstr-util.h"
73 #include "os-util.h"
74 #include "pager.h"
75 #include "parse-util.h"
76 #include "path-util.h"
77 #include "pretty-print.h"
78 #include "process-util.h"
79 #include "ptyfwd.h"
80 #include "random-util.h"
81 #include "raw-clone.h"
82 #include "rlimit-util.h"
83 #include "rm-rf.h"
84 #if HAVE_SECCOMP
85 #include "seccomp-util.h"
86 #endif
87 #include "selinux-util.h"
88 #include "signal-util.h"
89 #include "socket-util.h"
90 #include "stat-util.h"
91 #include "stdio-util.h"
92 #include "string-table.h"
93 #include "string-util.h"
94 #include "strv.h"
95 #include "sysctl-util.h"
96 #include "terminal-util.h"
97 #include "tmpfile-util.h"
98 #include "umask-util.h"
99 #include "unit-name.h"
100 #include "user-util.h"
101 #include "util.h"
102
103 #if HAVE_SPLIT_USR
104 #define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
105 #else
106 #define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
107 #endif
108
109 /* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
110 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
111 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
112 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
113
114 #define EXIT_FORCE_RESTART 133
115
/* How a container run ended, going by the constant names: it either terminated or rebooted.
 * NOTE(review): the code that produces and consumes these values is outside this excerpt — confirm there. */
116 typedef enum ContainerStatus {
117 CONTAINER_TERMINATED,
118 CONTAINER_REBOOTED,
119 } ContainerStatus;
120
/* Runtime configuration. Each arg_* variable holds one setting, initialized here to its built-in default;
 * the environment and command line parsers further down in this file overwrite them. */
121 static char *arg_directory = NULL;
122 static char *arg_template = NULL;
123 static char *arg_chdir = NULL;
124 static char *arg_pivot_root_new = NULL;
125 static char *arg_pivot_root_old = NULL;
126 static char *arg_user = NULL;
127 static uid_t arg_uid = UID_INVALID;
128 static gid_t arg_gid = GID_INVALID;
129 static gid_t* arg_supplementary_gids = NULL;
130 static size_t arg_n_supplementary_gids = 0;
131 static sd_id128_t arg_uuid = {};
132 static char *arg_machine = NULL; /* The name used by the host to refer to this */
133 static char *arg_hostname = NULL; /* The name the payload sees by default */
/* Not owned: the -Z and -L handlers in parse_argv() store the getopt optarg pointer directly. */
134 static const char *arg_selinux_context = NULL;
135 static const char *arg_selinux_apifs_context = NULL;
136 static char *arg_slice = NULL;
137 static bool arg_private_network = false;
138 static bool arg_read_only = false;
139 static StartMode arg_start_mode = START_PID1;
140 static bool arg_ephemeral = false;
141 static LinkJournal arg_link_journal = LINK_AUTO;
142 static bool arg_link_journal_try = false;
/* Default capability bitmask, one bit per CAP_* number.
 * NOTE(review): presumably the set retained for the container payload, which --capability= and
 * --drop-capability= adjust — confirm where the parsed plus/minus masks are applied. */
143 static uint64_t arg_caps_retain =
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
155 (1ULL << CAP_MKNOD) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETFCAP) |
160 (1ULL << CAP_SETGID) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_BOOT) |
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
168 (1ULL << CAP_SYS_RESOURCE) |
169 (1ULL << CAP_SYS_TTY_CONFIG);
170 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
171 static CustomMount *arg_custom_mounts = NULL;
172 static size_t arg_n_custom_mounts = 0;
173 static char **arg_setenv = NULL;
174 static bool arg_quiet = false;
175 static bool arg_register = true;
176 static bool arg_keep_unit = false;
177 static char **arg_network_interfaces = NULL;
178 static char **arg_network_macvlan = NULL;
179 static char **arg_network_ipvlan = NULL;
180 static bool arg_network_veth = false;
181 static char **arg_network_veth_extra = NULL;
182 static char *arg_network_bridge = NULL;
183 static char *arg_network_zone = NULL;
184 static char *arg_network_namespace_path = NULL;
185 static PagerFlags arg_pager_flags = 0;
186 static unsigned long arg_personality = PERSONALITY_INVALID;
187 static char *arg_image = NULL;
188 static char *arg_oci_bundle = NULL;
189 static VolatileMode arg_volatile_mode = VOLATILE_NO;
190 static ExposePort *arg_expose_ports = NULL;
191 static char **arg_property = NULL;
192 static sd_bus_message *arg_property_message = NULL;
193 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
194 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
195 static bool arg_userns_chown = false;
196 static int arg_kill_signal = 0;
/* Stays CGROUP_UNIFIED_UNKNOWN until one of the detect_unified_cgroup_hierarchy_from_*() functions sets it. */
197 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bitmask of SETTING_* flags; the parsers OR in a flag when they set the corresponding setting. */
198 static SettingsMask arg_settings_mask = 0;
199 static int arg_settings_trusted = -1;
200 static char **arg_parameters = NULL;
/* Replaced by the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment(), if that is set. */
201 static const char *arg_container_service_name = "systemd-nspawn";
202 static bool arg_notify_ready = false;
/* Forced to false in parse_environment() when cg_ns_supported() reports no support; otherwise
 * $SYSTEMD_NSPAWN_USE_CGNS decides. */
203 static bool arg_use_cgns = true;
/* CLONE_NEW* flags; parse_share_ns_env() clears or sets individual bits based on the
 * $SYSTEMD_NSPAWN_SHARE_* variables. */
204 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
/* Adjusted by $SYSTEMD_NSPAWN_TMPFS_TMP and $SYSTEMD_NSPAWN_API_VFS_WRITABLE in parse_mount_settings_env(). */
205 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
206 static void *arg_root_hash = NULL;
207 static size_t arg_root_hash_size = 0;
208 static char **arg_syscall_whitelist = NULL;
209 static char **arg_syscall_blacklist = NULL;
210 #if HAVE_SECCOMP
211 static scmp_filter_ctx arg_seccomp = NULL;
212 #endif
213 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
214 static bool arg_no_new_privileges = false;
215 static int arg_oom_score_adjust = 0;
216 static bool arg_oom_score_adjust_set = false;
217 static CPUSet arg_cpu_set = {};
218 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
219 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
220 static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
221 static DeviceNode* arg_extra_nodes = NULL;
222 static size_t arg_n_extra_nodes = 0;
223 static char **arg_sysctl = NULL;
/* Set by handle_arg_console(). */
224 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
225
/* Pairs each global that owns memory (or another resource) with the function that releases it.
 * NOTE(review): STATIC_DESTRUCTOR_REGISTER is defined outside this excerpt; presumably the registered
 * functions are run when the program exits — confirm against the macro's definition.
 * arg_selinux_context and arg_selinux_apifs_context are not listed: as parse_argv() shows, they alias
 * optarg rather than owning an allocation. */
226 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
227 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
228 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
229 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
230 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
231 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
232 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
233 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
234 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
235 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
236 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
237 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
238 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
239 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
240 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
248 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
252 #if HAVE_SECCOMP
253 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
254 #endif
255 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
256 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
257
258 static int handle_arg_console(const char *arg) {
259 if (streq(arg, "help")) {
260 puts("interactive\n"
261 "read-only\n"
262 "passive\n"
263 "pipe");
264 return 0;
265 }
266
267 if (streq(arg, "interactive"))
268 arg_console_mode = CONSOLE_INTERACTIVE;
269 else if (streq(arg, "read-only"))
270 arg_console_mode = CONSOLE_READ_ONLY;
271 else if (streq(arg, "passive"))
272 arg_console_mode = CONSOLE_PASSIVE;
273 else if (streq(arg, "pipe"))
274 arg_console_mode = CONSOLE_PIPE;
275 else
276 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
277
278 arg_settings_mask |= SETTING_CONSOLE_MODE;
279 return 1;
280 }
281
282 static int help(void) {
283 _cleanup_free_ char *link = NULL;
284 int r;
285
286 (void) pager_open(arg_pager_flags);
287
288 r = terminal_urlify_man("systemd-nspawn", "1", &link);
289 if (r < 0)
290 return log_oom();
291
292 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
293 "Spawn a command or OS in a light-weight container.\n\n"
294 " -h --help Show this help\n"
295 " --version Print version string\n"
296 " -q --quiet Do not show status information\n"
297 " --no-pager Do not pipe output into a pager\n"
298 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
299 "%3$sImage:%4$s\n"
300 " -D --directory=PATH Root directory for the container\n"
301 " --template=PATH Initialize root directory from template directory,\n"
302 " if missing\n"
303 " -x --ephemeral Run container with snapshot of root directory, and\n"
304 " remove it after exit\n"
305 " -i --image=PATH Root file system disk image (or device node) for\n"
306 " the container\n"
307 " --oci-bundle=PATH OCI bundle directory\n"
308 " --read-only Mount the root directory read-only\n"
309 " --volatile[=MODE] Run the system in volatile mode\n"
310 " --root-hash=HASH Specify verity root hash for root disk image\n"
311 " --pivot-root=PATH[:PATH]\n"
312 " Pivot root to given directory in the container\n\n"
313 "%3$sExecution:%4$s\n"
314 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
315 " -b --boot Boot up full system (i.e. invoke init)\n"
316 " --chdir=PATH Set working directory in the container\n"
317 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
318 " -u --user=USER Run the command under specified user or UID\n"
319 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
320 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
321 "%3$sSystem Identity:%4$s\n"
322 " -M --machine=NAME Set the machine name for the container\n"
323 " --hostname=NAME Override the hostname for the container\n"
324 " --uuid=UUID Set a specific machine UUID for the container\n\n"
325 "%3$sProperties:%4$s\n"
326 " -S --slice=SLICE Place the container in the specified slice\n"
327 " --property=NAME=VALUE Set scope unit property\n"
328 " --register=BOOLEAN Register container as machine\n"
329 " --keep-unit Do not register a scope for the machine, reuse\n"
330 " the service unit nspawn is running in\n\n"
331 "%3$sUser Namespacing:%4$s\n"
332 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
333 " --private-users[=UIDBASE[:NUIDS]]\n"
334 " Similar, but with user configured UID/GID range\n"
335 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
336 "%3$sNetworking:%4$s\n"
337 " --private-network Disable network in container\n"
338 " --network-interface=INTERFACE\n"
339 " Assign an existing network interface to the\n"
340 " container\n"
341 " --network-macvlan=INTERFACE\n"
342 " Create a macvlan network interface based on an\n"
343 " existing network interface to the container\n"
344 " --network-ipvlan=INTERFACE\n"
345 " Create a ipvlan network interface based on an\n"
346 " existing network interface to the container\n"
347 " -n --network-veth Add a virtual Ethernet connection between host\n"
348 " and container\n"
349 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
350 " Add an additional virtual Ethernet link between\n"
351 " host and container\n"
352 " --network-bridge=INTERFACE\n"
353 " Add a virtual Ethernet connection to the container\n"
354 " and attach it to an existing bridge on the host\n"
355 " --network-zone=NAME Similar, but attach the new interface to an\n"
356 " an automatically managed bridge interface\n"
357 " --network-namespace-path=PATH\n"
358 " Set network namespace to the one represented by\n"
359 " the specified kernel namespace file node\n"
360 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
361 " Expose a container IP port on the host\n\n"
362 "%3$sSecurity:%4$s\n"
363 " --capability=CAP In addition to the default, retain specified\n"
364 " capability\n"
365 " --drop-capability=CAP Drop the specified capability from the default set\n"
366 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
367 " --system-call-filter=LIST|~LIST\n"
368 " Permit/prohibit specific system calls\n"
369 " -Z --selinux-context=SECLABEL\n"
370 " Set the SELinux security context to be used by\n"
371 " processes in the container\n"
372 " -L --selinux-apifs-context=SECLABEL\n"
373 " Set the SELinux security context to be used by\n"
374 " API/tmpfs file systems in the container\n\n"
375 "%3$sResources:%4$s\n"
376 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
377 " --oom-score-adjust=VALUE\n"
378 " Adjust the OOM score value for the payload\n"
379 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
380 " --personality=ARCH Pick personality for this container\n\n"
381 "%3$sIntegration:%4$s\n"
382 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
383 " --timezone=MODE Select mode of /etc/localtime initialization\n"
384 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
385 " host, try-guest, try-host\n"
386 " -j Equivalent to --link-journal=try-guest\n\n"
387 "%3$sMounts:%4$s\n"
388 " --bind=PATH[:PATH[:OPTIONS]]\n"
389 " Bind mount a file or directory from the host into\n"
390 " the container\n"
391 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
392 " Similar, but creates a read-only bind mount\n"
393 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
394 " it\n"
395 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
396 " --overlay=PATH[:PATH...]:PATH\n"
397 " Create an overlay mount from the host to \n"
398 " the container\n"
399 " --overlay-ro=PATH[:PATH...]:PATH\n"
400 " Similar, but creates a read-only overlay mount\n\n"
401 "%3$sInput/Output:%4$s\n"
402 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
403 " set up for the container.\n"
404 " -P --pipe Equivalent to --console=pipe\n"
405 "\nSee the %2$s for details.\n"
406 , program_invocation_short_name
407 , link
408 , ansi_underline(), ansi_normal());
409
410 return 0;
411 }
412
413 static int custom_mount_check_all(void) {
414 size_t i;
415
416 for (i = 0; i < arg_n_custom_mounts; i++) {
417 CustomMount *m = &arg_custom_mounts[i];
418
419 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
420 if (arg_userns_chown)
421 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
422 "--private-users-chown may not be combined with custom root mounts.");
423 else if (arg_uid_shift == UID_INVALID)
424 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
425 "--private-users with automatic UID shift may not be combined with custom root mounts.");
426 }
427 }
428
429 return 0;
430 }
431
432 static int detect_unified_cgroup_hierarchy_from_environment(void) {
433 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
434 int r;
435
436 /* Allow the user to control whether the unified hierarchy is used */
437
438 e = getenv(var);
439 if (!e) {
440 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
441 var = "UNIFIED_CGROUP_HIERARCHY";
442 e = getenv(var);
443 }
444
445 if (!isempty(e)) {
446 r = parse_boolean(e);
447 if (r < 0)
448 return log_error_errno(r, "Failed to parse $%s: %m", var);
449 if (r > 0)
450 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
451 else
452 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
453 }
454
455 return 0;
456 }
457
458 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
459 int r;
460
461 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
462 * in the image actually supports. */
463 r = cg_all_unified();
464 if (r < 0)
465 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
466 if (r > 0) {
467 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
468 * routine only detects 231, so we'll have a false negative here for 230. */
469 r = systemd_installation_has_version(directory, 230);
470 if (r < 0)
471 return log_error_errno(r, "Failed to determine systemd version in container: %m");
472 if (r > 0)
473 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
474 else
475 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
476 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
477 /* Mixed cgroup hierarchy support was added in 233 */
478 r = systemd_installation_has_version(directory, 233);
479 if (r < 0)
480 return log_error_errno(r, "Failed to determine systemd version in container: %m");
481 if (r > 0)
482 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
483 else
484 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
485 } else
486 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
487
488 log_debug("Using %s hierarchy for container.",
489 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
490 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
491
492 return 0;
493 }
494
495 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
496 int r;
497
498 r = getenv_bool(name);
499 if (r == -ENXIO)
500 return 0;
501 if (r < 0)
502 return log_error_errno(r, "Failed to parse $%s: %m", name);
503
504 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
505 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
506 return 0;
507 }
508
509 static int parse_mount_settings_env(void) {
510 const char *e;
511 int r;
512
513 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
514 if (r < 0 && r != -ENXIO)
515 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
516 if (r >= 0)
517 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
518
519 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
520 if (streq_ptr(e, "network"))
521 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
522
523 else if (e) {
524 r = parse_boolean(e);
525 if (r < 0)
526 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
527
528 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
529 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
530 }
531
532 return 0;
533 }
534
535 static int parse_environment(void) {
536 const char *e;
537 int r;
538
539 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
540 if (r < 0)
541 return r;
542 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
543 if (r < 0)
544 return r;
545 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
546 if (r < 0)
547 return r;
548 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
549 if (r < 0)
550 return r;
551
552 r = parse_mount_settings_env();
553 if (r < 0)
554 return r;
555
556 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
557 * even if it is supported. If not supported, it has no effect. */
558 if (!cg_ns_supported())
559 arg_use_cgns = false;
560 else {
561 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
562 if (r < 0) {
563 if (r != -ENXIO)
564 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
565
566 arg_use_cgns = true;
567 } else {
568 arg_use_cgns = r > 0;
569 arg_settings_mask |= SETTING_USE_CGNS;
570 }
571 }
572
573 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
574 if (e)
575 arg_container_service_name = e;
576
577 return detect_unified_cgroup_hierarchy_from_environment();
578 }
579
580 static int parse_argv(int argc, char *argv[]) {
581 enum {
582 ARG_VERSION = 0x100,
583 ARG_PRIVATE_NETWORK,
584 ARG_UUID,
585 ARG_READ_ONLY,
586 ARG_CAPABILITY,
587 ARG_DROP_CAPABILITY,
588 ARG_LINK_JOURNAL,
589 ARG_BIND,
590 ARG_BIND_RO,
591 ARG_TMPFS,
592 ARG_OVERLAY,
593 ARG_OVERLAY_RO,
594 ARG_INACCESSIBLE,
595 ARG_SHARE_SYSTEM,
596 ARG_REGISTER,
597 ARG_KEEP_UNIT,
598 ARG_NETWORK_INTERFACE,
599 ARG_NETWORK_MACVLAN,
600 ARG_NETWORK_IPVLAN,
601 ARG_NETWORK_BRIDGE,
602 ARG_NETWORK_ZONE,
603 ARG_NETWORK_VETH_EXTRA,
604 ARG_NETWORK_NAMESPACE_PATH,
605 ARG_PERSONALITY,
606 ARG_VOLATILE,
607 ARG_TEMPLATE,
608 ARG_PROPERTY,
609 ARG_PRIVATE_USERS,
610 ARG_KILL_SIGNAL,
611 ARG_SETTINGS,
612 ARG_CHDIR,
613 ARG_PIVOT_ROOT,
614 ARG_PRIVATE_USERS_CHOWN,
615 ARG_NOTIFY_READY,
616 ARG_ROOT_HASH,
617 ARG_SYSTEM_CALL_FILTER,
618 ARG_RLIMIT,
619 ARG_HOSTNAME,
620 ARG_NO_NEW_PRIVILEGES,
621 ARG_OOM_SCORE_ADJUST,
622 ARG_CPU_AFFINITY,
623 ARG_RESOLV_CONF,
624 ARG_TIMEZONE,
625 ARG_CONSOLE,
626 ARG_PIPE,
627 ARG_OCI_BUNDLE,
628 ARG_NO_PAGER,
629 };
630
631 static const struct option options[] = {
632 { "help", no_argument, NULL, 'h' },
633 { "version", no_argument, NULL, ARG_VERSION },
634 { "directory", required_argument, NULL, 'D' },
635 { "template", required_argument, NULL, ARG_TEMPLATE },
636 { "ephemeral", no_argument, NULL, 'x' },
637 { "user", required_argument, NULL, 'u' },
638 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
639 { "as-pid2", no_argument, NULL, 'a' },
640 { "boot", no_argument, NULL, 'b' },
641 { "uuid", required_argument, NULL, ARG_UUID },
642 { "read-only", no_argument, NULL, ARG_READ_ONLY },
643 { "capability", required_argument, NULL, ARG_CAPABILITY },
644 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
645 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
646 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
647 { "bind", required_argument, NULL, ARG_BIND },
648 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
649 { "tmpfs", required_argument, NULL, ARG_TMPFS },
650 { "overlay", required_argument, NULL, ARG_OVERLAY },
651 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
652 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
653 { "machine", required_argument, NULL, 'M' },
654 { "hostname", required_argument, NULL, ARG_HOSTNAME },
655 { "slice", required_argument, NULL, 'S' },
656 { "setenv", required_argument, NULL, 'E' },
657 { "selinux-context", required_argument, NULL, 'Z' },
658 { "selinux-apifs-context", required_argument, NULL, 'L' },
659 { "quiet", no_argument, NULL, 'q' },
660 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
661 { "register", required_argument, NULL, ARG_REGISTER },
662 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
663 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
664 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
665 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
666 { "network-veth", no_argument, NULL, 'n' },
667 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
668 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
669 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
670 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
671 { "personality", required_argument, NULL, ARG_PERSONALITY },
672 { "image", required_argument, NULL, 'i' },
673 { "volatile", optional_argument, NULL, ARG_VOLATILE },
674 { "port", required_argument, NULL, 'p' },
675 { "property", required_argument, NULL, ARG_PROPERTY },
676 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
677 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
678 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
679 { "settings", required_argument, NULL, ARG_SETTINGS },
680 { "chdir", required_argument, NULL, ARG_CHDIR },
681 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
682 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
683 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
684 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
685 { "rlimit", required_argument, NULL, ARG_RLIMIT },
686 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
687 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
688 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
689 { "timezone", required_argument, NULL, ARG_TIMEZONE },
690 { "console", required_argument, NULL, ARG_CONSOLE },
691 { "pipe", no_argument, NULL, ARG_PIPE },
692 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
693 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
694 {}
695 };
696
697 int c, r;
698 const char *p;
699 uint64_t plus = 0, minus = 0;
700 bool mask_all_settings = false, mask_no_settings = false;
701
702 assert(argc >= 0);
703 assert(argv);
704
705 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
706 switch (c) {
707
708 case 'h':
709 return help();
710
711 case ARG_VERSION:
712 return version();
713
714 case 'D':
715 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
716 if (r < 0)
717 return r;
718
719 arg_settings_mask |= SETTING_DIRECTORY;
720 break;
721
722 case ARG_TEMPLATE:
723 r = parse_path_argument_and_warn(optarg, false, &arg_template);
724 if (r < 0)
725 return r;
726
727 arg_settings_mask |= SETTING_DIRECTORY;
728 break;
729
730 case 'i':
731 r = parse_path_argument_and_warn(optarg, false, &arg_image);
732 if (r < 0)
733 return r;
734
735 arg_settings_mask |= SETTING_DIRECTORY;
736 break;
737
738 case ARG_OCI_BUNDLE:
739 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
740 if (r < 0)
741 return r;
742
743 break;
744
745 case 'x':
746 arg_ephemeral = true;
747 arg_settings_mask |= SETTING_EPHEMERAL;
748 break;
749
750 case 'u':
751 r = free_and_strdup(&arg_user, optarg);
752 if (r < 0)
753 return log_oom();
754
755 arg_settings_mask |= SETTING_USER;
756 break;
757
758 case ARG_NETWORK_ZONE: {
759 char *j;
760
761 j = strjoin("vz-", optarg);
762 if (!j)
763 return log_oom();
764
765 if (!ifname_valid(j)) {
766 log_error("Network zone name not valid: %s", j);
767 free(j);
768 return -EINVAL;
769 }
770
771 free_and_replace(arg_network_zone, j);
772
773 arg_network_veth = true;
774 arg_private_network = true;
775 arg_settings_mask |= SETTING_NETWORK;
776 break;
777 }
778
779 case ARG_NETWORK_BRIDGE:
780
781 if (!ifname_valid(optarg))
782 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
783 "Bridge interface name not valid: %s", optarg);
784
785 r = free_and_strdup(&arg_network_bridge, optarg);
786 if (r < 0)
787 return log_oom();
788
789 _fallthrough_;
790 case 'n':
791 arg_network_veth = true;
792 arg_private_network = true;
793 arg_settings_mask |= SETTING_NETWORK;
794 break;
795
796 case ARG_NETWORK_VETH_EXTRA:
797 r = veth_extra_parse(&arg_network_veth_extra, optarg);
798 if (r < 0)
799 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
800
801 arg_private_network = true;
802 arg_settings_mask |= SETTING_NETWORK;
803 break;
804
805 case ARG_NETWORK_INTERFACE:
806 if (!ifname_valid(optarg))
807 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
808 "Network interface name not valid: %s", optarg);
809
810 if (strv_extend(&arg_network_interfaces, optarg) < 0)
811 return log_oom();
812
813 arg_private_network = true;
814 arg_settings_mask |= SETTING_NETWORK;
815 break;
816
817 case ARG_NETWORK_MACVLAN:
818
819 if (!ifname_valid(optarg))
820 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
821 "MACVLAN network interface name not valid: %s", optarg);
822
823 if (strv_extend(&arg_network_macvlan, optarg) < 0)
824 return log_oom();
825
826 arg_private_network = true;
827 arg_settings_mask |= SETTING_NETWORK;
828 break;
829
830 case ARG_NETWORK_IPVLAN:
831
832 if (!ifname_valid(optarg))
833 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
834 "IPVLAN network interface name not valid: %s", optarg);
835
836 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
837 return log_oom();
838
839 _fallthrough_;
840 case ARG_PRIVATE_NETWORK:
841 arg_private_network = true;
842 arg_settings_mask |= SETTING_NETWORK;
843 break;
844
845 case ARG_NETWORK_NAMESPACE_PATH:
846 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
847 if (r < 0)
848 return r;
849
850 arg_settings_mask |= SETTING_NETWORK;
851 break;
852
853 case 'b':
854 if (arg_start_mode == START_PID2)
855 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
856 "--boot and --as-pid2 may not be combined.");
857
858 arg_start_mode = START_BOOT;
859 arg_settings_mask |= SETTING_START_MODE;
860 break;
861
862 case 'a':
863 if (arg_start_mode == START_BOOT)
864 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
865 "--boot and --as-pid2 may not be combined.");
866
867 arg_start_mode = START_PID2;
868 arg_settings_mask |= SETTING_START_MODE;
869 break;
870
871 case ARG_UUID:
872 r = sd_id128_from_string(optarg, &arg_uuid);
873 if (r < 0)
874 return log_error_errno(r, "Invalid UUID: %s", optarg);
875
876 if (sd_id128_is_null(arg_uuid))
877 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
878 "Machine UUID may not be all zeroes.");
879
880 arg_settings_mask |= SETTING_MACHINE_ID;
881 break;
882
883 case 'S': {
884 _cleanup_free_ char *mangled = NULL;
885
886 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
887 if (r < 0)
888 return log_oom();
889
890 free_and_replace(arg_slice, mangled);
891 arg_settings_mask |= SETTING_SLICE;
892 break;
893 }
894
895 case 'M':
896 if (isempty(optarg))
897 arg_machine = mfree(arg_machine);
898 else {
899 if (!machine_name_is_valid(optarg))
900 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
901 "Invalid machine name: %s", optarg);
902
903 r = free_and_strdup(&arg_machine, optarg);
904 if (r < 0)
905 return log_oom();
906 }
907 break;
908
909 case ARG_HOSTNAME:
910 if (isempty(optarg))
911 arg_hostname = mfree(arg_hostname);
912 else {
913 if (!hostname_is_valid(optarg, false))
914 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
915 "Invalid hostname: %s", optarg);
916
917 r = free_and_strdup(&arg_hostname, optarg);
918 if (r < 0)
919 return log_oom();
920 }
921
922 arg_settings_mask |= SETTING_HOSTNAME;
923 break;
924
925 case 'Z':
926 arg_selinux_context = optarg;
927 break;
928
929 case 'L':
930 arg_selinux_apifs_context = optarg;
931 break;
932
933 case ARG_READ_ONLY:
934 arg_read_only = true;
935 arg_settings_mask |= SETTING_READ_ONLY;
936 break;
937
938 case ARG_CAPABILITY:
939 case ARG_DROP_CAPABILITY: {
940 p = optarg;
941 for (;;) {
942 _cleanup_free_ char *t = NULL;
943
944 r = extract_first_word(&p, &t, ",", 0);
945 if (r < 0)
946 return log_error_errno(r, "Failed to parse capability %s.", t);
947 if (r == 0)
948 break;
949
950 if (streq(t, "all")) {
951 if (c == ARG_CAPABILITY)
952 plus = (uint64_t) -1;
953 else
954 minus = (uint64_t) -1;
955 } else {
956 r = capability_from_name(t);
957 if (r < 0)
958 return log_error_errno(r, "Failed to parse capability %s.", t);
959
960 if (c == ARG_CAPABILITY)
961 plus |= 1ULL << r;
962 else
963 minus |= 1ULL << r;
964 }
965 }
966
967 arg_settings_mask |= SETTING_CAPABILITY;
968 break;
969 }
970
971 case ARG_NO_NEW_PRIVILEGES:
972 r = parse_boolean(optarg);
973 if (r < 0)
974 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
975
976 arg_no_new_privileges = r;
977 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
978 break;
979
980 case 'j':
981 arg_link_journal = LINK_GUEST;
982 arg_link_journal_try = true;
983 arg_settings_mask |= SETTING_LINK_JOURNAL;
984 break;
985
986 case ARG_LINK_JOURNAL:
987 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
988 if (r < 0)
989 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
990
991 arg_settings_mask |= SETTING_LINK_JOURNAL;
992 break;
993
994 case ARG_BIND:
995 case ARG_BIND_RO:
996 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
997 if (r < 0)
998 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
999
1000 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1001 break;
1002
1003 case ARG_TMPFS:
1004 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1005 if (r < 0)
1006 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1007
1008 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1009 break;
1010
1011 case ARG_OVERLAY:
1012 case ARG_OVERLAY_RO:
1013 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1014 if (r == -EADDRNOTAVAIL)
1015 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1016 if (r < 0)
1017 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1018
1019 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1020 break;
1021
1022 case ARG_INACCESSIBLE:
1023 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1024 if (r < 0)
1025 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1026
1027 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1028 break;
1029
1030 case 'E': {
1031 char **n;
1032
1033 if (!env_assignment_is_valid(optarg))
1034 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1035 "Environment variable assignment '%s' is not valid.", optarg);
1036
1037 n = strv_env_set(arg_setenv, optarg);
1038 if (!n)
1039 return log_oom();
1040
1041 strv_free_and_replace(arg_setenv, n);
1042 arg_settings_mask |= SETTING_ENVIRONMENT;
1043 break;
1044 }
1045
1046 case 'q':
1047 arg_quiet = true;
1048 break;
1049
1050 case ARG_SHARE_SYSTEM:
1051 /* We don't officially support this anymore, except for compat reasons. People should use the
1052 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1053 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1054 arg_clone_ns_flags = 0;
1055 break;
1056
1057 case ARG_REGISTER:
1058 r = parse_boolean(optarg);
1059 if (r < 0) {
1060 log_error("Failed to parse --register= argument: %s", optarg);
1061 return r;
1062 }
1063
1064 arg_register = r;
1065 break;
1066
1067 case ARG_KEEP_UNIT:
1068 arg_keep_unit = true;
1069 break;
1070
1071 case ARG_PERSONALITY:
1072
1073 arg_personality = personality_from_string(optarg);
1074 if (arg_personality == PERSONALITY_INVALID)
1075 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1076 "Unknown or unsupported personality '%s'.", optarg);
1077
1078 arg_settings_mask |= SETTING_PERSONALITY;
1079 break;
1080
1081 case ARG_VOLATILE:
1082
1083 if (!optarg)
1084 arg_volatile_mode = VOLATILE_YES;
1085 else if (streq(optarg, "help")) {
1086 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1087 return 0;
1088 } else {
1089 VolatileMode m;
1090
1091 m = volatile_mode_from_string(optarg);
1092 if (m < 0)
1093 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1094 "Failed to parse --volatile= argument: %s", optarg);
1095 else
1096 arg_volatile_mode = m;
1097 }
1098
1099 arg_settings_mask |= SETTING_VOLATILE_MODE;
1100 break;
1101
1102 case 'p':
1103 r = expose_port_parse(&arg_expose_ports, optarg);
1104 if (r == -EEXIST)
1105 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1106 if (r < 0)
1107 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1108
1109 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1110 break;
1111
1112 case ARG_PROPERTY:
1113 if (strv_extend(&arg_property, optarg) < 0)
1114 return log_oom();
1115
1116 break;
1117
1118 case ARG_PRIVATE_USERS: {
1119 int boolean = -1;
1120
1121 if (!optarg)
1122 boolean = true;
1123 else if (!in_charset(optarg, DIGITS))
1124 /* do *not* parse numbers as booleans */
1125 boolean = parse_boolean(optarg);
1126
1127 if (boolean == false) {
1128 /* no: User namespacing off */
1129 arg_userns_mode = USER_NAMESPACE_NO;
1130 arg_uid_shift = UID_INVALID;
1131 arg_uid_range = UINT32_C(0x10000);
1132 } else if (boolean == true) {
1133 /* yes: User namespacing on, UID range is read from root dir */
1134 arg_userns_mode = USER_NAMESPACE_FIXED;
1135 arg_uid_shift = UID_INVALID;
1136 arg_uid_range = UINT32_C(0x10000);
1137 } else if (streq(optarg, "pick")) {
1138 /* pick: User namespacing on, UID range is picked randomly */
1139 arg_userns_mode = USER_NAMESPACE_PICK;
1140 arg_uid_shift = UID_INVALID;
1141 arg_uid_range = UINT32_C(0x10000);
1142 } else {
1143 _cleanup_free_ char *buffer = NULL;
1144 const char *range, *shift;
1145
1146 /* anything else: User namespacing on, UID range is explicitly configured */
1147
1148 range = strchr(optarg, ':');
1149 if (range) {
1150 buffer = strndup(optarg, range - optarg);
1151 if (!buffer)
1152 return log_oom();
1153 shift = buffer;
1154
1155 range++;
1156 r = safe_atou32(range, &arg_uid_range);
1157 if (r < 0)
1158 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1159 } else
1160 shift = optarg;
1161
1162 r = parse_uid(shift, &arg_uid_shift);
1163 if (r < 0)
1164 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1165
1166 arg_userns_mode = USER_NAMESPACE_FIXED;
1167 }
1168
1169 if (arg_uid_range <= 0)
1170 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1171 "UID range cannot be 0.");
1172
1173 arg_settings_mask |= SETTING_USERNS;
1174 break;
1175 }
1176
1177 case 'U':
1178 if (userns_supported()) {
1179 arg_userns_mode = USER_NAMESPACE_PICK;
1180 arg_uid_shift = UID_INVALID;
1181 arg_uid_range = UINT32_C(0x10000);
1182
1183 arg_settings_mask |= SETTING_USERNS;
1184 }
1185
1186 break;
1187
1188 case ARG_PRIVATE_USERS_CHOWN:
1189 arg_userns_chown = true;
1190
1191 arg_settings_mask |= SETTING_USERNS;
1192 break;
1193
1194 case ARG_KILL_SIGNAL:
1195 if (streq(optarg, "help")) {
1196 DUMP_STRING_TABLE(signal, int, _NSIG);
1197 return 0;
1198 }
1199
1200 arg_kill_signal = signal_from_string(optarg);
1201 if (arg_kill_signal < 0)
1202 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1203 "Cannot parse signal: %s", optarg);
1204
1205 arg_settings_mask |= SETTING_KILL_SIGNAL;
1206 break;
1207
1208 case ARG_SETTINGS:
1209
1210 /* no → do not read files
1211 * yes → read files, do not override cmdline, trust only subset
1212 * override → read files, override cmdline, trust only subset
1213 * trusted → read files, do not override cmdline, trust all
1214 */
1215
1216 r = parse_boolean(optarg);
1217 if (r < 0) {
1218 if (streq(optarg, "trusted")) {
1219 mask_all_settings = false;
1220 mask_no_settings = false;
1221 arg_settings_trusted = true;
1222
1223 } else if (streq(optarg, "override")) {
1224 mask_all_settings = false;
1225 mask_no_settings = true;
1226 arg_settings_trusted = -1;
1227 } else
1228 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1229 } else if (r > 0) {
1230 /* yes */
1231 mask_all_settings = false;
1232 mask_no_settings = false;
1233 arg_settings_trusted = -1;
1234 } else {
1235 /* no */
1236 mask_all_settings = true;
1237 mask_no_settings = false;
1238 arg_settings_trusted = false;
1239 }
1240
1241 break;
1242
1243 case ARG_CHDIR:
1244 if (!path_is_absolute(optarg))
1245 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1246 "Working directory %s is not an absolute path.", optarg);
1247
1248 r = free_and_strdup(&arg_chdir, optarg);
1249 if (r < 0)
1250 return log_oom();
1251
1252 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1253 break;
1254
1255 case ARG_PIVOT_ROOT:
1256 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1257 if (r < 0)
1258 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1259
1260 arg_settings_mask |= SETTING_PIVOT_ROOT;
1261 break;
1262
1263 case ARG_NOTIFY_READY:
1264 r = parse_boolean(optarg);
1265 if (r < 0)
1266 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1267 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1268 arg_notify_ready = r;
1269 arg_settings_mask |= SETTING_NOTIFY_READY;
1270 break;
1271
1272 case ARG_ROOT_HASH: {
1273 void *k;
1274 size_t l;
1275
1276 r = unhexmem(optarg, strlen(optarg), &k, &l);
1277 if (r < 0)
1278 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1279 if (l < sizeof(sd_id128_t)) {
1280 free(k);
1281 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1282 }
1283
1284 free(arg_root_hash);
1285 arg_root_hash = k;
1286 arg_root_hash_size = l;
1287 break;
1288 }
1289
1290 case ARG_SYSTEM_CALL_FILTER: {
1291 bool negative;
1292 const char *items;
1293
1294 negative = optarg[0] == '~';
1295 items = negative ? optarg + 1 : optarg;
1296
1297 for (;;) {
1298 _cleanup_free_ char *word = NULL;
1299
1300 r = extract_first_word(&items, &word, NULL, 0);
1301 if (r == 0)
1302 break;
1303 if (r == -ENOMEM)
1304 return log_oom();
1305 if (r < 0)
1306 return log_error_errno(r, "Failed to parse system call filter: %m");
1307
1308 if (negative)
1309 r = strv_extend(&arg_syscall_blacklist, word);
1310 else
1311 r = strv_extend(&arg_syscall_whitelist, word);
1312 if (r < 0)
1313 return log_oom();
1314 }
1315
1316 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1317 break;
1318 }
1319
1320 case ARG_RLIMIT: {
1321 const char *eq;
1322 _cleanup_free_ char *name = NULL;
1323 int rl;
1324
1325 if (streq(optarg, "help")) {
1326 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1327 return 0;
1328 }
1329
1330 eq = strchr(optarg, '=');
1331 if (!eq)
1332 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1333 "--rlimit= expects an '=' assignment.");
1334
1335 name = strndup(optarg, eq - optarg);
1336 if (!name)
1337 return log_oom();
1338
1339 rl = rlimit_from_string_harder(name);
1340 if (rl < 0)
1341 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1342 "Unknown resource limit: %s", name);
1343
1344 if (!arg_rlimit[rl]) {
1345 arg_rlimit[rl] = new0(struct rlimit, 1);
1346 if (!arg_rlimit[rl])
1347 return log_oom();
1348 }
1349
1350 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1351 if (r < 0)
1352 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1353
1354 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1355 break;
1356 }
1357
1358 case ARG_OOM_SCORE_ADJUST:
1359 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1360 if (r < 0)
1361 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1362
1363 arg_oom_score_adjust_set = true;
1364 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1365 break;
1366
1367 case ARG_CPU_AFFINITY: {
1368 CPUSet cpuset;
1369
1370 r = parse_cpu_set(optarg, &cpuset);
1371 if (r < 0)
1372 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1373
1374 cpu_set_reset(&arg_cpu_set);
1375 arg_cpu_set = cpuset;
1376 arg_settings_mask |= SETTING_CPU_AFFINITY;
1377 break;
1378 }
1379
1380 case ARG_RESOLV_CONF:
1381 if (streq(optarg, "help")) {
1382 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1383 return 0;
1384 }
1385
1386 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1387 if (arg_resolv_conf < 0)
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1389 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1390
1391 arg_settings_mask |= SETTING_RESOLV_CONF;
1392 break;
1393
1394 case ARG_TIMEZONE:
1395 if (streq(optarg, "help")) {
1396 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1397 return 0;
1398 }
1399
1400 arg_timezone = timezone_mode_from_string(optarg);
1401 if (arg_timezone < 0)
1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1403 "Failed to parse /etc/localtime mode: %s", optarg);
1404
1405 arg_settings_mask |= SETTING_TIMEZONE;
1406 break;
1407
1408 case ARG_CONSOLE:
1409 r = handle_arg_console(optarg);
1410 if (r <= 0)
1411 return r;
1412 break;
1413
1414 case 'P':
1415 case ARG_PIPE:
1416 r = handle_arg_console("pipe");
1417 if (r <= 0)
1418 return r;
1419 break;
1420
1421 case ARG_NO_PAGER:
1422 arg_pager_flags |= PAGER_DISABLE;
1423 break;
1424
1425 case '?':
1426 return -EINVAL;
1427
1428 default:
1429 assert_not_reached("Unhandled option");
1430 }
1431
1432 if (argc > optind) {
1433 strv_free(arg_parameters);
1434 arg_parameters = strv_copy(argv + optind);
1435 if (!arg_parameters)
1436 return log_oom();
1437
1438 arg_settings_mask |= SETTING_START_MODE;
1439 }
1440
1441 if (arg_ephemeral && arg_template && !arg_directory)
1442 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1443 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1444 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1445 * --directory=". */
1446 arg_directory = TAKE_PTR(arg_template);
1447
1448 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1449
1450 /* Make sure to parse environment before we reset the settings mask below */
1451 r = parse_environment();
1452 if (r < 0)
1453 return r;
1454
1455 /* Load all settings from .nspawn files */
1456 if (mask_no_settings)
1457 arg_settings_mask = 0;
1458
1459 /* Don't load any settings from .nspawn files */
1460 if (mask_all_settings)
1461 arg_settings_mask = _SETTINGS_MASK_ALL;
1462
1463 return 1;
1464 }
1465
1466 static int verify_arguments(void) {
1467 int r;
1468
1469 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1470 /* If we are running the stub init in the container, we don't need to look at what the init
1471 * in the container supports, because we are not using it. Let's immediately pick the right
1472 * setting based on the host system configuration.
1473 *
1474 * We only do this, if the user didn't use an environment variable to override the detection.
1475 */
1476
1477 r = cg_all_unified();
1478 if (r < 0)
1479 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1480 if (r > 0)
1481 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1482 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1483 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1484 else
1485 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1486 }
1487
1488 if (arg_userns_mode != USER_NAMESPACE_NO)
1489 arg_mount_settings |= MOUNT_USE_USERNS;
1490
1491 if (arg_private_network)
1492 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1493
1494 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1495 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1496 arg_register = false;
1497 if (arg_start_mode != START_PID1)
1498 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1499 }
1500
1501 if (arg_userns_mode == USER_NAMESPACE_PICK)
1502 arg_userns_chown = true;
1503
1504 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1505 arg_kill_signal = SIGRTMIN+3;
1506
1507 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1508 arg_read_only = true;
1509
1510 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1511 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1512 * The latter is not technically a user session, but we don't need to labour the point. */
1513 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1514
1515 if (arg_directory && arg_image)
1516 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1517
1518 if (arg_template && arg_image)
1519 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1520
1521 if (arg_template && !(arg_directory || arg_machine))
1522 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1523
1524 if (arg_ephemeral && arg_template)
1525 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1526
1527 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1528 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1529
1530 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1531 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1532
1533 if (arg_userns_chown && arg_read_only)
1534 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1535 "--read-only and --private-users-chown may not be combined.");
1536
1537 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1538 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1539 * copy-up (in case of overlay) making the entire exercise pointless. */
1540 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1541 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1542
1543 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1544 * to avoid conflicts between different network options. */
1545 if (arg_network_namespace_path &&
1546 (arg_network_interfaces || arg_network_macvlan ||
1547 arg_network_ipvlan || arg_network_veth_extra ||
1548 arg_network_bridge || arg_network_zone ||
1549 arg_network_veth || arg_private_network))
1550 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1551
1552 if (arg_network_bridge && arg_network_zone)
1553 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1554 "--network-bridge= and --network-zone= may not be combined.");
1555
1556 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1557 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1558
1559 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1560 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1561
1562 if (arg_expose_ports && !arg_private_network)
1563 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1564
1565 #if ! HAVE_LIBIPTC
1566 if (arg_expose_ports)
1567 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1568 #endif
1569
1570 r = custom_mount_check_all();
1571 if (r < 0)
1572 return r;
1573
1574 return 0;
1575 }
1576
1577 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1578 assert(p);
1579
1580 if (arg_userns_mode == USER_NAMESPACE_NO)
1581 return 0;
1582
1583 if (uid == UID_INVALID && gid == GID_INVALID)
1584 return 0;
1585
1586 if (uid != UID_INVALID) {
1587 uid += arg_uid_shift;
1588
1589 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1590 return -EOVERFLOW;
1591 }
1592
1593 if (gid != GID_INVALID) {
1594 gid += (gid_t) arg_uid_shift;
1595
1596 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1597 return -EOVERFLOW;
1598 }
1599
1600 if (lchown(p, uid, gid) < 0)
1601 return -errno;
1602
1603 return 0;
1604 }
1605
/* Creates the directory 'path' below 'root' with the given access mode, and on success passes it to
 * userns_lchown() with the specified UID/GID. If the directory exists already this returns 0 and leaves its
 * ownership untouched. Any other failure is returned as negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        return r == -EEXIST ? 0 : r;
}
1619
/* Matches 'path' against the two zoneinfo prefixes we recognize (relative as well as absolute) and returns
 * whatever PATH_STARTSWITH_SET() yields for it. Callers treat a NULL result as "not a zoneinfo path" and a
 * non-NULL result as the timezone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path,
                                   "../usr/share/zoneinfo/",
                                   "/usr/share/zoneinfo/");

        return zone;
}
1626
1627 static bool etc_writable(void) {
1628 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1629 }
1630
1631 static int setup_timezone(const char *dest) {
1632 _cleanup_free_ char *p = NULL, *etc = NULL;
1633 const char *where, *check;
1634 TimezoneMode m;
1635 int r;
1636
1637 assert(dest);
1638
1639 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1640 r = readlink_malloc("/etc/localtime", &p);
1641 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1642 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1643 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1644 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1645 else if (r < 0) {
1646 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1647 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1648 * file.
1649 *
1650 * Example:
1651 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1652 */
1653 return 0;
1654 } else if (arg_timezone == TIMEZONE_AUTO)
1655 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1656 else
1657 m = arg_timezone;
1658 } else
1659 m = arg_timezone;
1660
1661 if (m == TIMEZONE_OFF)
1662 return 0;
1663
1664 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1665 if (r < 0) {
1666 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1667 return 0;
1668 }
1669
1670 where = strjoina(etc, "/localtime");
1671
1672 switch (m) {
1673
1674 case TIMEZONE_DELETE:
1675 if (unlink(where) < 0)
1676 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1677
1678 return 0;
1679
1680 case TIMEZONE_SYMLINK: {
1681 _cleanup_free_ char *q = NULL;
1682 const char *z, *what;
1683
1684 z = timezone_from_path(p);
1685 if (!z) {
1686 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1687 return 0;
1688 }
1689
1690 r = readlink_malloc(where, &q);
1691 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1692 return 0; /* Already pointing to the right place? Then do nothing .. */
1693
1694 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1695 r = chase_symlinks(check, dest, 0, NULL, NULL);
1696 if (r < 0)
1697 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1698 else {
1699 if (unlink(where) < 0 && errno != ENOENT) {
1700 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1701 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1702 return 0;
1703 }
1704
1705 what = strjoina("../usr/share/zoneinfo/", z);
1706 if (symlink(what, where) < 0) {
1707 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1708 errno, "Failed to correct timezone of container, ignoring: %m");
1709 return 0;
1710 }
1711
1712 break;
1713 }
1714
1715 _fallthrough_;
1716 }
1717
1718 case TIMEZONE_BIND: {
1719 _cleanup_free_ char *resolved = NULL;
1720 int found;
1721
1722 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1723 if (found < 0) {
1724 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1725 return 0;
1726 }
1727
1728 if (found == 0) /* missing? */
1729 (void) touch(resolved);
1730
1731 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1732 if (r >= 0)
1733 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1734
1735 _fallthrough_;
1736 }
1737
1738 case TIMEZONE_COPY:
1739 /* If mounting failed, try to copy */
1740 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1741 if (r < 0) {
1742 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1743 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1744 return 0;
1745 }
1746
1747 break;
1748
1749 default:
1750 assert_not_reached("unexpected mode");
1751 }
1752
1753 /* Fix permissions of the symlink or file copy we just created */
1754 r = userns_lchown(where, 0, 0);
1755 if (r < 0)
1756 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1757
1758 return 0;
1759 }
1760
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a
 * negative errno-style value (logged at debug level) if the check itself failed for another reason. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1773
1774 static int resolved_listening(void) {
1775 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1776 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1777 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1778 int r;
1779
1780 /* Check if resolved is listening */
1781
1782 r = sd_bus_open_system(&bus);
1783 if (r < 0)
1784 return log_debug_errno(r, "Failed to open system bus: %m");
1785
1786 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1787 if (r < 0)
1788 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1789 if (r == 0)
1790 return 0;
1791
1792 r = sd_bus_get_property_string(bus,
1793 "org.freedesktop.resolve1",
1794 "/org/freedesktop/resolve1",
1795 "org.freedesktop.resolve1.Manager",
1796 "DNSStubListener",
1797 &error,
1798 &dns_stub_listener_mode);
1799 if (r < 0)
1800 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1801
1802 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1803 }
1804
1805 static int setup_resolv_conf(const char *dest) {
1806 _cleanup_free_ char *etc = NULL;
1807 const char *where, *what;
1808 ResolvConfMode m;
1809 int r;
1810
1811 assert(dest);
1812
1813 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1814 if (arg_private_network)
1815 m = RESOLV_CONF_OFF;
1816 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1817 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
1818 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1819 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
1820 else
1821 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
1822 } else
1823 m = arg_resolv_conf;
1824
1825 if (m == RESOLV_CONF_OFF)
1826 return 0;
1827
1828 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1829 if (r < 0) {
1830 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1831 return 0;
1832 }
1833
1834 where = strjoina(etc, "/resolv.conf");
1835
1836 if (m == RESOLV_CONF_DELETE) {
1837 if (unlink(where) < 0)
1838 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1839
1840 return 0;
1841 }
1842
1843 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1844 what = STATIC_RESOLV_CONF;
1845 else
1846 what = "/etc/resolv.conf";
1847
1848 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1849 _cleanup_free_ char *resolved = NULL;
1850 int found;
1851
1852 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1853 if (found < 0) {
1854 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1855 return 0;
1856 }
1857
1858 if (found == 0) /* missing? */
1859 (void) touch(resolved);
1860
1861 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1862 if (r >= 0)
1863 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1864 }
1865
1866 /* If that didn't work, let's copy the file */
1867 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
1868 if (r < 0) {
1869 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1870 * resolved or something similar runs inside and the symlink points there.
1871 *
1872 * If the disk image is read-only, there's also no point in complaining.
1873 */
1874 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1875 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1876 return 0;
1877 }
1878
1879 r = userns_lchown(where, 0, 0);
1880 if (r < 0)
1881 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1882
1883 return 0;
1884 }
1885
1886 static int setup_boot_id(void) {
1887 _cleanup_(unlink_and_freep) char *from = NULL;
1888 _cleanup_free_ char *path = NULL;
1889 sd_id128_t rnd = SD_ID128_NULL;
1890 const char *to;
1891 int r;
1892
1893 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
1894
1895 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
1896 if (r < 0)
1897 return log_error_errno(r, "Failed to generate random boot ID path: %m");
1898
1899 r = sd_id128_randomize(&rnd);
1900 if (r < 0)
1901 return log_error_errno(r, "Failed to generate random boot id: %m");
1902
1903 r = id128_write(path, ID128_UUID, rnd, false);
1904 if (r < 0)
1905 return log_error_errno(r, "Failed to write boot id: %m");
1906
1907 from = TAKE_PTR(path);
1908 to = "/proc/sys/kernel/random/boot_id";
1909
1910 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1911 if (r < 0)
1912 return r;
1913
1914 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1915 }
1916
1917 static int copy_devnodes(const char *dest) {
1918 static const char devnodes[] =
1919 "null\0"
1920 "zero\0"
1921 "full\0"
1922 "random\0"
1923 "urandom\0"
1924 "tty\0"
1925 "net/tun\0";
1926
1927 _cleanup_umask_ mode_t u;
1928 const char *d;
1929 int r = 0;
1930
1931 assert(dest);
1932
1933 u = umask(0000);
1934
1935 /* Create /dev/net, so that we can create /dev/net/tun in it */
1936 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1937 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1938
1939 NULSTR_FOREACH(d, devnodes) {
1940 _cleanup_free_ char *from = NULL, *to = NULL;
1941 struct stat st;
1942
1943 from = path_join("/dev/", d);
1944 if (!from)
1945 return log_oom();
1946
1947 to = path_join(dest, from);
1948 if (!to)
1949 return log_oom();
1950
1951 if (stat(from, &st) < 0) {
1952
1953 if (errno != ENOENT)
1954 return log_error_errno(errno, "Failed to stat %s: %m", from);
1955
1956 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1957 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1958 "%s is not a char or block device, cannot copy.", from);
1959 else {
1960 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1961
1962 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1963 /* Explicitly warn the user when /dev is already populated. */
1964 if (errno == EEXIST)
1965 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1966 if (errno != EPERM)
1967 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1968
1969 /* Some systems abusively restrict mknod but allow bind mounts. */
1970 r = touch(to);
1971 if (r < 0)
1972 return log_error_errno(r, "touch (%s) failed: %m", to);
1973 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1974 if (r < 0)
1975 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1976 }
1977
1978 r = userns_lchown(to, 0, 0);
1979 if (r < 0)
1980 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1981
1982 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
1983 if (!dn)
1984 return log_oom();
1985
1986 r = userns_mkdir(dest, dn, 0755, 0, 0);
1987 if (r < 0)
1988 return log_error_errno(r, "Failed to create '%s': %m", dn);
1989
1990 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1991 return log_oom();
1992
1993 prefixed = path_join(dest, sl);
1994 if (!prefixed)
1995 return log_oom();
1996
1997 t = path_join("..", d);
1998 if (!t)
1999 return log_oom();
2000
2001 if (symlink(t, prefixed) < 0)
2002 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2003 }
2004 }
2005
2006 return r;
2007 }
2008
2009 static int make_extra_nodes(const char *dest) {
2010 _cleanup_umask_ mode_t u;
2011 size_t i;
2012 int r;
2013
2014 u = umask(0000);
2015
2016 for (i = 0; i < arg_n_extra_nodes; i++) {
2017 _cleanup_free_ char *path = NULL;
2018 DeviceNode *n = arg_extra_nodes + i;
2019
2020 path = path_join(dest, n->path);
2021 if (!path)
2022 return log_oom();
2023
2024 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2025 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2026
2027 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2028 if (r < 0)
2029 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2030 }
2031
2032 return 0;
2033 }
2034
2035 static int setup_pts(const char *dest) {
2036 _cleanup_free_ char *options = NULL;
2037 const char *p;
2038 int r;
2039
2040 #if HAVE_SELINUX
2041 if (arg_selinux_apifs_context)
2042 (void) asprintf(&options,
2043 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2044 arg_uid_shift + TTY_GID,
2045 arg_selinux_apifs_context);
2046 else
2047 #endif
2048 (void) asprintf(&options,
2049 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2050 arg_uid_shift + TTY_GID);
2051
2052 if (!options)
2053 return log_oom();
2054
2055 /* Mount /dev/pts itself */
2056 p = prefix_roota(dest, "/dev/pts");
2057 r = mkdir_errno_wrapper(p, 0755);
2058 if (r < 0)
2059 return log_error_errno(r, "Failed to create /dev/pts: %m");
2060
2061 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2062 if (r < 0)
2063 return r;
2064 r = userns_lchown(p, 0, 0);
2065 if (r < 0)
2066 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2067
2068 /* Create /dev/ptmx symlink */
2069 p = prefix_roota(dest, "/dev/ptmx");
2070 if (symlink("pts/ptmx", p) < 0)
2071 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2072 r = userns_lchown(p, 0, 0);
2073 if (r < 0)
2074 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2075
2076 /* And fix /dev/pts/ptmx ownership */
2077 p = prefix_roota(dest, "/dev/pts/ptmx");
2078 r = userns_lchown(p, 0, 0);
2079 if (r < 0)
2080 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2081
2082 return 0;
2083 }
2084
/* Opens /dev/console and makes it stdin, stdout and stderr of the calling process. Before doing so the
 * current stderr is duplicated for the logging subsystem. Returns 0 on success, a negative errno-style
 * value on failure. */
static int setup_stdio_as_dev_console(void) {
        int console_fd, r;

        console_fd = open_terminal("/dev/console", O_RDWR);
        if (console_fd < 0)
                return log_error_errno(console_fd, "Failed to open console: %m");

        /* Keep the original stderr around for logging, since stderr is about to be replaced */
        r = log_dup_console();
        if (r < 0)
                return log_error_errno(r, "Failed to duplicate stderr: %m");

        /* Per the original note, rearrange_stdio() invalidates the fd on success and on failure, hence
         * there's nothing to close here in either case */
        r = rearrange_stdio(console_fd, console_fd, console_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
2106
2107 static int setup_dev_console(const char *console) {
2108 _cleanup_free_ char *p = NULL;
2109 int r;
2110
2111 /* Create /dev/console symlink */
2112 r = path_make_relative("/dev", console, &p);
2113 if (r < 0)
2114 return log_error_errno(r, "Failed to create relative path: %m");
2115
2116 if (symlink(p, "/dev/console") < 0)
2117 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2118
2119 return 0;
2120 }
2121
2122 static int setup_keyring(void) {
2123 key_serial_t keyring;
2124
2125 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2126 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2127 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2128 * these system calls let's make sure we don't leak anything into the container. */
2129
2130 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2131 if (keyring == -1) {
2132 if (errno == ENOSYS)
2133 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2134 else if (IN_SET(errno, EACCES, EPERM))
2135 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2136 else
2137 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2138 }
2139
2140 return 0;
2141 }
2142
2143 static int setup_kmsg(int kmsg_socket) {
2144 _cleanup_(unlink_and_freep) char *from = NULL;
2145 _cleanup_free_ char *fifo = NULL;
2146 _cleanup_close_ int fd = -1;
2147 _cleanup_umask_ mode_t u;
2148 int r;
2149
2150 assert(kmsg_socket >= 0);
2151
2152 u = umask(0000);
2153
2154 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2155 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2156 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2157 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2158
2159 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to generate kmsg path: %m");
2162
2163 if (mkfifo(fifo, 0600) < 0)
2164 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2165
2166 from = TAKE_PTR(fifo);
2167
2168 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2169 if (r < 0)
2170 return r;
2171
2172 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2173 if (fd < 0)
2174 return log_error_errno(errno, "Failed to open fifo: %m");
2175
2176 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2177 r = send_one_fd(kmsg_socket, fd, 0);
2178 if (r < 0)
2179 return log_error_errno(r, "Failed to send FIFO fd: %m");
2180
2181 return 0;
2182 }
2183
2184 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2185 union in_addr_union *exposed = userdata;
2186
2187 assert(rtnl);
2188 assert(m);
2189 assert(exposed);
2190
2191 expose_port_execute(rtnl, arg_expose_ports, exposed);
2192 return 0;
2193 }
2194
2195 static int setup_hostname(void) {
2196 int r;
2197
2198 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2199 return 0;
2200
2201 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2202 if (r < 0)
2203 return log_error_errno(r, "Failed to set hostname: %m");
2204
2205 return 0;
2206 }
2207
2208 static int setup_journal(const char *directory) {
2209 _cleanup_free_ char *d = NULL;
2210 const char *dirname, *p, *q;
2211 sd_id128_t this_id;
2212 char id[33];
2213 bool try;
2214 int r;
2215
2216 /* Don't link journals in ephemeral mode */
2217 if (arg_ephemeral)
2218 return 0;
2219
2220 if (arg_link_journal == LINK_NO)
2221 return 0;
2222
2223 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2224
2225 r = sd_id128_get_machine(&this_id);
2226 if (r < 0)
2227 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2228
2229 if (sd_id128_equal(arg_uuid, this_id)) {
2230 log_full(try ? LOG_WARNING : LOG_ERR,
2231 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2232 if (try)
2233 return 0;
2234 return -EEXIST;
2235 }
2236
2237 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2238 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2239 if (r < 0) {
2240 bool ignore = r == -EROFS && try;
2241 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2242 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2243 return ignore ? 0 : r;
2244 }
2245 }
2246
2247 (void) sd_id128_to_string(arg_uuid, id);
2248
2249 p = strjoina("/var/log/journal/", id);
2250 q = prefix_roota(directory, p);
2251
2252 if (path_is_mount_point(p, NULL, 0) > 0) {
2253 if (try)
2254 return 0;
2255
2256 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2257 "%s: already a mount point, refusing to use for journal", p);
2258 }
2259
2260 if (path_is_mount_point(q, NULL, 0) > 0) {
2261 if (try)
2262 return 0;
2263
2264 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2265 "%s: already a mount point, refusing to use for journal", q);
2266 }
2267
2268 r = readlink_and_make_absolute(p, &d);
2269 if (r >= 0) {
2270 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2271 path_equal(d, q)) {
2272
2273 r = userns_mkdir(directory, p, 0755, 0, 0);
2274 if (r < 0)
2275 log_warning_errno(r, "Failed to create directory %s: %m", q);
2276 return 0;
2277 }
2278
2279 if (unlink(p) < 0)
2280 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2281 } else if (r == -EINVAL) {
2282
2283 if (arg_link_journal == LINK_GUEST &&
2284 rmdir(p) < 0) {
2285
2286 if (errno == ENOTDIR) {
2287 log_error("%s already exists and is neither a symlink nor a directory", p);
2288 return r;
2289 } else
2290 return log_error_errno(errno, "Failed to remove %s: %m", p);
2291 }
2292 } else if (r != -ENOENT)
2293 return log_error_errno(r, "readlink(%s) failed: %m", p);
2294
2295 if (arg_link_journal == LINK_GUEST) {
2296
2297 if (symlink(q, p) < 0) {
2298 if (try) {
2299 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2300 return 0;
2301 } else
2302 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2303 }
2304
2305 r = userns_mkdir(directory, p, 0755, 0, 0);
2306 if (r < 0)
2307 log_warning_errno(r, "Failed to create directory %s: %m", q);
2308 return 0;
2309 }
2310
2311 if (arg_link_journal == LINK_HOST) {
2312 /* don't create parents here — if the host doesn't have
2313 * permanent journal set up, don't force it here */
2314
2315 r = mkdir_errno_wrapper(p, 0755);
2316 if (r < 0 && r != -EEXIST) {
2317 if (try) {
2318 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2319 return 0;
2320 } else
2321 return log_error_errno(r, "Failed to create %s: %m", p);
2322 }
2323
2324 } else if (access(p, F_OK) < 0)
2325 return 0;
2326
2327 if (dir_is_empty(q) == 0)
2328 log_warning("%s is not empty, proceeding anyway.", q);
2329
2330 r = userns_mkdir(directory, p, 0755, 0, 0);
2331 if (r < 0)
2332 return log_error_errno(r, "Failed to create %s: %m", q);
2333
2334 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2335 if (r < 0)
2336 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2337
2338 return 0;
2339 }
2340
2341 static int drop_capabilities(uid_t uid) {
2342 CapabilityQuintet q;
2343
2344 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2345 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2346 * arg_caps_retain. */
2347
2348 if (capability_quintet_is_set(&arg_full_capabilities)) {
2349 q = arg_full_capabilities;
2350
2351 if (q.bounding == (uint64_t) -1)
2352 q.bounding = uid == 0 ? arg_caps_retain : 0;
2353
2354 if (q.effective == (uint64_t) -1)
2355 q.effective = uid == 0 ? q.bounding : 0;
2356
2357 if (q.inheritable == (uint64_t) -1)
2358 q.inheritable = uid == 0 ? q.bounding : 0;
2359
2360 if (q.permitted == (uint64_t) -1)
2361 q.permitted = uid == 0 ? q.bounding : 0;
2362
2363 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2364 q.ambient = 0;
2365
2366 if (capability_quintet_mangle(&q))
2367 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2368
2369 } else {
2370 q = (CapabilityQuintet) {
2371 .bounding = arg_caps_retain,
2372 .effective = uid == 0 ? arg_caps_retain : 0,
2373 .inheritable = uid == 0 ? arg_caps_retain : 0,
2374 .permitted = uid == 0 ? arg_caps_retain : 0,
2375 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2376 };
2377
2378 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2379 * in order to maintain the same behavior as systemd < 242. */
2380 if (capability_quintet_mangle(&q))
2381 log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2382 "Some capabilities will not be set because they are not in the current bounding set.");
2383
2384 }
2385
2386 return capability_quintet_enforce(&q);
2387 }
2388
2389 static int reset_audit_loginuid(void) {
2390 _cleanup_free_ char *p = NULL;
2391 int r;
2392
2393 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2394 return 0;
2395
2396 r = read_one_line_file("/proc/self/loginuid", &p);
2397 if (r == -ENOENT)
2398 return 0;
2399 if (r < 0)
2400 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2401
2402 /* Already reset? */
2403 if (streq(p, "4294967295"))
2404 return 0;
2405
2406 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2407 if (r < 0) {
2408 log_error_errno(r,
2409 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2410 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2411 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2412 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2413 "using systemd-nspawn. Sleeping for 5s... (%m)");
2414
2415 sleep(5);
2416 }
2417
2418 return 0;
2419 }
2420
2421 static int setup_propagate(const char *root) {
2422 const char *p, *q;
2423 int r;
2424
2425 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2426 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2427 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2428 (void) mkdir_p(p, 0600);
2429
2430 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2431 if (r < 0)
2432 return log_error_errno(r, "Failed to create /run/systemd: %m");
2433
2434 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2435 if (r < 0)
2436 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2437
2438 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2441
2442 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
2443 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2444 if (r < 0)
2445 return r;
2446
2447 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2448 if (r < 0)
2449 return r;
2450
2451 /* machined will MS_MOVE into that directory, and that's only
2452 * supported for non-shared mounts. */
2453 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2454 }
2455
2456 static int setup_machine_id(const char *directory) {
2457 const char *etc_machine_id;
2458 sd_id128_t id;
2459 int r;
2460
2461 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2462 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2463 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2464 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2465 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2466 * container behaves nicely). */
2467
2468 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2469
2470 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2471 if (r < 0) {
2472 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2473 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2474
2475 if (sd_id128_is_null(arg_uuid)) {
2476 r = sd_id128_randomize(&arg_uuid);
2477 if (r < 0)
2478 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2479 }
2480 } else {
2481 if (sd_id128_is_null(id))
2482 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2483 "Machine ID in container image is zero, refusing.");
2484
2485 arg_uuid = id;
2486 }
2487
2488 return 0;
2489 }
2490
2491 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2492 int r;
2493
2494 assert(directory);
2495
2496 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2497 return 0;
2498
2499 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2500 if (r == -EOPNOTSUPP)
2501 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2502 if (r == -EBADE)
2503 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2504 if (r < 0)
2505 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2506 if (r == 0)
2507 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2508 else
2509 log_debug("Patched directory tree to match UID/GID range.");
2510
2511 return r;
2512 }
2513
2514 /*
2515 * Return values:
2516 * < 0 : wait_for_terminate() failed to get the state of the
2517 * container, the container was terminated by a signal, or
2518 * failed for an unknown reason. No change is made to the
2519 * container argument.
2520 * > 0 : The program executed in the container terminated with an
2521 * error. The exit code of the program executed in the
2522 * container is returned. The container argument has been set
2523 * to CONTAINER_TERMINATED.
2524 * 0 : The container is being rebooted, has been shut down or exited
2525 * successfully. The container argument has been set to either
2526 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2527 *
2528 * That is, success is indicated by a return value of zero, and an
2529 * error is indicated by a non-zero value.
2530 */
2531 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2532 siginfo_t status;
2533 int r;
2534
2535 r = wait_for_terminate(pid, &status);
2536 if (r < 0)
2537 return log_warning_errno(r, "Failed to wait for container: %m");
2538
2539 switch (status.si_code) {
2540
2541 case CLD_EXITED:
2542 if (status.si_status == 0)
2543 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2544 else
2545 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2546
2547 *container = CONTAINER_TERMINATED;
2548 return status.si_status;
2549
2550 case CLD_KILLED:
2551 if (status.si_status == SIGINT) {
2552 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2553 *container = CONTAINER_TERMINATED;
2554 return 0;
2555
2556 } else if (status.si_status == SIGHUP) {
2557 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2558 *container = CONTAINER_REBOOTED;
2559 return 0;
2560 }
2561
2562 _fallthrough_;
2563 case CLD_DUMPED:
2564 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2565 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2566
2567 default:
2568 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2569 "Container %s failed due to unknown reason.", arg_machine);
2570 }
2571 }
2572
2573 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2574 pid_t pid;
2575
2576 pid = PTR_TO_PID(userdata);
2577 if (pid > 0) {
2578 if (kill(pid, arg_kill_signal) >= 0) {
2579 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2580 sd_event_source_set_userdata(s, NULL);
2581 return 0;
2582 }
2583 }
2584
2585 sd_event_exit(sd_event_source_get_event(s), 0);
2586 return 0;
2587 }
2588
2589 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2590 pid_t pid;
2591
2592 assert(s);
2593 assert(ssi);
2594
2595 pid = PTR_TO_PID(userdata);
2596
2597 for (;;) {
2598 siginfo_t si = {};
2599
2600 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2601 return log_error_errno(errno, "Failed to waitid(): %m");
2602 if (si.si_pid == 0) /* No pending children. */
2603 break;
2604 if (si.si_pid == pid) {
2605 /* The main process we care for has exited. Return from
2606 * signal handler but leave the zombie. */
2607 sd_event_exit(sd_event_source_get_event(s), 0);
2608 break;
2609 }
2610
2611 /* Reap all other children. */
2612 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2613 }
2614
2615 return 0;
2616 }
2617
2618 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2619 pid_t pid;
2620
2621 assert(m);
2622
2623 pid = PTR_TO_PID(userdata);
2624
2625 if (arg_kill_signal > 0) {
2626 log_info("Container termination requested. Attempting to halt container.");
2627 (void) kill(pid, arg_kill_signal);
2628 } else {
2629 log_info("Container termination requested. Exiting.");
2630 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2631 }
2632
2633 return 0;
2634 }
2635
2636 static int determine_names(void) {
2637 int r;
2638
2639 if (arg_template && !arg_directory && arg_machine) {
2640
2641 /* If --template= was specified then we should not
2642 * search for a machine, but instead create a new one
2643 * in /var/lib/machine. */
2644
2645 arg_directory = path_join("/var/lib/machines", arg_machine);
2646 if (!arg_directory)
2647 return log_oom();
2648 }
2649
2650 if (!arg_image && !arg_directory) {
2651 if (arg_machine) {
2652 _cleanup_(image_unrefp) Image *i = NULL;
2653
2654 r = image_find(IMAGE_MACHINE, arg_machine, &i);
2655 if (r == -ENOENT)
2656 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2657 if (r < 0)
2658 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2659
2660 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2661 r = free_and_strdup(&arg_image, i->path);
2662 else
2663 r = free_and_strdup(&arg_directory, i->path);
2664 if (r < 0)
2665 return log_oom();
2666
2667 if (!arg_ephemeral)
2668 arg_read_only = arg_read_only || i->read_only;
2669 } else {
2670 r = safe_getcwd(&arg_directory);
2671 if (r < 0)
2672 return log_error_errno(r, "Failed to determine current directory: %m");
2673 }
2674
2675 if (!arg_directory && !arg_image)
2676 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
2677 }
2678
2679 if (!arg_machine) {
2680 if (arg_directory && path_equal(arg_directory, "/"))
2681 arg_machine = gethostname_malloc();
2682 else {
2683 if (arg_image) {
2684 char *e;
2685
2686 arg_machine = strdup(basename(arg_image));
2687
2688 /* Truncate suffix if there is one */
2689 e = endswith(arg_machine, ".raw");
2690 if (e)
2691 *e = 0;
2692 } else
2693 arg_machine = strdup(basename(arg_directory));
2694 }
2695 if (!arg_machine)
2696 return log_oom();
2697
2698 hostname_cleanup(arg_machine);
2699 if (!machine_name_is_valid(arg_machine))
2700 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
2701
2702 if (arg_ephemeral) {
2703 char *b;
2704
2705 /* Add a random suffix when this is an
2706 * ephemeral machine, so that we can run many
2707 * instances at once without manually having
2708 * to specify -M each time. */
2709
2710 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2711 return log_oom();
2712
2713 free(arg_machine);
2714 arg_machine = b;
2715 }
2716 }
2717
2718 return 0;
2719 }
2720
/* Resolves the path stored in '*p' with chase_symlinks() using the specified flags, and replaces the
 * string with the resolved one, freeing the old one. An unset path is left alone. Returns a negative
 * errno-style value if resolving fails, otherwise the result of free_and_replace(). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing to resolve */
        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
2736
2737 static int determine_uid_shift(const char *directory) {
2738 int r;
2739
2740 if (arg_userns_mode == USER_NAMESPACE_NO) {
2741 arg_uid_shift = 0;
2742 return 0;
2743 }
2744
2745 if (arg_uid_shift == UID_INVALID) {
2746 struct stat st;
2747
2748 r = stat(directory, &st);
2749 if (r < 0)
2750 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2751
2752 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2753
2754 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2755 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2756 "UID and GID base of %s don't match.", directory);
2757
2758 arg_uid_range = UINT32_C(0x10000);
2759 }
2760
2761 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2762 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2763 "UID base too high for UID range.");
2764
2765 return 0;
2766 }
2767
2768 static unsigned long effective_clone_ns_flags(void) {
2769 unsigned long flags = arg_clone_ns_flags;
2770
2771 if (arg_private_network)
2772 flags |= CLONE_NEWNET;
2773 if (arg_use_cgns)
2774 flags |= CLONE_NEWCGROUP;
2775 if (arg_userns_mode != USER_NAMESPACE_NO)
2776 flags |= CLONE_NEWUSER;
2777
2778 return flags;
2779 }
2780
2781 static int patch_sysctl(void) {
2782
2783 /* This table is inspired by runc's sysctl() function */
2784 static const struct {
2785 const char *key;
2786 bool prefix;
2787 unsigned long clone_flags;
2788 } safe_sysctl[] = {
2789 { "kernel.hostname", false, CLONE_NEWUTS },
2790 { "kernel.domainname", false, CLONE_NEWUTS },
2791 { "kernel.msgmax", false, CLONE_NEWIPC },
2792 { "kernel.msgmnb", false, CLONE_NEWIPC },
2793 { "kernel.msgmni", false, CLONE_NEWIPC },
2794 { "kernel.sem", false, CLONE_NEWIPC },
2795 { "kernel.shmall", false, CLONE_NEWIPC },
2796 { "kernel.shmmax", false, CLONE_NEWIPC },
2797 { "kernel.shmmni", false, CLONE_NEWIPC },
2798 { "fs.mqueue.", true, CLONE_NEWIPC },
2799 { "net.", true, CLONE_NEWNET },
2800 };
2801
2802 unsigned long flags;
2803 char **k, **v;
2804 int r;
2805
2806 flags = effective_clone_ns_flags();
2807
2808 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2809 bool good = false;
2810 size_t i;
2811
2812 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2813
2814 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2815 continue;
2816
2817 if (safe_sysctl[i].prefix)
2818 good = startswith(*k, safe_sysctl[i].key);
2819 else
2820 good = streq(*k, safe_sysctl[i].key);
2821
2822 if (good)
2823 break;
2824 }
2825
2826 if (!good)
2827 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2828
2829 r = sysctl_write(*k, *v);
2830 if (r < 0)
2831 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2832 }
2833
2834 return 0;
2835 }
2836
2837 static int inner_child(
2838 Barrier *barrier,
2839 const char *directory,
2840 bool secondary,
2841 int kmsg_socket,
2842 int rtnl_socket,
2843 int master_pty_socket,
2844 FDSet *fds) {
2845
2846 _cleanup_free_ char *home = NULL;
2847 char as_uuid[37];
2848 size_t n_env = 1;
2849 const char *envp[] = {
2850 "PATH=" DEFAULT_PATH_COMPAT,
2851 NULL, /* container */
2852 NULL, /* TERM */
2853 NULL, /* HOME */
2854 NULL, /* USER */
2855 NULL, /* LOGNAME */
2856 NULL, /* container_uuid */
2857 NULL, /* LISTEN_FDS */
2858 NULL, /* LISTEN_PID */
2859 NULL, /* NOTIFY_SOCKET */
2860 NULL
2861 };
2862 const char *exec_target;
2863 _cleanup_strv_free_ char **env_use = NULL;
2864 int r, which_failed;
2865
2866 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2867 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2868 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2869 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2870 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2871 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2872 * namespace.
2873 *
2874 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2875 * unshare(). See below. */
2876
2877 assert(barrier);
2878 assert(directory);
2879 assert(kmsg_socket >= 0);
2880
2881 log_debug("Inner child is initializing.");
2882
2883 if (arg_userns_mode != USER_NAMESPACE_NO) {
2884 /* Tell the parent, that it now can write the UID map. */
2885 (void) barrier_place(barrier); /* #1 */
2886
2887 /* Wait until the parent wrote the UID map */
2888 if (!barrier_place_and_sync(barrier)) /* #2 */
2889 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2890 "Parent died too early");
2891 }
2892
2893 r = reset_uid_gid();
2894 if (r < 0)
2895 return log_error_errno(r, "Couldn't become new root: %m");
2896
2897 r = mount_all(NULL,
2898 arg_mount_settings | MOUNT_IN_USERNS,
2899 arg_uid_shift,
2900 arg_selinux_apifs_context);
2901 if (r < 0)
2902 return r;
2903
2904 if (!arg_network_namespace_path && arg_private_network) {
2905 r = unshare(CLONE_NEWNET);
2906 if (r < 0)
2907 return log_error_errno(errno, "Failed to unshare network namespace: %m");
2908
2909 /* Tell the parent that it can setup network interfaces. */
2910 (void) barrier_place(barrier); /* #3 */
2911 }
2912
2913 r = mount_sysfs(NULL, arg_mount_settings);
2914 if (r < 0)
2915 return r;
2916
2917 /* Wait until we are cgroup-ified, so that we
2918 * can mount the right cgroup path writable */
2919 if (!barrier_place_and_sync(barrier)) /* #4 */
2920 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2921 "Parent died too early");
2922
2923 if (arg_use_cgns) {
2924 r = unshare(CLONE_NEWCGROUP);
2925 if (r < 0)
2926 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2927 r = mount_cgroups(
2928 "",
2929 arg_unified_cgroup_hierarchy,
2930 arg_userns_mode != USER_NAMESPACE_NO,
2931 arg_uid_shift,
2932 arg_uid_range,
2933 arg_selinux_apifs_context,
2934 true);
2935 if (r < 0)
2936 return r;
2937 } else {
2938 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2939 if (r < 0)
2940 return r;
2941 }
2942
2943 r = setup_boot_id();
2944 if (r < 0)
2945 return r;
2946
2947 r = setup_kmsg(kmsg_socket);
2948 if (r < 0)
2949 return r;
2950 kmsg_socket = safe_close(kmsg_socket);
2951
2952 r = mount_custom(
2953 "/",
2954 arg_custom_mounts,
2955 arg_n_custom_mounts,
2956 false,
2957 0,
2958 0,
2959 arg_selinux_apifs_context,
2960 true);
2961 if (r < 0)
2962 return r;
2963
2964 if (setsid() < 0)
2965 return log_error_errno(errno, "setsid() failed: %m");
2966
2967 if (arg_private_network)
2968 loopback_setup();
2969
2970 if (arg_expose_ports) {
2971 r = expose_port_send_rtnl(rtnl_socket);
2972 if (r < 0)
2973 return r;
2974 rtnl_socket = safe_close(rtnl_socket);
2975 }
2976
2977 if (arg_console_mode != CONSOLE_PIPE) {
2978 _cleanup_close_ int master = -1;
2979 _cleanup_free_ char *console = NULL;
2980
2981 /* Allocate a pty and make it available as /dev/console. */
2982 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
2983 if (master < 0)
2984 return log_error_errno(master, "Failed to allocate a pty: %m");
2985
2986 r = setup_dev_console(console);
2987 if (r < 0)
2988 return log_error_errno(r, "Failed to setup /dev/console: %m");
2989
2990 r = send_one_fd(master_pty_socket, master, 0);
2991 if (r < 0)
2992 return log_error_errno(r, "Failed to send master fd: %m");
2993 master_pty_socket = safe_close(master_pty_socket);
2994
2995 r = setup_stdio_as_dev_console();
2996 if (r < 0)
2997 return r;
2998 }
2999
3000 r = patch_sysctl();
3001 if (r < 0)
3002 return r;
3003
3004 if (arg_oom_score_adjust_set) {
3005 r = set_oom_score_adjust(arg_oom_score_adjust);
3006 if (r < 0)
3007 return log_error_errno(r, "Failed to adjust OOM score: %m");
3008 }
3009
3010 if (arg_cpu_set.set)
3011 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3012 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3013
3014 (void) setup_hostname();
3015
3016 if (arg_personality != PERSONALITY_INVALID) {
3017 r = safe_personality(arg_personality);
3018 if (r < 0)
3019 return log_error_errno(r, "personality() failed: %m");
3020 } else if (secondary) {
3021 r = safe_personality(PER_LINUX32);
3022 if (r < 0)
3023 return log_error_errno(r, "personality() failed: %m");
3024 }
3025
3026 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3029
3030 #if HAVE_SECCOMP
3031 if (arg_seccomp) {
3032
3033 if (is_seccomp_available()) {
3034
3035 r = seccomp_load(arg_seccomp);
3036 if (ERRNO_IS_SECCOMP_FATAL(r))
3037 return log_error_errno(r, "Failed to install seccomp filter: %m");
3038 if (r < 0)
3039 log_debug_errno(r, "Failed to install seccomp filter: %m");
3040 }
3041 } else
3042 #endif
3043 {
3044 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3045 if (r < 0)
3046 return r;
3047 }
3048
3049 #if HAVE_SELINUX
3050 if (arg_selinux_context)
3051 if (setexeccon(arg_selinux_context) < 0)
3052 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3053 #endif
3054
3055 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3056 * if we need to later on. */
3057 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3058 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3059
3060 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3061 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3062 else
3063 r = change_uid_gid(arg_user, &home);
3064 if (r < 0)
3065 return r;
3066
3067 r = drop_capabilities(getuid());
3068 if (r < 0)
3069 return log_error_errno(r, "Dropping capabilities failed: %m");
3070
3071 if (arg_no_new_privileges)
3072 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3073 return log_error_errno(errno, "Failed to disable new privileges: %m");
3074
3075 /* LXC sets container=lxc, so follow the scheme here */
3076 envp[n_env++] = strjoina("container=", arg_container_service_name);
3077
3078 envp[n_env] = strv_find_prefix(environ, "TERM=");
3079 if (envp[n_env])
3080 n_env++;
3081
3082 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3083 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3084 return log_oom();
3085
3086 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3087 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3088 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3089 return log_oom();
3090
3091 assert(!sd_id128_is_null(arg_uuid));
3092
3093 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3094 return log_oom();
3095
3096 if (fdset_size(fds) > 0) {
3097 r = fdset_cloexec(fds, false);
3098 if (r < 0)
3099 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3100
3101 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3102 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3103 return log_oom();
3104 }
3105 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3106 return log_oom();
3107
3108 env_use = strv_env_merge(2, envp, arg_setenv);
3109 if (!env_use)
3110 return log_oom();
3111
3112 /* Let the parent know that we are ready and
3113 * wait until the parent is ready with the
3114 * setup, too... */
3115 if (!barrier_place_and_sync(barrier)) /* #5 */
3116 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3117 "Parent died too early");
3118
3119 if (arg_chdir)
3120 if (chdir(arg_chdir) < 0)
3121 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3122
3123 if (arg_start_mode == START_PID2) {
3124 r = stub_pid1(arg_uuid);
3125 if (r < 0)
3126 return r;
3127 }
3128
3129 log_debug("Inner child completed, invoking payload.");
3130
3131 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3132 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3133 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3134 log_close();
3135 log_set_open_when_needed(true);
3136
3137 (void) fdset_close_others(fds);
3138
3139 if (arg_start_mode == START_BOOT) {
3140 char **a;
3141 size_t m;
3142
3143 /* Automatically search for the init system */
3144
3145 m = strv_length(arg_parameters);
3146 a = newa(char*, m + 2);
3147 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3148 a[1 + m] = NULL;
3149
3150 a[0] = (char*) "/usr/lib/systemd/systemd";
3151 execve(a[0], a, env_use);
3152
3153 a[0] = (char*) "/lib/systemd/systemd";
3154 execve(a[0], a, env_use);
3155
3156 a[0] = (char*) "/sbin/init";
3157 execve(a[0], a, env_use);
3158
3159 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3160 } else if (!strv_isempty(arg_parameters)) {
3161 const char *dollar_path;
3162
3163 exec_target = arg_parameters[0];
3164
3165 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3166 * binary. */
3167 dollar_path = strv_env_get(env_use, "PATH");
3168 if (dollar_path) {
3169 if (putenv((char*) dollar_path) != 0)
3170 return log_error_errno(errno, "Failed to update $PATH: %m");
3171 }
3172
3173 execvpe(arg_parameters[0], arg_parameters, env_use);
3174 } else {
3175 if (!arg_chdir)
3176 /* If we cannot change the directory, we'll end up in /, that is expected. */
3177 (void) chdir(home ?: "/root");
3178
3179 execle("/bin/bash", "-bash", NULL, env_use);
3180 execle("/bin/sh", "-sh", NULL, env_use);
3181
3182 exec_target = "/bin/bash, /bin/sh";
3183 }
3184
3185 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3186 }
3187
3188 static int setup_sd_notify_child(void) {
3189 _cleanup_close_ int fd = -1;
3190 union sockaddr_union sa = {
3191 .un.sun_family = AF_UNIX,
3192 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3193 };
3194 int r;
3195
3196 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3197 if (fd < 0)
3198 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3199
3200 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3201 (void) sockaddr_un_unlink(&sa.un);
3202
3203 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3204 if (r < 0)
3205 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3206
3207 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3208 if (r < 0)
3209 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3210
3211 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3212 if (r < 0)
3213 return log_error_errno(r, "SO_PASSCRED failed: %m");
3214
3215 return TAKE_FD(fd);
3216 }
3217
/* Entry point of the "outer" child process.
 *
 * In order, this function: arms PR_SET_PDEATHSIG, resets the audit loginuid, turns the whole mount tree
 * into a slave of the host's, mounts the root of a dissected image (if any), determines the UID shift and
 * exchanges it with the parent over uid_shift_socket, prepares 'directory' (bind mount, pivot root,
 * volatile mode, remaining image partitions, ownership, base file system, API mounts, device nodes,
 * timezone, resolv.conf, machine ID, journal, custom mounts, cgroups), moves it to the root, creates the
 * notification socket and finally raw_clone()s the "inner" child.
 *
 * In the parent side of that clone, the inner child's PID, arg_uuid and the notification socket fd are
 * sent over pid_socket, uuid_socket and notify_socket, the sockets are closed and 0 is returned. Every
 * failing step returns its negative error value right away.
 *
 * In the cloned child, inner_child() is invoked and the process terminates through _exit(); the only way
 * the child returns from this function is when joining the network namespace fails. */
3218 static int outer_child(
3219 Barrier *barrier,
3220 const char *directory,
3221 DissectedImage *dissected_image,
3222 bool secondary,
3223 int pid_socket,
3224 int uuid_socket,
3225 int notify_socket,
3226 int kmsg_socket,
3227 int rtnl_socket,
3228 int uid_shift_socket,
3229 int master_pty_socket,
3230 int unified_cgroup_hierarchy_socket,
3231 FDSet *fds,
3232 int netns_fd) {
3233
3234 _cleanup_close_ int fd = -1;
3235 pid_t pid;
3236 ssize_t l;
3237 int r;
3238
3239 /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It already has
3240 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3241 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3242 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3243
3244 assert(barrier);
3245 assert(directory);
3246 assert(pid_socket >= 0);
3247 assert(uuid_socket >= 0);
3248 assert(notify_socket >= 0);
3249 assert(master_pty_socket >= 0);
3250 assert(kmsg_socket >= 0);
3251
3252 log_debug("Outer child is initializing.");
3253
3254 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3255 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3256
3257 r = reset_audit_loginuid();
3258 if (r < 0)
3259 return r;
3260
3261 /* Mark everything as slave, so that we still
3262 * receive mounts from the real root, but don't
3263 * propagate mounts to the real root. */
3264 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3265 if (r < 0)
3266 return r;
3267
3268 if (dissected_image) {
3269 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3270 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3271 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3272 * makes sure ESP partitions and userns are compatible. */
3273
3274 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3275 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3276 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3277 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3278 if (r < 0)
3279 return r;
3280 }
3281
3282 r = determine_uid_shift(directory);
3283 if (r < 0)
3284 return r;
3285
3286 if (arg_userns_mode != USER_NAMESPACE_NO) {
3287 /* Let the parent know which UID shift we read from the image */
3288 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3289 if (l < 0)
3290 return log_error_errno(errno, "Failed to send UID shift: %m");
3291 if (l != sizeof(arg_uid_shift))
3292 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3293 "Short write while sending UID shift.");
3294
3295 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3296 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3297 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3298 * not it will pick a different one, and send it back to us. */
3299
3300 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3301 if (l < 0)
3302 return log_error_errno(errno, "Failed to recv UID shift: %m");
3303 if (l != sizeof(arg_uid_shift))
3304 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3305 "Short read while receiving UID shift.");
3306 }
3307
3308 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3309 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3310 }
3311
3312 if (path_equal(directory, "/")) {
3313 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3314 * place, so that we can make changes to its mount structure (for example, to implement
3315 * --volatile=) without this interfering with our ability to access files such as
3316 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3317 * (instead of a temporary directory, since we are living in our own mount namespace here
3318 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3319 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3320
3321 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3322 if (r < 0)
3323 return r;
3324
3325 directory = "/run/systemd/nspawn-root";
3326
3327 } else if (!dissected_image) {
3328 /* Turn directory into bind mount (we need that so that we can move the bind mount to root
3329 * later on). */
3330 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3331 if (r < 0)
3332 return r;
3333 }
3334
3335 r = setup_pivot_root(
3336 directory,
3337 arg_pivot_root_new,
3338 arg_pivot_root_old);
3339 if (r < 0)
3340 return r;
3341
3342 r = setup_volatile_mode(
3343 directory,
3344 arg_volatile_mode,
3345 arg_userns_mode != USER_NAMESPACE_NO,
3346 arg_uid_shift,
3347 arg_uid_range,
3348 arg_selinux_apifs_context);
3349 if (r < 0)
3350 return r;
3351
3352 if (dissected_image) {
3353 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3354 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3355 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3356 if (r < 0)
3357 return r;
3358 }
3359
3360 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3361 /* OK, we don't know yet which cgroup mode to use. Let's figure it out, and tell the parent. */
3362
3363 r = detect_unified_cgroup_hierarchy_from_image(directory);
3364 if (r < 0)
3365 return r;
3366
3367 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3368 if (l < 0)
3369 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3370 if (l != sizeof(arg_unified_cgroup_hierarchy))
3371 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3372 "Short write while sending cgroup mode.");
3373
3374 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3375 }
3376
3377 /* Mark everything as shared so our mounts get propagated down. This is
3378 * required to make new bind mounts available in systemd services
3379 * inside the container that create a new mount namespace.
3380 * See https://github.com/systemd/systemd/issues/3860
3381 * Further submounts (such as /dev) done after this will inherit the
3382 * shared propagation mode. */
3383 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3384 if (r < 0)
3385 return r;
3386
3387 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3388 if (r < 0)
3389 return r;
3390
3391 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3392 if (r < 0)
3393 return r;
3394
3395 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
3396 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3397 if (r < 0)
3398 return log_error_errno(r, "Failed to make tree read-only: %m");
3399 }
3400
3401 r = mount_all(directory,
3402 arg_mount_settings,
3403 arg_uid_shift,
3404 arg_selinux_apifs_context);
3405 if (r < 0)
3406 return r;
3407
3408 r = copy_devnodes(directory);
3409 if (r < 0)
3410 return r;
3411
3412 r = make_extra_nodes(directory);
3413 if (r < 0)
3414 return r;
3415
3416 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3417 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
3418
3419 r = setup_pts(directory);
3420 if (r < 0)
3421 return r;
3422
3423 r = setup_propagate(directory);
3424 if (r < 0)
3425 return r;
3426
3427 r = setup_keyring();
3428 if (r < 0)
3429 return r;
3430
3431 r = setup_timezone(directory);
3432 if (r < 0)
3433 return r;
3434
3435 r = setup_resolv_conf(directory);
3436 if (r < 0)
3437 return r;
3438
3439 r = setup_machine_id(directory);
3440 if (r < 0)
3441 return r;
3442
3443 r = setup_journal(directory);
3444 if (r < 0)
3445 return r;
3446
3447 r = mount_custom(
3448 directory,
3449 arg_custom_mounts,
3450 arg_n_custom_mounts,
3451 arg_userns_mode != USER_NAMESPACE_NO,
3452 arg_uid_shift,
3453 arg_uid_range,
3454 arg_selinux_apifs_context,
3455 false);
3456 if (r < 0)
3457 return r;
3458
3459 if (!arg_use_cgns) {
3460 r = mount_cgroups(
3461 directory,
3462 arg_unified_cgroup_hierarchy,
3463 arg_userns_mode != USER_NAMESPACE_NO,
3464 arg_uid_shift,
3465 arg_uid_range,
3466 arg_selinux_apifs_context,
3467 false);
3468 if (r < 0)
3469 return r;
3470 }
3471
3472 r = mount_move_root(directory);
3473 if (r < 0)
3474 return log_error_errno(r, "Failed to move root directory: %m");
3475
3476 fd = setup_sd_notify_child();
3477 if (fd < 0)
3478 return fd;
3479
3480 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3481 arg_clone_ns_flags |
3482 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3483 if (pid < 0)
3484 return log_error_errno(errno, "Failed to fork inner child: %m");
3485 if (pid == 0) {
3486 pid_socket = safe_close(pid_socket);
3487 uuid_socket = safe_close(uuid_socket);
3488 notify_socket = safe_close(notify_socket);
3489 uid_shift_socket = safe_close(uid_shift_socket);
3490
3491 /* The inner child has all namespaces that are
3492 * requested, so that we all are owned by the user if
3493 * user namespaces are turned on. */
3494
3495 if (arg_network_namespace_path) {
3496 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3497 if (r < 0)
3498 return log_error_errno(r, "Failed to join network namespace: %m");
3499 }
3500
3501 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
3502 if (r < 0)
3503 _exit(EXIT_FAILURE);
3504
3505 _exit(EXIT_SUCCESS);
3506 }
3507
/* From here on we are the outer child again: report the inner child's PID, the machine ID and the
 * notification socket to the parent, each over its own socket. */
3508 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3509 if (l < 0)
3510 return log_error_errno(errno, "Failed to send PID: %m");
3511 if (l != sizeof(pid))
3512 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3513 "Short write while sending PID.");
3514
3515 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3516 if (l < 0)
3517 return log_error_errno(errno, "Failed to send machine ID: %m");
3518 if (l != sizeof(arg_uuid))
3519 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3520 "Short write while sending machine ID.");
3521
3522 l = send_one_fd(notify_socket, fd, 0);
3523 if (l < 0)
3524 return log_error_errno(l, "Failed to send notify fd: %m");
3525
3526 pid_socket = safe_close(pid_socket);
3527 uuid_socket = safe_close(uuid_socket);
3528 notify_socket = safe_close(notify_socket);
3529 master_pty_socket = safe_close(master_pty_socket);
3530 kmsg_socket = safe_close(kmsg_socket);
3531 rtnl_socket = safe_close(rtnl_socket);
3532 netns_fd = safe_close(netns_fd);
3533
3534 return 0;
3535 }
3536
3537 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3538 bool tried_hashed = false;
3539 unsigned n_tries = 100;
3540 uid_t candidate;
3541 int r;
3542
3543 assert(shift);
3544 assert(ret_lock_file);
3545 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3546 assert(arg_uid_range == 0x10000U);
3547
3548 candidate = *shift;
3549
3550 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3551
3552 for (;;) {
3553 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3554 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3555
3556 if (--n_tries <= 0)
3557 return -EBUSY;
3558
3559 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3560 goto next;
3561 if ((candidate & UINT32_C(0xFFFF)) != 0)
3562 goto next;
3563
3564 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3565 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3566 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3567 goto next;
3568 if (r < 0)
3569 return r;
3570
3571 /* Make some superficial checks whether the range is currently known in the user database */
3572 if (getpwuid(candidate))
3573 goto next;
3574 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3575 goto next;
3576 if (getgrgid(candidate))
3577 goto next;
3578 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3579 goto next;
3580
3581 *ret_lock_file = lf;
3582 lf = (struct LockFile) LOCK_FILE_INIT;
3583 *shift = candidate;
3584 return 0;
3585
3586 next:
3587 if (arg_machine && !tried_hashed) {
3588 /* Try to hash the base from the container name */
3589
3590 static const uint8_t hash_key[] = {
3591 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3592 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3593 };
3594
3595 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3596
3597 tried_hashed = true;
3598 } else
3599 random_bytes(&candidate, sizeof(candidate));
3600
3601 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3602 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3603 }
3604 }
3605
3606 static int setup_uid_map(pid_t pid) {
3607 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3608 int r;
3609
3610 assert(pid > 1);
3611
3612 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3613 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3614 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3615 if (r < 0)
3616 return log_error_errno(r, "Failed to write UID map: %m");
3617
3618 /* We always assign the same UID and GID ranges */
3619 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3620 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3621 if (r < 0)
3622 return log_error_errno(r, "Failed to write GID map: %m");
3623
3624 return 0;
3625 }
3626
3627 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3628 char buf[NOTIFY_BUFFER_MAX+1];
3629 char *p = NULL;
3630 struct iovec iovec = {
3631 .iov_base = buf,
3632 .iov_len = sizeof(buf)-1,
3633 };
3634 union {
3635 struct cmsghdr cmsghdr;
3636 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3637 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3638 } control = {};
3639 struct msghdr msghdr = {
3640 .msg_iov = &iovec,
3641 .msg_iovlen = 1,
3642 .msg_control = &control,
3643 .msg_controllen = sizeof(control),
3644 };
3645 struct cmsghdr *cmsg;
3646 struct ucred *ucred = NULL;
3647 ssize_t n;
3648 pid_t inner_child_pid;
3649 _cleanup_strv_free_ char **tags = NULL;
3650
3651 assert(userdata);
3652
3653 inner_child_pid = PTR_TO_PID(userdata);
3654
3655 if (revents != EPOLLIN) {
3656 log_warning("Got unexpected poll event for notify fd.");
3657 return 0;
3658 }
3659
3660 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3661 if (n < 0) {
3662 if (IN_SET(errno, EAGAIN, EINTR))
3663 return 0;
3664
3665 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3666 }
3667 cmsg_close_all(&msghdr);
3668
3669 CMSG_FOREACH(cmsg, &msghdr) {
3670 if (cmsg->cmsg_level == SOL_SOCKET &&
3671 cmsg->cmsg_type == SCM_CREDENTIALS &&
3672 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3673
3674 ucred = (struct ucred*) CMSG_DATA(cmsg);
3675 }
3676 }
3677
3678 if (!ucred || ucred->pid != inner_child_pid) {
3679 log_debug("Received notify message without valid credentials. Ignoring.");
3680 return 0;
3681 }
3682
3683 if ((size_t) n >= sizeof(buf)) {
3684 log_warning("Received notify message exceeded maximum size. Ignoring.");
3685 return 0;
3686 }
3687
3688 buf[n] = 0;
3689 tags = strv_split(buf, "\n\r");
3690 if (!tags)
3691 return log_oom();
3692
3693 if (strv_find(tags, "READY=1"))
3694 (void) sd_notifyf(false, "READY=1\n");
3695
3696 p = strv_find_startswith(tags, "STATUS=");
3697 if (p)
3698 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
3699
3700 return 0;
3701 }
3702
3703 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3704 int r;
3705
3706 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3707 if (r < 0)
3708 return log_error_errno(r, "Failed to allocate notify event source: %m");
3709
3710 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3711
3712 return 0;
3713 }
3714
3715 static int merge_settings(Settings *settings, const char *path) {
3716 int rl;
3717
3718 assert(settings);
3719 assert(path);
3720
3721 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3722 * that this steals the fields of the Settings* structure, and hence modifies it. */
3723
3724 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3725 settings->start_mode >= 0) {
3726 arg_start_mode = settings->start_mode;
3727 strv_free_and_replace(arg_parameters, settings->parameters);
3728 }
3729
3730 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3731 arg_ephemeral = settings->ephemeral;
3732
3733 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3734 settings->root) {
3735
3736 if (!arg_settings_trusted)
3737 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3738 else
3739 free_and_replace(arg_directory, settings->root);
3740 }
3741
3742 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3743 settings->pivot_root_new) {
3744 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3745 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3746 }
3747
3748 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3749 settings->working_directory)
3750 free_and_replace(arg_chdir, settings->working_directory);
3751
3752 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3753 settings->environment)
3754 strv_free_and_replace(arg_setenv, settings->environment);
3755
3756 if ((arg_settings_mask & SETTING_USER) == 0) {
3757
3758 if (settings->user)
3759 free_and_replace(arg_user, settings->user);
3760
3761 if (uid_is_valid(settings->uid))
3762 arg_uid = settings->uid;
3763 if (gid_is_valid(settings->gid))
3764 arg_gid = settings->gid;
3765 if (settings->n_supplementary_gids > 0) {
3766 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3767 arg_n_supplementary_gids = settings->n_supplementary_gids;
3768 }
3769 }
3770
3771 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3772 uint64_t plus, minus;
3773
3774 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3775 * Settings structure */
3776
3777 plus = settings->capability;
3778 minus = settings->drop_capability;
3779
3780 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3781 if (settings_private_network(settings))
3782 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3783 else
3784 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3785 }
3786
3787 if (!arg_settings_trusted && plus != 0) {
3788 if (settings->capability != 0)
3789 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3790 } else
3791 arg_caps_retain |= plus;
3792
3793 arg_caps_retain &= ~minus;
3794
3795 /* Copy the full capabilities over too */
3796 if (capability_quintet_is_set(&settings->full_capabilities)) {
3797 if (!arg_settings_trusted)
3798 log_warning("Ignoring capability settings, file %s is not trusted.", path);
3799 else
3800 arg_full_capabilities = settings->full_capabilities;
3801 }
3802 }
3803
3804 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3805 settings->kill_signal > 0)
3806 arg_kill_signal = settings->kill_signal;
3807
3808 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3809 settings->personality != PERSONALITY_INVALID)
3810 arg_personality = settings->personality;
3811
3812 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3813 !sd_id128_is_null(settings->machine_id)) {
3814
3815 if (!arg_settings_trusted)
3816 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3817 else
3818 arg_uuid = settings->machine_id;
3819 }
3820
3821 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3822 settings->read_only >= 0)
3823 arg_read_only = settings->read_only;
3824
3825 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3826 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3827 arg_volatile_mode = settings->volatile_mode;
3828
3829 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3830 settings->n_custom_mounts > 0) {
3831
3832 if (!arg_settings_trusted)
3833 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3834 else {
3835 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3836 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3837 arg_n_custom_mounts = settings->n_custom_mounts;
3838 settings->n_custom_mounts = 0;
3839 }
3840 }
3841
3842 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3843 (settings->private_network >= 0 ||
3844 settings->network_veth >= 0 ||
3845 settings->network_bridge ||
3846 settings->network_zone ||
3847 settings->network_interfaces ||
3848 settings->network_macvlan ||
3849 settings->network_ipvlan ||
3850 settings->network_veth_extra ||
3851 settings->network_namespace_path)) {
3852
3853 if (!arg_settings_trusted)
3854 log_warning("Ignoring network settings, file %s is not trusted.", path);
3855 else {
3856 arg_network_veth = settings_network_veth(settings);
3857 arg_private_network = settings_private_network(settings);
3858
3859 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3860 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3861 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3862 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3863
3864 free_and_replace(arg_network_bridge, settings->network_bridge);
3865 free_and_replace(arg_network_zone, settings->network_zone);
3866
3867 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
3868 }
3869 }
3870
3871 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3872 settings->expose_ports) {
3873
3874 if (!arg_settings_trusted)
3875 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3876 else {
3877 expose_port_free_all(arg_expose_ports);
3878 arg_expose_ports = TAKE_PTR(settings->expose_ports);
3879 }
3880 }
3881
3882 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3883 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3884
3885 if (!arg_settings_trusted)
3886 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3887 else {
3888 arg_userns_mode = settings->userns_mode;
3889 arg_uid_shift = settings->uid_shift;
3890 arg_uid_range = settings->uid_range;
3891 arg_userns_chown = settings->userns_chown;
3892 }
3893 }
3894
3895 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3896 arg_notify_ready = settings->notify_ready;
3897
3898 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3899
3900 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
3901 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3902 else {
3903 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3904 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3905 }
3906
3907 #if HAVE_SECCOMP
3908 if (!arg_settings_trusted && settings->seccomp)
3909 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3910 else {
3911 seccomp_release(arg_seccomp);
3912 arg_seccomp = TAKE_PTR(settings->seccomp);
3913 }
3914 #endif
3915 }
3916
3917 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3918 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3919 continue;
3920
3921 if (!settings->rlimit[rl])
3922 continue;
3923
3924 if (!arg_settings_trusted) {
3925 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3926 continue;
3927 }
3928
3929 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3930 }
3931
3932 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3933 settings->hostname)
3934 free_and_replace(arg_hostname, settings->hostname);
3935
3936 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3937 settings->no_new_privileges >= 0)
3938 arg_no_new_privileges = settings->no_new_privileges;
3939
3940 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3941 settings->oom_score_adjust_set) {
3942
3943 if (!arg_settings_trusted)
3944 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3945 else {
3946 arg_oom_score_adjust = settings->oom_score_adjust;
3947 arg_oom_score_adjust_set = true;
3948 }
3949 }
3950
3951 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3952 settings->cpu_set.set) {
3953
3954 if (!arg_settings_trusted)
3955 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3956 else {
3957 cpu_set_reset(&arg_cpu_set);
3958 arg_cpu_set = settings->cpu_set;
3959 settings->cpu_set = (CPUSet) {};
3960 }
3961 }
3962
3963 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3964 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3965 arg_resolv_conf = settings->resolv_conf;
3966
3967 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3968 settings->link_journal != _LINK_JOURNAL_INVALID) {
3969
3970 if (!arg_settings_trusted)
3971 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3972 else {
3973 arg_link_journal = settings->link_journal;
3974 arg_link_journal_try = settings->link_journal_try;
3975 }
3976 }
3977
3978 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3979 settings->timezone != _TIMEZONE_MODE_INVALID)
3980 arg_timezone = settings->timezone;
3981
3982 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3983 settings->slice) {
3984
3985 if (!arg_settings_trusted)
3986 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3987 else
3988 free_and_replace(arg_slice, settings->slice);
3989 }
3990
3991 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
3992 settings->use_cgns >= 0) {
3993
3994 if (!arg_settings_trusted)
3995 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
3996 else
3997 arg_use_cgns = settings->use_cgns;
3998 }
3999
4000 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4001 settings->clone_ns_flags != (unsigned long) -1) {
4002
4003 if (!arg_settings_trusted)
4004 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4005 else
4006 arg_clone_ns_flags = settings->clone_ns_flags;
4007 }
4008
4009 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4010 settings->console_mode >= 0) {
4011
4012 if (!arg_settings_trusted)
4013 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4014 else
4015 arg_console_mode = settings->console_mode;
4016 }
4017
4018 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4019 * don't consult arg_settings_mask for them. */
4020
4021 sd_bus_message_unref(arg_property_message);
4022 arg_property_message = TAKE_PTR(settings->properties);
4023
4024 arg_console_width = settings->console_width;
4025 arg_console_height = settings->console_height;
4026
4027 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4028 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4029 arg_n_extra_nodes = settings->n_extra_nodes;
4030
4031 return 0;
4032 }
4033
4034 static int load_settings(void) {
4035 _cleanup_(settings_freep) Settings *settings = NULL;
4036 _cleanup_fclose_ FILE *f = NULL;
4037 _cleanup_free_ char *p = NULL;
4038 const char *fn, *i;
4039 int r;
4040
4041 if (arg_oci_bundle)
4042 return 0;
4043
4044 /* If all settings are masked, there's no point in looking for
4045 * the settings file */
4046 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4047 return 0;
4048
4049 fn = strjoina(arg_machine, ".nspawn");
4050
4051 /* We first look in the admin's directories in /etc and /run */
4052 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4053 _cleanup_free_ char *j = NULL;
4054
4055 j = path_join(i, fn);
4056 if (!j)
4057 return log_oom();
4058
4059 f = fopen(j, "re");
4060 if (f) {
4061 p = TAKE_PTR(j);
4062
4063 /* By default, we trust configuration from /etc and /run */
4064 if (arg_settings_trusted < 0)
4065 arg_settings_trusted = true;
4066
4067 break;
4068 }
4069
4070 if (errno != ENOENT)
4071 return log_error_errno(errno, "Failed to open %s: %m", j);
4072 }
4073
4074 if (!f) {
4075 /* After that, let's look for a file next to the
4076 * actual image we shall boot. */
4077
4078 if (arg_image) {
4079 p = file_in_same_dir(arg_image, fn);
4080 if (!p)
4081 return log_oom();
4082 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4083 p = file_in_same_dir(arg_directory, fn);
4084 if (!p)
4085 return log_oom();
4086 }
4087
4088 if (p) {
4089 f = fopen(p, "re");
4090 if (!f && errno != ENOENT)
4091 return log_error_errno(errno, "Failed to open %s: %m", p);
4092
4093 /* By default, we do not trust configuration from /var/lib/machines */
4094 if (arg_settings_trusted < 0)
4095 arg_settings_trusted = false;
4096 }
4097 }
4098
4099 if (!f)
4100 return 0;
4101
4102 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4103
4104 r = settings_load(f, p, &settings);
4105 if (r < 0)
4106 return r;
4107
4108 return merge_settings(settings, p);
4109 }
4110
4111 static int load_oci_bundle(void) {
4112 _cleanup_(settings_freep) Settings *settings = NULL;
4113 int r;
4114
4115 if (!arg_oci_bundle)
4116 return 0;
4117
4118 /* By default let's trust OCI bundles */
4119 if (arg_settings_trusted < 0)
4120 arg_settings_trusted = true;
4121
4122 r = oci_load(NULL, arg_oci_bundle, &settings);
4123 if (r < 0)
4124 return r;
4125
4126 return merge_settings(settings, arg_oci_bundle);
4127 }
4128
4129 static int run_container(
4130 DissectedImage *dissected_image,
4131 bool secondary,
4132 FDSet *fds,
4133 char veth_name[IFNAMSIZ], bool *veth_created,
4134 union in_addr_union *exposed,
4135 int *master, pid_t *pid, int *ret) {
4136
4137 static const struct sigaction sa = {
4138 .sa_handler = nop_signal_handler,
4139 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4140 };
4141
4142 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4143 _cleanup_close_ int etc_passwd_lock = -1;
4144 _cleanup_close_pair_ int
4145 kmsg_socket_pair[2] = { -1, -1 },
4146 rtnl_socket_pair[2] = { -1, -1 },
4147 pid_socket_pair[2] = { -1, -1 },
4148 uuid_socket_pair[2] = { -1, -1 },
4149 notify_socket_pair[2] = { -1, -1 },
4150 uid_shift_socket_pair[2] = { -1, -1 },
4151 master_pty_socket_pair[2] = { -1, -1 },
4152 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4153
4154 _cleanup_close_ int notify_socket = -1;
4155 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4156 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4157 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4158 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4159 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4160 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4161 ContainerStatus container_status = 0;
4162 int ifi = 0, r;
4163 ssize_t l;
4164 sigset_t mask_chld;
4165 _cleanup_close_ int netns_fd = -1;
4166
4167 assert_se(sigemptyset(&mask_chld) == 0);
4168 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4169
4170 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4171 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4172 * check with getpwuid() if the specific user already exists. Note that /etc might be
4173 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4174 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4175 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4176 * really ours. */
4177
4178 etc_passwd_lock = take_etc_passwd_lock(NULL);
4179 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4180 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4181 }
4182
4183 r = barrier_create(&barrier);
4184 if (r < 0)
4185 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4186
4187 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4188 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4189
4190 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4191 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4192
4193 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4194 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4195
4196 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4197 return log_error_errno(errno, "Failed to create id socket pair: %m");
4198
4199 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4200 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4201
4202 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4203 return log_error_errno(errno, "Failed to create console socket pair: %m");
4204
4205 if (arg_userns_mode != USER_NAMESPACE_NO)
4206 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4207 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4208
4209 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4210 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4211 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4212
4213 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4214 * parent's blocking calls and give it a chance to call wait() and terminate. */
4215 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4216 if (r < 0)
4217 return log_error_errno(errno, "Failed to change the signal mask: %m");
4218
4219 r = sigaction(SIGCHLD, &sa, NULL);
4220 if (r < 0)
4221 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4222
4223 if (arg_network_namespace_path) {
4224 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4225 if (netns_fd < 0)
4226 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4227
4228 r = fd_is_network_ns(netns_fd);
4229 if (r == -EUCLEAN)
4230 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4231 else if (r < 0)
4232 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4233 else if (r == 0)
4234 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4235 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4236 }
4237
4238 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4239 if (*pid < 0)
4240 return log_error_errno(errno, "clone() failed%s: %m",
4241 errno == EINVAL ?
4242 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4243
4244 if (*pid == 0) {
4245 /* The outer child only has a file system namespace. */
4246 barrier_set_role(&barrier, BARRIER_CHILD);
4247
4248 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4249 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4250 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4251 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4252 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4253 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4254 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4255 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4256
4257 (void) reset_all_signal_handlers();
4258 (void) reset_signal_mask();
4259
4260 r = outer_child(&barrier,
4261 arg_directory,
4262 dissected_image,
4263 secondary,
4264 pid_socket_pair[1],
4265 uuid_socket_pair[1],
4266 notify_socket_pair[1],
4267 kmsg_socket_pair[1],
4268 rtnl_socket_pair[1],
4269 uid_shift_socket_pair[1],
4270 master_pty_socket_pair[1],
4271 unified_cgroup_hierarchy_socket_pair[1],
4272 fds,
4273 netns_fd);
4274 if (r < 0)
4275 _exit(EXIT_FAILURE);
4276
4277 _exit(EXIT_SUCCESS);
4278 }
4279
4280 barrier_set_role(&barrier, BARRIER_PARENT);
4281
4282 fdset_close(fds);
4283
4284 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4285 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4286 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4287 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4288 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4289 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4290 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4291 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4292
4293 if (arg_userns_mode != USER_NAMESPACE_NO) {
4294 /* The child just let us know the UID shift it might have read from the image. */
4295 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4296 if (l < 0)
4297 return log_error_errno(errno, "Failed to read UID shift: %m");
4298 if (l != sizeof arg_uid_shift)
4299 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4300
4301 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4302 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4303 * image, but if that's already in use, pick a new one, and report back to the child,
4304 * which one we now picked. */
4305
4306 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4307 if (r < 0)
4308 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4309
4310 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4311 if (l < 0)
4312 return log_error_errno(errno, "Failed to send UID shift: %m");
4313 if (l != sizeof arg_uid_shift)
4314 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4315 }
4316 }
4317
4318 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4319 /* The child let us know the support cgroup mode it might have read from the image. */
4320 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4321 if (l < 0)
4322 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4323 if (l != sizeof(arg_unified_cgroup_hierarchy))
4324 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4325 l, l == 0 ? " The child is most likely dead." : "");
4326 }
4327
4328 /* Wait for the outer child. */
4329 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4330 if (r < 0)
4331 return r;
4332 if (r != EXIT_SUCCESS)
4333 return -EIO;
4334
4335 /* And now retrieve the PID of the inner child. */
4336 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4337 if (l < 0)
4338 return log_error_errno(errno, "Failed to read inner child PID: %m");
4339 if (l != sizeof *pid)
4340 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4341
4342 /* We also retrieve container UUID in case it was generated by outer child */
4343 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4344 if (l < 0)
4345 return log_error_errno(errno, "Failed to read container machine ID: %m");
4346 if (l != sizeof(arg_uuid))
4347 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4348
4349 /* We also retrieve the socket used for notifications generated by outer child */
4350 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4351 if (notify_socket < 0)
4352 return log_error_errno(notify_socket,
4353 "Failed to receive notification socket from the outer child: %m");
4354
4355 log_debug("Init process invoked as PID "PID_FMT, *pid);
4356
4357 if (arg_userns_mode != USER_NAMESPACE_NO) {
4358 if (!barrier_place_and_sync(&barrier)) /* #1 */
4359 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4360
4361 r = setup_uid_map(*pid);
4362 if (r < 0)
4363 return r;
4364
4365 (void) barrier_place(&barrier); /* #2 */
4366 }
4367
4368 if (arg_private_network) {
4369 if (!arg_network_namespace_path) {
4370 /* Wait until the child has unshared its network namespace. */
4371 if (!barrier_place_and_sync(&barrier)) /* #3 */
4372 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4373 }
4374
4375 r = move_network_interfaces(*pid, arg_network_interfaces);
4376 if (r < 0)
4377 return r;
4378
4379 if (arg_network_veth) {
4380 r = setup_veth(arg_machine, *pid, veth_name,
4381 arg_network_bridge || arg_network_zone);
4382 if (r < 0)
4383 return r;
4384 else if (r > 0)
4385 ifi = r;
4386
4387 if (arg_network_bridge) {
4388 /* Add the interface to a bridge */
4389 r = setup_bridge(veth_name, arg_network_bridge, false);
4390 if (r < 0)
4391 return r;
4392 if (r > 0)
4393 ifi = r;
4394 } else if (arg_network_zone) {
4395 /* Add the interface to a bridge, possibly creating it */
4396 r = setup_bridge(veth_name, arg_network_zone, true);
4397 if (r < 0)
4398 return r;
4399 if (r > 0)
4400 ifi = r;
4401 }
4402 }
4403
4404 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4405 if (r < 0)
4406 return r;
4407
4408 /* We created the primary and extra veth links now; let's remember this, so that we know to
4409 remove them later on. Note that we don't bother with removing veth links that were created
4410 here when their setup failed half-way, because in that case the kernel should be able to
4411 remove them on its own, since they cannot be referenced by anything yet. */
4412 *veth_created = true;
4413
4414 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4415 if (r < 0)
4416 return r;
4417
4418 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4419 if (r < 0)
4420 return r;
4421 }
4422
4423 if (arg_register || !arg_keep_unit) {
4424 r = sd_bus_default_system(&bus);
4425 if (r < 0)
4426 return log_error_errno(r, "Failed to open system bus: %m");
4427
4428 r = sd_bus_set_close_on_exit(bus, false);
4429 if (r < 0)
4430 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4431 }
4432
4433 if (!arg_keep_unit) {
4434 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4435 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4436 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4437
4438 r = sd_bus_match_signal_async(
4439 bus,
4440 NULL,
4441 "org.freedesktop.systemd1",
4442 NULL,
4443 "org.freedesktop.systemd1.Scope",
4444 "RequestStop",
4445 on_request_stop, NULL, PID_TO_PTR(*pid));
4446 if (r < 0)
4447 return log_error_errno(r, "Failed to request RequestStop match: %m");
4448 }
4449
4450 if (arg_register) {
4451 r = register_machine(
4452 bus,
4453 arg_machine,
4454 *pid,
4455 arg_directory,
4456 arg_uuid,
4457 ifi,
4458 arg_slice,
4459 arg_custom_mounts, arg_n_custom_mounts,
4460 arg_kill_signal,
4461 arg_property,
4462 arg_property_message,
4463 arg_keep_unit,
4464 arg_container_service_name);
4465 if (r < 0)
4466 return r;
4467
4468 } else if (!arg_keep_unit) {
4469 r = allocate_scope(
4470 bus,
4471 arg_machine,
4472 *pid,
4473 arg_slice,
4474 arg_custom_mounts, arg_n_custom_mounts,
4475 arg_kill_signal,
4476 arg_property,
4477 arg_property_message);
4478 if (r < 0)
4479 return r;
4480
4481 } else if (arg_slice || arg_property)
4482 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
4483
4484 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
4485 if (r < 0)
4486 return r;
4487
4488 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4489 if (r < 0)
4490 return r;
4491
4492 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4493 if (r < 0)
4494 return r;
4495
4496 /* Notify the child that the parent is ready with all
4497 * its setup (including cgroup-ification), and that
4498 * the child can now hand over control to the code to
4499 * run inside the container. */
4500 (void) barrier_place(&barrier); /* #4 */
4501
4502 /* Block SIGCHLD here, before notifying child.
4503 * process_pty() will handle it with the other signals. */
4504 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4505
4506 /* Reset signal to default */
4507 r = default_signals(SIGCHLD, -1);
4508 if (r < 0)
4509 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4510
4511 r = sd_event_new(&event);
4512 if (r < 0)
4513 return log_error_errno(r, "Failed to get default event source: %m");
4514
4515 (void) sd_event_set_watchdog(event, true);
4516
4517 if (bus) {
4518 r = sd_bus_attach_event(bus, event, 0);
4519 if (r < 0)
4520 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4521 }
4522
4523 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
4524 if (r < 0)
4525 return r;
4526
4527 /* Let the child know that we are ready and wait that the child is completely ready now. */
4528 if (!barrier_place_and_sync(&barrier)) /* #5 */
4529 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4530
4531 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4532 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4533 etc_passwd_lock = safe_close(etc_passwd_lock);
4534
4535 (void) sd_notifyf(false,
4536 "STATUS=Container running.\n"
4537 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4538 if (!arg_notify_ready)
4539 (void) sd_notify(false, "READY=1\n");
4540
4541 if (arg_kill_signal > 0) {
4542 /* Try to kill the init system on SIGINT or SIGTERM */
4543 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4544 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4545 } else {
4546 /* Immediately exit */
4547 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4548 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4549 }
4550
4551 /* Exit when the child exits */
4552 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4553
4554 if (arg_expose_ports) {
4555 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4556 if (r < 0)
4557 return r;
4558
4559 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4560 }
4561
4562 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4563
4564 if (arg_console_mode != CONSOLE_PIPE) {
4565 _cleanup_close_ int fd = -1;
4566 PTYForwardFlags flags = 0;
4567
4568 /* Retrieve the master pty allocated by inner child */
4569 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4570 if (fd < 0)
4571 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4572
4573 switch (arg_console_mode) {
4574
4575 case CONSOLE_READ_ONLY:
4576 flags |= PTY_FORWARD_READ_ONLY;
4577
4578 _fallthrough_;
4579
4580 case CONSOLE_INTERACTIVE:
4581 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4582
4583 r = pty_forward_new(event, fd, flags, &forward);
4584 if (r < 0)
4585 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4586
4587 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4588 (void) pty_forward_set_width_height(forward,
4589 arg_console_width,
4590 arg_console_height);
4591 break;
4592
4593 default:
4594 assert(arg_console_mode == CONSOLE_PASSIVE);
4595 }
4596
4597 *master = TAKE_FD(fd);
4598 }
4599
4600 r = sd_event_loop(event);
4601 if (r < 0)
4602 return log_error_errno(r, "Failed to run event loop: %m");
4603
4604 if (forward) {
4605 char last_char = 0;
4606
4607 (void) pty_forward_get_last_char(forward, &last_char);
4608 forward = pty_forward_free(forward);
4609
4610 if (!arg_quiet && last_char != '\n')
4611 putc('\n', stdout);
4612 }
4613
4614 /* Kill if it is not dead yet anyway */
4615 if (!arg_register && !arg_keep_unit && bus)
4616 terminate_scope(bus, arg_machine);
4617
4618 /* Normally redundant, but better safe than sorry */
4619 (void) kill(*pid, SIGKILL);
4620
4621 r = wait_for_container(*pid, &container_status);
4622 *pid = 0;
4623
4624 /* Tell machined that we are gone. */
4625 if (bus)
4626 (void) unregister_machine(bus, arg_machine);
4627
4628 if (r < 0)
4629 /* We failed to wait for the container, or the container exited abnormally. */
4630 return r;
4631 if (r > 0 || container_status == CONTAINER_TERMINATED) {
4632 /* r > 0 → The container exited with a non-zero status.
4633 * As a special case, we need to replace 133 with a different value,
4634 * because 133 is special-cased in the service file to reboot the container.
4635 * otherwise → The container exited with zero status and a reboot was not requested.
4636 */
4637 if (r == EXIT_FORCE_RESTART)
4638 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4639 *ret = r;
4640 return 0; /* finito */
4641 }
4642
4643 /* CONTAINER_REBOOTED, loop again */
4644
4645 if (arg_keep_unit) {
4646 /* Special handling if we are running as a service: instead of simply
4647 * restarting the machine we want to restart the entire service, so let's
4648 * inform systemd about this with the special exit code 133. The service
4649 * file uses RestartForceExitStatus=133 so that this results in a full
4650 * nspawn restart. This is necessary since we might have cgroup parameters
4651 * set we want to have flushed out. */
4652 *ret = EXIT_FORCE_RESTART;
4653 return 0; /* finito */
4654 }
4655
4656 expose_port_flush(arg_expose_ports, exposed);
4657
4658 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4659 *veth_created = false;
4660 return 1; /* loop again */
4661 }
4662
4663 static int initialize_rlimits(void) {
4664 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4665 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4666 * container execution environments. */
4667
4668 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4669 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4670 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4671 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4672 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4673 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4674 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4675 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4676 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4677 [RLIMIT_NICE] = { 0, 0 },
4678 [RLIMIT_NOFILE] = { 1024, 4096 },
4679 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4680 [RLIMIT_RTPRIO] = { 0, 0 },
4681 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4682 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4683
4684 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4685 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4686 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4687 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4688 * that PID 1 changes a number of other resource limits during early initialization which is why we
4689 * don't read the other limits from PID 1 but prefer the static table above. */
4690 };
4691
4692 int rl;
4693
4694 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4695 /* Let's only fill in what the user hasn't explicitly configured anyway */
4696 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4697 const struct rlimit *v;
4698 struct rlimit buffer;
4699
4700 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4701 /* For these two let's read the limits off PID 1. See above for an explanation. */
4702
4703 if (prlimit(1, rl, NULL, &buffer) < 0)
4704 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4705
4706 v = &buffer;
4707 } else
4708 v = kernel_defaults + rl;
4709
4710 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4711 if (!arg_rlimit[rl])
4712 return log_oom();
4713 }
4714
4715 if (DEBUG_LOGGING) {
4716 _cleanup_free_ char *k = NULL;
4717
4718 (void) rlimit_format(arg_rlimit[rl], &k);
4719 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4720 }
4721 }
4722
4723 return 0;
4724 }
4725
4726 static int run(int argc, char *argv[]) {
4727 bool secondary = false, remove_directory = false, remove_image = false,
4728 veth_created = false, remove_tmprootdir = false;
4729 _cleanup_close_ int master = -1;
4730 _cleanup_fdset_free_ FDSet *fds = NULL;
4731 int r, n_fd_passed, ret = EXIT_SUCCESS;
4732 char veth_name[IFNAMSIZ] = "";
4733 union in_addr_union exposed = {};
4734 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4735 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4736 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4737 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4738 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4739 pid_t pid = 0;
4740
4741 log_parse_environment();
4742 log_open();
4743
4744 r = parse_argv(argc, argv);
4745 if (r <= 0)
4746 goto finish;
4747
4748 r = must_be_root();
4749 if (r < 0)
4750 goto finish;
4751
4752 r = initialize_rlimits();
4753 if (r < 0)
4754 goto finish;
4755
4756 r = load_oci_bundle();
4757 if (r < 0)
4758 goto finish;
4759
4760 r = determine_names();
4761 if (r < 0)
4762 goto finish;
4763
4764 r = load_settings();
4765 if (r < 0)
4766 goto finish;
4767
4768 r = cg_unified();
4769 if (r < 0) {
4770 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4771 goto finish;
4772 }
4773
4774 r = verify_arguments();
4775 if (r < 0)
4776 goto finish;
4777
4778 /* Reapply environment settings. */
4779 (void) detect_unified_cgroup_hierarchy_from_environment();
4780
4781 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4782 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4783 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4784 (void) ignore_signals(SIGPIPE, -1);
4785
4786 n_fd_passed = sd_listen_fds(false);
4787 if (n_fd_passed > 0) {
4788 r = fdset_new_listen_fds(&fds, false);
4789 if (r < 0) {
4790 log_error_errno(r, "Failed to collect file descriptors: %m");
4791 goto finish;
4792 }
4793 }
4794
4795 /* The "default" umask. This is appropriate for most file and directory
4796 * operations performed by nspawn, and is the umask that will be used for
4797 * the child. Functions like copy_devnodes() change the umask temporarily. */
4798 umask(0022);
4799
4800 if (arg_directory) {
4801 assert(!arg_image);
4802
4803 /* Safety precaution: let's not allow running images from the live host OS image, as long as
4804 * /var from the host will propagate into container dynamically (because bad things happen if
4805 * two systems write to the same /var). Let's allow it for the special cases where /var is
4806 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
4807 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
4808 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
4809 r = -EINVAL;
4810 goto finish;
4811 }
4812
4813 if (arg_ephemeral) {
4814 _cleanup_free_ char *np = NULL;
4815
4816 r = chase_symlinks_and_update(&arg_directory, 0);
4817 if (r < 0)
4818 goto finish;
4819
4820 /* If the specified path is a mount point we generate the new snapshot immediately
4821 * inside it under a random name. However, if the specified path is not a mount point we
4822 * create the new snapshot in the parent directory, just next to it. */
4823 r = path_is_mount_point(arg_directory, NULL, 0);
4824 if (r < 0) {
4825 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4826 goto finish;
4827 }
4828 if (r > 0)
4829 r = tempfn_random_child(arg_directory, "machine.", &np);
4830 else
4831 r = tempfn_random(arg_directory, "machine.", &np);
4832 if (r < 0) {
4833 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4834 goto finish;
4835 }
4836
4837 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
4838 * only owned by us and no one else. */
4839 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4840 if (r < 0) {
4841 log_error_errno(r, "Failed to lock %s: %m", np);
4842 goto finish;
4843 }
4844
4845 {
4846 BLOCK_SIGNALS(SIGINT);
4847 r = btrfs_subvol_snapshot(arg_directory, np,
4848 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4849 BTRFS_SNAPSHOT_FALLBACK_COPY |
4850 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4851 BTRFS_SNAPSHOT_RECURSIVE |
4852 BTRFS_SNAPSHOT_QUOTA |
4853 BTRFS_SNAPSHOT_SIGINT);
4854 }
4855 if (r == -EINTR) {
4856 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
4857 goto finish;
4858 }
4859 if (r < 0) {
4860 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4861 goto finish;
4862 }
4863
4864 free_and_replace(arg_directory, np);
4865 remove_directory = true;
4866 } else {
4867 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4868 if (r < 0)
4869 goto finish;
4870
4871 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4872 if (r == -EBUSY) {
4873 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4874 goto finish;
4875 }
4876 if (r < 0) {
4877 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4878 goto finish;
4879 }
4880
4881 if (arg_template) {
4882 r = chase_symlinks_and_update(&arg_template, 0);
4883 if (r < 0)
4884 goto finish;
4885
4886 {
4887 BLOCK_SIGNALS(SIGINT);
4888 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4889 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4890 BTRFS_SNAPSHOT_FALLBACK_COPY |
4891 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4892 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4893 BTRFS_SNAPSHOT_RECURSIVE |
4894 BTRFS_SNAPSHOT_QUOTA |
4895 BTRFS_SNAPSHOT_SIGINT);
4896 }
4897 if (r == -EEXIST)
4898 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4899 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4900 else if (r == -EINTR) {
4901 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
4902 goto finish;
4903 } else if (r < 0) {
4904 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4905 goto finish;
4906 } else
4907 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4908 "Populated %s from template %s.", arg_directory, arg_template);
4909 }
4910 }
4911
4912 if (arg_start_mode == START_BOOT) {
4913 const char *p;
4914
4915 if (arg_pivot_root_new)
4916 p = prefix_roota(arg_directory, arg_pivot_root_new);
4917 else
4918 p = arg_directory;
4919
4920 if (path_is_os_tree(p) <= 0) {
4921 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4922 r = -EINVAL;
4923 goto finish;
4924 }
4925 } else {
4926 const char *p, *q;
4927
4928 if (arg_pivot_root_new)
4929 p = prefix_roota(arg_directory, arg_pivot_root_new);
4930 else
4931 p = arg_directory;
4932
4933 q = strjoina(p, "/usr/");
4934
4935 if (laccess(q, F_OK) < 0) {
4936 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4937 r = -EINVAL;
4938 goto finish;
4939 }
4940 }
4941
4942 } else {
4943 assert(arg_image);
4944 assert(!arg_template);
4945
4946 r = chase_symlinks_and_update(&arg_image, 0);
4947 if (r < 0)
4948 goto finish;
4949
4950 if (arg_ephemeral) {
4951 _cleanup_free_ char *np = NULL;
4952
4953 r = tempfn_random(arg_image, "machine.", &np);
4954 if (r < 0) {
4955 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4956 goto finish;
4957 }
4958
4959 /* Always take an exclusive lock on our own ephemeral copy. */
4960 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4961 if (r < 0) {
4962 r = log_error_errno(r, "Failed to create image lock: %m");
4963 goto finish;
4964 }
4965
4966 {
4967 BLOCK_SIGNALS(SIGINT);
4968 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
4969 }
4970 if (r == -EINTR) {
4971 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
4972 goto finish;
4973 }
4974 if (r < 0) {
4975 r = log_error_errno(r, "Failed to copy image file: %m");
4976 goto finish;
4977 }
4978
4979 free_and_replace(arg_image, np);
4980 remove_image = true;
4981 } else {
4982 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4983 if (r == -EBUSY) {
4984 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4985 goto finish;
4986 }
4987 if (r < 0) {
4988 r = log_error_errno(r, "Failed to create image lock: %m");
4989 goto finish;
4990 }
4991
4992 if (!arg_root_hash) {
4993 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4994 if (r < 0) {
4995 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4996 goto finish;
4997 }
4998 }
4999 }
5000
5001 if (!mkdtemp(tmprootdir)) {
5002 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5003 goto finish;
5004 }
5005
5006 remove_tmprootdir = true;
5007
5008 arg_directory = strdup(tmprootdir);
5009 if (!arg_directory) {
5010 r = log_oom();
5011 goto finish;
5012 }
5013
5014 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
5015 if (r < 0) {
5016 log_error_errno(r, "Failed to set up loopback block device: %m");
5017 goto finish;
5018 }
5019
5020 r = dissect_image_and_warn(
5021 loop->fd,
5022 arg_image,
5023 arg_root_hash, arg_root_hash_size,
5024 DISSECT_IMAGE_REQUIRE_ROOT,
5025 &dissected_image);
5026 if (r == -ENOPKG) {
5027 /* dissect_image_and_warn() already printed a brief error message. Expand on that with more details. */
5028 log_notice("Note that the disk image needs to\n"
5029 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5030 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5031 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
5032 " d) or contain a file system without a partition table\n"
5033 "in order to be bootable with systemd-nspawn.");
5034 goto finish;
5035 }
5036 if (r < 0)
5037 goto finish;
5038
5039 if (!arg_root_hash && dissected_image->can_verity)
5040 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5041
5042 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
5043 if (r < 0)
5044 goto finish;
5045
5046 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5047 if (remove_image && unlink(arg_image) >= 0)
5048 remove_image = false;
5049 }
5050
5051 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5052 if (r < 0)
5053 goto finish;
5054
5055 if (arg_console_mode < 0)
5056 arg_console_mode =
5057 isatty(STDIN_FILENO) > 0 &&
5058 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5059
5060 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5061 arg_quiet = true;
5062
5063 if (!arg_quiet)
5064 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5065 arg_machine, arg_image ?: arg_directory);
5066
5067 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5068
5069 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5070 r = log_error_errno(errno, "Failed to become subreaper: %m");
5071 goto finish;
5072 }
5073
5074 for (;;) {
5075 r = run_container(dissected_image,
5076 secondary,
5077 fds,
5078 veth_name, &veth_created,
5079 &exposed, &master,
5080 &pid, &ret);
5081 if (r <= 0)
5082 break;
5083 }
5084
5085 finish:
5086 (void) sd_notify(false,
5087 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5088 "STOPPING=1\nSTATUS=Terminating...");
5089
5090 if (pid > 0)
5091 (void) kill(pid, SIGKILL);
5092
5093 /* Try to flush whatever is still queued in the pty */
5094 if (master >= 0) {
5095 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
5096 master = safe_close(master);
5097 }
5098
5099 if (pid > 0)
5100 (void) wait_for_terminate(pid, NULL);
5101
5102 pager_close();
5103
5104 if (remove_directory && arg_directory) {
5105 int k;
5106
5107 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5108 if (k < 0)
5109 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5110 }
5111
5112 if (remove_image && arg_image) {
5113 if (unlink(arg_image) < 0)
5114 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5115 }
5116
5117 if (remove_tmprootdir) {
5118 if (rmdir(tmprootdir) < 0)
5119 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5120 }
5121
5122 if (arg_machine) {
5123 const char *p;
5124
5125 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5126 (void) rm_rf(p, REMOVE_ROOT);
5127 }
5128
5129 expose_port_flush(arg_expose_ports, &exposed);
5130
5131 if (veth_created)
5132 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5133 (void) remove_bridge(arg_network_zone);
5134
5135 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5136 expose_port_free_all(arg_expose_ports);
5137 rlimit_free_all(arg_rlimit);
5138 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5139
5140 if (r < 0)
5141 return r;
5142
5143 return ret;
5144 }
5145
/* Program entry point, wired up to run(). run() returns r when it is negative, and otherwise returns
 * ret, which is initialised to EXIT_SUCCESS and handed to run_container() to fill in.
 * NOTE(review): the macro is defined outside this file chunk; judging by its name it treats positive
 * return values of run() as failure exit codes -- confirm against its definition. */
5146 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);