]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: mangle slice name
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_BLKID
4 #include <blkid.h>
5 #endif
6 #include <errno.h>
7 #include <getopt.h>
8 #include <grp.h>
9 #include <linux/fs.h>
10 #include <linux/loop.h>
11 #include <pwd.h>
12 #include <sched.h>
13 #if HAVE_SELINUX
14 #include <selinux/selinux.h>
15 #endif
16 #include <signal.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/personality.h>
22 #include <sys/prctl.h>
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 #include <unistd.h>
26
27 #include "sd-bus.h"
28 #include "sd-daemon.h"
29 #include "sd-id128.h"
30
31 #include "alloc-util.h"
32 #include "barrier.h"
33 #include "base-filesystem.h"
34 #include "blkid-util.h"
35 #include "btrfs-util.h"
36 #include "bus-error.h"
37 #include "bus-util.h"
38 #include "cap-list.h"
39 #include "capability-util.h"
40 #include "cgroup-util.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "dev-setup.h"
44 #include "dissect-image.h"
45 #include "env-util.h"
46 #include "fd-util.h"
47 #include "fdset.h"
48 #include "fileio.h"
49 #include "format-util.h"
50 #include "fs-util.h"
51 #include "gpt.h"
52 #include "hexdecoct.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "log.h"
56 #include "loop-util.h"
57 #include "loopback-setup.h"
58 #include "machine-image.h"
59 #include "macro.h"
60 #include "main-func.h"
61 #include "missing_sched.h"
62 #include "mkdir.h"
63 #include "mount-util.h"
64 #include "mountpoint-util.h"
65 #include "namespace-util.h"
66 #include "netlink-util.h"
67 #include "nspawn-cgroup.h"
68 #include "nspawn-def.h"
69 #include "nspawn-expose-ports.h"
70 #include "nspawn-mount.h"
71 #include "nspawn-network.h"
72 #include "nspawn-oci.h"
73 #include "nspawn-patch-uid.h"
74 #include "nspawn-register.h"
75 #include "nspawn-seccomp.h"
76 #include "nspawn-settings.h"
77 #include "nspawn-setuid.h"
78 #include "nspawn-stub-pid1.h"
79 #include "nulstr-util.h"
80 #include "os-util.h"
81 #include "pager.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "pretty-print.h"
85 #include "process-util.h"
86 #include "ptyfwd.h"
87 #include "random-util.h"
88 #include "raw-clone.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #if HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "socket-util.h"
97 #include "stat-util.h"
98 #include "stdio-util.h"
99 #include "string-table.h"
100 #include "string-util.h"
101 #include "strv.h"
102 #include "sysctl-util.h"
103 #include "terminal-util.h"
104 #include "tmpfile-util.h"
105 #include "umask-util.h"
106 #include "unit-name.h"
107 #include "user-util.h"
108 #include "util.h"
109
/* Path of a resolv.conf file shipped below the systemd library directory. The directory is /lib on
 * builds with HAVE_SPLIT_USR set and /usr/lib otherwise. */
#if HAVE_SPLIT_USR
#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#else
#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#endif
115
/* nspawn listens on a socket whose path is given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative
 * to the container. The init process inside the container can send messages to nspawn over this
 * socket, following the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): going by the name, presumably a process exit status that requests a forced restart;
 * confirm against the code that produces and consumes it. */
#define EXIT_FORCE_RESTART 133
122
/* Distinguishes two ways a container run can end.
 * NOTE(review): the exact conditions for each value are decided by code outside this excerpt. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED, /* presumably: the container ended for good — confirm at the point of use */
        CONTAINER_REBOOTED,   /* presumably: the container asked to be started again — confirm at the point of use */
} ContainerStatus;
127
/* Program-wide configuration. These file-scope variables hold the defaults and are overwritten by the
 * parsers in this file (parse_environment() and parse_argv(), among others). */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_pivot_root_new = NULL;
static char *arg_pivot_root_old = NULL;
static char *arg_user = NULL;
static uid_t arg_uid = UID_INVALID;
static gid_t arg_gid = GID_INVALID;
static gid_t* arg_supplementary_gids = NULL;
static size_t arg_n_supplementary_gids = 0;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL; /* The name used by the host to refer to this */
static char *arg_hostname = NULL; /* The name the payload sees by default */
/* The two SELinux contexts are assigned straight from optarg in parse_argv(), i.e. they are borrowed
 * pointers rather than allocated copies. */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Bit mask with one bit (1ULL << CAP_xyz) per capability.
 * NOTE(review): presumably the "default set" that --capability= and --drop-capability= adjust,
 * per the help text — confirm where the mask is applied. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
static CustomMount *arg_custom_mounts = NULL;
static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
static char *arg_network_zone = NULL; /* Stored with the "vz-" prefix that parse_argv() prepends */
static char *arg_network_namespace_path = NULL;
static PagerFlags arg_pager_flags = 0;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static char *arg_oci_bundle = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static int arg_kill_signal = 0;
/* Chosen by the detect_unified_cgroup_hierarchy_from_*() functions below */
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit set of SETTING_* flags; the parsers OR in a flag when they set the corresponding option */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
/* May be replaced by the value of $SYSTEMD_NSPAWN_CONTAINER_SERVICE in parse_environment() */
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
/* Recomputed in parse_environment() from cg_ns_supported() and $SYSTEMD_NSPAWN_USE_CGNS */
static bool arg_use_cgns = true;
/* CLONE_NEW* flags; individual bits are cleared or set by parse_share_ns_env() */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
#if HAVE_SECCOMP
static scmp_filter_ctx arg_seccomp = NULL;
#endif
static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static bool arg_no_new_privileges = false;
static int arg_oom_score_adjust = 0;
static bool arg_oom_score_adjust_set = false;
static CPUSet arg_cpu_set = {};
static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
static TimezoneMode arg_timezone = TIMEZONE_AUTO;
static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
static DeviceNode* arg_extra_nodes = NULL;
static size_t arg_n_extra_nodes = 0;
static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
232
/* Pairs each listed setting with the function that releases it (freep for plain allocations,
 * strv_freep for string vectors, dedicated helpers for the rest).
 * NOTE(review): presumably these run when the program exits — see the macro's definition.
 * Not every allocated-looking setting appears here (e.g. arg_custom_mounts, arg_expose_ports,
 * arg_rlimit, arg_extra_nodes); confirm that those are released elsewhere. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
/* The seccomp context only exists when built with seccomp support */
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
264
265 static int handle_arg_console(const char *arg) {
266 if (streq(arg, "help")) {
267 puts("interactive\n"
268 "read-only\n"
269 "passive\n"
270 "pipe");
271 return 0;
272 }
273
274 if (streq(arg, "interactive"))
275 arg_console_mode = CONSOLE_INTERACTIVE;
276 else if (streq(arg, "read-only"))
277 arg_console_mode = CONSOLE_READ_ONLY;
278 else if (streq(arg, "passive"))
279 arg_console_mode = CONSOLE_PASSIVE;
280 else if (streq(arg, "pipe"))
281 arg_console_mode = CONSOLE_PIPE;
282 else
283 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
284
285 arg_settings_mask |= SETTING_CONSOLE_MODE;
286 return 1;
287 }
288
289 static int help(void) {
290 _cleanup_free_ char *link = NULL;
291 int r;
292
293 (void) pager_open(arg_pager_flags);
294
295 r = terminal_urlify_man("systemd-nspawn", "1", &link);
296 if (r < 0)
297 return log_oom();
298
299 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
300 "Spawn a command or OS in a light-weight container.\n\n"
301 " -h --help Show this help\n"
302 " --version Print version string\n"
303 " -q --quiet Do not show status information\n"
304 " --no-pager Do not pipe output into a pager\n"
305 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
306 "%3$sImage:%4$s\n"
307 " -D --directory=PATH Root directory for the container\n"
308 " --template=PATH Initialize root directory from template directory,\n"
309 " if missing\n"
310 " -x --ephemeral Run container with snapshot of root directory, and\n"
311 " remove it after exit\n"
312 " -i --image=PATH Root file system disk image (or device node) for\n"
313 " the container\n"
314 " --oci-bundle=PATH OCI bundle directory\n"
315 " --read-only Mount the root directory read-only\n"
316 " --volatile[=MODE] Run the system in volatile mode\n"
317 " --root-hash=HASH Specify verity root hash for root disk image\n"
318 " --pivot-root=PATH[:PATH]\n"
319 " Pivot root to given directory in the container\n\n"
320 "%3$sExecution:%4$s\n"
321 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
322 " -b --boot Boot up full system (i.e. invoke init)\n"
323 " --chdir=PATH Set working directory in the container\n"
324 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
325 " -u --user=USER Run the command under specified user or UID\n"
326 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
327 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
328 "%3$sSystem Identity:%4$s\n"
329 " -M --machine=NAME Set the machine name for the container\n"
330 " --hostname=NAME Override the hostname for the container\n"
331 " --uuid=UUID Set a specific machine UUID for the container\n\n"
332 "%3$sProperties:%4$s\n"
333 " -S --slice=SLICE Place the container in the specified slice\n"
334 " --property=NAME=VALUE Set scope unit property\n"
335 " --register=BOOLEAN Register container as machine\n"
336 " --keep-unit Do not register a scope for the machine, reuse\n"
337 " the service unit nspawn is running in\n\n"
338 "%3$sUser Namespacing:%4$s\n"
339 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
340 " --private-users[=UIDBASE[:NUIDS]]\n"
341 " Similar, but with user configured UID/GID range\n"
342 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
343 "%3$sNetworking:%4$s\n"
344 " --private-network Disable network in container\n"
345 " --network-interface=INTERFACE\n"
346 " Assign an existing network interface to the\n"
347 " container\n"
348 " --network-macvlan=INTERFACE\n"
349 " Create a macvlan network interface based on an\n"
350 " existing network interface to the container\n"
351 " --network-ipvlan=INTERFACE\n"
352 " Create a ipvlan network interface based on an\n"
353 " existing network interface to the container\n"
354 " -n --network-veth Add a virtual Ethernet connection between host\n"
355 " and container\n"
356 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
357 " Add an additional virtual Ethernet link between\n"
358 " host and container\n"
359 " --network-bridge=INTERFACE\n"
360 " Add a virtual Ethernet connection to the container\n"
361 " and attach it to an existing bridge on the host\n"
362 " --network-zone=NAME Similar, but attach the new interface to an\n"
363 " an automatically managed bridge interface\n"
364 " --network-namespace-path=PATH\n"
365 " Set network namespace to the one represented by\n"
366 " the specified kernel namespace file node\n"
367 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
368 " Expose a container IP port on the host\n\n"
369 "%3$sSecurity:%4$s\n"
370 " --capability=CAP In addition to the default, retain specified\n"
371 " capability\n"
372 " --drop-capability=CAP Drop the specified capability from the default set\n"
373 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
374 " --system-call-filter=LIST|~LIST\n"
375 " Permit/prohibit specific system calls\n"
376 " -Z --selinux-context=SECLABEL\n"
377 " Set the SELinux security context to be used by\n"
378 " processes in the container\n"
379 " -L --selinux-apifs-context=SECLABEL\n"
380 " Set the SELinux security context to be used by\n"
381 " API/tmpfs file systems in the container\n\n"
382 "%3$sResources:%4$s\n"
383 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
384 " --oom-score-adjust=VALUE\n"
385 " Adjust the OOM score value for the payload\n"
386 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
387 " --personality=ARCH Pick personality for this container\n\n"
388 "%3$sIntegration:%4$s\n"
389 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
390 " --timezone=MODE Select mode of /etc/localtime initialization\n"
391 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
392 " host, try-guest, try-host\n"
393 " -j Equivalent to --link-journal=try-guest\n\n"
394 "%3$sMounts:%4$s\n"
395 " --bind=PATH[:PATH[:OPTIONS]]\n"
396 " Bind mount a file or directory from the host into\n"
397 " the container\n"
398 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
399 " Similar, but creates a read-only bind mount\n"
400 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
401 " it\n"
402 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
403 " --overlay=PATH[:PATH...]:PATH\n"
404 " Create an overlay mount from the host to \n"
405 " the container\n"
406 " --overlay-ro=PATH[:PATH...]:PATH\n"
407 " Similar, but creates a read-only overlay mount\n\n"
408 "%3$sInput/Output:%4$s\n"
409 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
410 " set up for the container.\n"
411 " -P --pipe Equivalent to --console=pipe\n"
412 "\nSee the %2$s for details.\n"
413 , program_invocation_short_name
414 , link
415 , ansi_underline(), ansi_normal());
416
417 return 0;
418 }
419
420 static int custom_mount_check_all(void) {
421 size_t i;
422
423 for (i = 0; i < arg_n_custom_mounts; i++) {
424 CustomMount *m = &arg_custom_mounts[i];
425
426 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
427 if (arg_userns_chown)
428 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
429 "--private-users-chown may not be combined with custom root mounts.");
430 else if (arg_uid_shift == UID_INVALID)
431 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
432 "--private-users with automatic UID shift may not be combined with custom root mounts.");
433 }
434 }
435
436 return 0;
437 }
438
439 static int detect_unified_cgroup_hierarchy_from_environment(void) {
440 const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
441 int r;
442
443 /* Allow the user to control whether the unified hierarchy is used */
444
445 e = getenv(var);
446 if (!e) {
447 static bool warned = false;
448
449 var = "UNIFIED_CGROUP_HIERARCHY";
450 e = getenv(var);
451 if (e && !warned) {
452 log_info("$UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY.");
453 warned = true;
454 }
455 }
456
457 if (!isempty(e)) {
458 r = parse_boolean(e);
459 if (r < 0)
460 return log_error_errno(r, "Failed to parse $%s: %m", var);
461 if (r > 0)
462 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
463 else
464 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
465 }
466
467 return 0;
468 }
469
470 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
471 int r;
472
473 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
474 * in the image actually supports. */
475 r = cg_all_unified();
476 if (r < 0)
477 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
478 if (r > 0) {
479 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
480 * routine only detects 231, so we'll have a false negative here for 230. */
481 r = systemd_installation_has_version(directory, 230);
482 if (r < 0)
483 return log_error_errno(r, "Failed to determine systemd version in container: %m");
484 if (r > 0)
485 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
486 else
487 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
488 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
489 /* Mixed cgroup hierarchy support was added in 233 */
490 r = systemd_installation_has_version(directory, 233);
491 if (r < 0)
492 return log_error_errno(r, "Failed to determine systemd version in container: %m");
493 if (r > 0)
494 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
495 else
496 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
497 } else
498 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
499
500 log_debug("Using %s hierarchy for container.",
501 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
502 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
503
504 return 0;
505 }
506
507 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
508 int r;
509
510 r = getenv_bool(name);
511 if (r == -ENXIO)
512 return 0;
513 if (r < 0)
514 return log_error_errno(r, "Failed to parse $%s: %m", name);
515
516 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
517 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
518 return 0;
519 }
520
521 static int parse_mount_settings_env(void) {
522 const char *e;
523 int r;
524
525 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
526 if (r < 0 && r != -ENXIO)
527 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
528 if (r >= 0)
529 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
530
531 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
532 if (streq_ptr(e, "network"))
533 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
534
535 else if (e) {
536 r = parse_boolean(e);
537 if (r < 0)
538 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
539
540 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
541 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
542 }
543
544 return 0;
545 }
546
547 static int parse_environment(void) {
548 const char *e;
549 int r;
550
551 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
552 if (r < 0)
553 return r;
554 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
555 if (r < 0)
556 return r;
557 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
558 if (r < 0)
559 return r;
560 r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
561 if (r < 0)
562 return r;
563
564 r = parse_mount_settings_env();
565 if (r < 0)
566 return r;
567
568 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
569 * even if it is supported. If not supported, it has no effect. */
570 if (!cg_ns_supported())
571 arg_use_cgns = false;
572 else {
573 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
574 if (r < 0) {
575 if (r != -ENXIO)
576 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
577
578 arg_use_cgns = true;
579 } else {
580 arg_use_cgns = r > 0;
581 arg_settings_mask |= SETTING_USE_CGNS;
582 }
583 }
584
585 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
586 if (e)
587 arg_container_service_name = e;
588
589 return detect_unified_cgroup_hierarchy_from_environment();
590 }
591
592 static int parse_argv(int argc, char *argv[]) {
593 enum {
594 ARG_VERSION = 0x100,
595 ARG_PRIVATE_NETWORK,
596 ARG_UUID,
597 ARG_READ_ONLY,
598 ARG_CAPABILITY,
599 ARG_DROP_CAPABILITY,
600 ARG_LINK_JOURNAL,
601 ARG_BIND,
602 ARG_BIND_RO,
603 ARG_TMPFS,
604 ARG_OVERLAY,
605 ARG_OVERLAY_RO,
606 ARG_INACCESSIBLE,
607 ARG_SHARE_SYSTEM,
608 ARG_REGISTER,
609 ARG_KEEP_UNIT,
610 ARG_NETWORK_INTERFACE,
611 ARG_NETWORK_MACVLAN,
612 ARG_NETWORK_IPVLAN,
613 ARG_NETWORK_BRIDGE,
614 ARG_NETWORK_ZONE,
615 ARG_NETWORK_VETH_EXTRA,
616 ARG_NETWORK_NAMESPACE_PATH,
617 ARG_PERSONALITY,
618 ARG_VOLATILE,
619 ARG_TEMPLATE,
620 ARG_PROPERTY,
621 ARG_PRIVATE_USERS,
622 ARG_KILL_SIGNAL,
623 ARG_SETTINGS,
624 ARG_CHDIR,
625 ARG_PIVOT_ROOT,
626 ARG_PRIVATE_USERS_CHOWN,
627 ARG_NOTIFY_READY,
628 ARG_ROOT_HASH,
629 ARG_SYSTEM_CALL_FILTER,
630 ARG_RLIMIT,
631 ARG_HOSTNAME,
632 ARG_NO_NEW_PRIVILEGES,
633 ARG_OOM_SCORE_ADJUST,
634 ARG_CPU_AFFINITY,
635 ARG_RESOLV_CONF,
636 ARG_TIMEZONE,
637 ARG_CONSOLE,
638 ARG_PIPE,
639 ARG_OCI_BUNDLE,
640 ARG_NO_PAGER,
641 };
642
643 static const struct option options[] = {
644 { "help", no_argument, NULL, 'h' },
645 { "version", no_argument, NULL, ARG_VERSION },
646 { "directory", required_argument, NULL, 'D' },
647 { "template", required_argument, NULL, ARG_TEMPLATE },
648 { "ephemeral", no_argument, NULL, 'x' },
649 { "user", required_argument, NULL, 'u' },
650 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
651 { "as-pid2", no_argument, NULL, 'a' },
652 { "boot", no_argument, NULL, 'b' },
653 { "uuid", required_argument, NULL, ARG_UUID },
654 { "read-only", no_argument, NULL, ARG_READ_ONLY },
655 { "capability", required_argument, NULL, ARG_CAPABILITY },
656 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
657 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
658 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
659 { "bind", required_argument, NULL, ARG_BIND },
660 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
661 { "tmpfs", required_argument, NULL, ARG_TMPFS },
662 { "overlay", required_argument, NULL, ARG_OVERLAY },
663 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
664 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
665 { "machine", required_argument, NULL, 'M' },
666 { "hostname", required_argument, NULL, ARG_HOSTNAME },
667 { "slice", required_argument, NULL, 'S' },
668 { "setenv", required_argument, NULL, 'E' },
669 { "selinux-context", required_argument, NULL, 'Z' },
670 { "selinux-apifs-context", required_argument, NULL, 'L' },
671 { "quiet", no_argument, NULL, 'q' },
672 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
673 { "register", required_argument, NULL, ARG_REGISTER },
674 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
675 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
676 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
677 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
678 { "network-veth", no_argument, NULL, 'n' },
679 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
680 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
681 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
682 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
683 { "personality", required_argument, NULL, ARG_PERSONALITY },
684 { "image", required_argument, NULL, 'i' },
685 { "volatile", optional_argument, NULL, ARG_VOLATILE },
686 { "port", required_argument, NULL, 'p' },
687 { "property", required_argument, NULL, ARG_PROPERTY },
688 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
689 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
690 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
691 { "settings", required_argument, NULL, ARG_SETTINGS },
692 { "chdir", required_argument, NULL, ARG_CHDIR },
693 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
694 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
695 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
696 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
697 { "rlimit", required_argument, NULL, ARG_RLIMIT },
698 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
699 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
700 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
701 { "timezone", required_argument, NULL, ARG_TIMEZONE },
702 { "console", required_argument, NULL, ARG_CONSOLE },
703 { "pipe", no_argument, NULL, ARG_PIPE },
704 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
705 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
706 {}
707 };
708
709 int c, r;
710 const char *p;
711 uint64_t plus = 0, minus = 0;
712 bool mask_all_settings = false, mask_no_settings = false;
713
714 assert(argc >= 0);
715 assert(argv);
716
717 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
718 switch (c) {
719
720 case 'h':
721 return help();
722
723 case ARG_VERSION:
724 return version();
725
726 case 'D':
727 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
728 if (r < 0)
729 return r;
730
731 arg_settings_mask |= SETTING_DIRECTORY;
732 break;
733
734 case ARG_TEMPLATE:
735 r = parse_path_argument_and_warn(optarg, false, &arg_template);
736 if (r < 0)
737 return r;
738
739 arg_settings_mask |= SETTING_DIRECTORY;
740 break;
741
742 case 'i':
743 r = parse_path_argument_and_warn(optarg, false, &arg_image);
744 if (r < 0)
745 return r;
746
747 arg_settings_mask |= SETTING_DIRECTORY;
748 break;
749
750 case ARG_OCI_BUNDLE:
751 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
752 if (r < 0)
753 return r;
754
755 break;
756
757 case 'x':
758 arg_ephemeral = true;
759 arg_settings_mask |= SETTING_EPHEMERAL;
760 break;
761
762 case 'u':
763 r = free_and_strdup(&arg_user, optarg);
764 if (r < 0)
765 return log_oom();
766
767 arg_settings_mask |= SETTING_USER;
768 break;
769
770 case ARG_NETWORK_ZONE: {
771 char *j;
772
773 j = strjoin("vz-", optarg);
774 if (!j)
775 return log_oom();
776
777 if (!ifname_valid(j)) {
778 log_error("Network zone name not valid: %s", j);
779 free(j);
780 return -EINVAL;
781 }
782
783 free_and_replace(arg_network_zone, j);
784
785 arg_network_veth = true;
786 arg_private_network = true;
787 arg_settings_mask |= SETTING_NETWORK;
788 break;
789 }
790
791 case ARG_NETWORK_BRIDGE:
792
793 if (!ifname_valid(optarg))
794 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
795 "Bridge interface name not valid: %s", optarg);
796
797 r = free_and_strdup(&arg_network_bridge, optarg);
798 if (r < 0)
799 return log_oom();
800
801 _fallthrough_;
802 case 'n':
803 arg_network_veth = true;
804 arg_private_network = true;
805 arg_settings_mask |= SETTING_NETWORK;
806 break;
807
808 case ARG_NETWORK_VETH_EXTRA:
809 r = veth_extra_parse(&arg_network_veth_extra, optarg);
810 if (r < 0)
811 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
812
813 arg_private_network = true;
814 arg_settings_mask |= SETTING_NETWORK;
815 break;
816
817 case ARG_NETWORK_INTERFACE:
818 if (!ifname_valid(optarg))
819 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
820 "Network interface name not valid: %s", optarg);
821
822 if (strv_extend(&arg_network_interfaces, optarg) < 0)
823 return log_oom();
824
825 arg_private_network = true;
826 arg_settings_mask |= SETTING_NETWORK;
827 break;
828
829 case ARG_NETWORK_MACVLAN:
830
831 if (!ifname_valid(optarg))
832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
833 "MACVLAN network interface name not valid: %s", optarg);
834
835 if (strv_extend(&arg_network_macvlan, optarg) < 0)
836 return log_oom();
837
838 arg_private_network = true;
839 arg_settings_mask |= SETTING_NETWORK;
840 break;
841
842 case ARG_NETWORK_IPVLAN:
843
844 if (!ifname_valid(optarg))
845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
846 "IPVLAN network interface name not valid: %s", optarg);
847
848 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
849 return log_oom();
850
851 _fallthrough_;
852 case ARG_PRIVATE_NETWORK:
853 arg_private_network = true;
854 arg_settings_mask |= SETTING_NETWORK;
855 break;
856
857 case ARG_NETWORK_NAMESPACE_PATH:
858 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
859 if (r < 0)
860 return r;
861
862 arg_settings_mask |= SETTING_NETWORK;
863 break;
864
865 case 'b':
866 if (arg_start_mode == START_PID2)
867 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
868 "--boot and --as-pid2 may not be combined.");
869
870 arg_start_mode = START_BOOT;
871 arg_settings_mask |= SETTING_START_MODE;
872 break;
873
874 case 'a':
875 if (arg_start_mode == START_BOOT)
876 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
877 "--boot and --as-pid2 may not be combined.");
878
879 arg_start_mode = START_PID2;
880 arg_settings_mask |= SETTING_START_MODE;
881 break;
882
883 case ARG_UUID:
884 r = sd_id128_from_string(optarg, &arg_uuid);
885 if (r < 0)
886 return log_error_errno(r, "Invalid UUID: %s", optarg);
887
888 if (sd_id128_is_null(arg_uuid))
889 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
890 "Machine UUID may not be all zeroes.");
891
892 arg_settings_mask |= SETTING_MACHINE_ID;
893 break;
894
895 case 'S': {
896 _cleanup_free_ char *mangled = NULL;
897
898 r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
899 if (r < 0)
900 return log_oom();
901
902 free_and_replace(arg_slice, mangled);
903 arg_settings_mask |= SETTING_SLICE;
904 break;
905 }
906
907 case 'M':
908 if (isempty(optarg))
909 arg_machine = mfree(arg_machine);
910 else {
911 if (!machine_name_is_valid(optarg))
912 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
913 "Invalid machine name: %s", optarg);
914
915 r = free_and_strdup(&arg_machine, optarg);
916 if (r < 0)
917 return log_oom();
918 }
919 break;
920
921 case ARG_HOSTNAME:
922 if (isempty(optarg))
923 arg_hostname = mfree(arg_hostname);
924 else {
925 if (!hostname_is_valid(optarg, false))
926 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
927 "Invalid hostname: %s", optarg);
928
929 r = free_and_strdup(&arg_hostname, optarg);
930 if (r < 0)
931 return log_oom();
932 }
933
934 arg_settings_mask |= SETTING_HOSTNAME;
935 break;
936
937 case 'Z':
938 arg_selinux_context = optarg;
939 break;
940
941 case 'L':
942 arg_selinux_apifs_context = optarg;
943 break;
944
945 case ARG_READ_ONLY:
946 arg_read_only = true;
947 arg_settings_mask |= SETTING_READ_ONLY;
948 break;
949
950 case ARG_CAPABILITY:
951 case ARG_DROP_CAPABILITY: {
952 p = optarg;
953 for (;;) {
954 _cleanup_free_ char *t = NULL;
955
956 r = extract_first_word(&p, &t, ",", 0);
957 if (r < 0)
958 return log_error_errno(r, "Failed to parse capability %s.", t);
959 if (r == 0)
960 break;
961
962 if (streq(t, "all")) {
963 if (c == ARG_CAPABILITY)
964 plus = (uint64_t) -1;
965 else
966 minus = (uint64_t) -1;
967 } else {
968 r = capability_from_name(t);
969 if (r < 0)
970 return log_error_errno(r, "Failed to parse capability %s.", t);
971
972 if (c == ARG_CAPABILITY)
973 plus |= 1ULL << r;
974 else
975 minus |= 1ULL << r;
976 }
977 }
978
979 arg_settings_mask |= SETTING_CAPABILITY;
980 break;
981 }
982
983 case ARG_NO_NEW_PRIVILEGES:
984 r = parse_boolean(optarg);
985 if (r < 0)
986 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
987
988 arg_no_new_privileges = r;
989 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
990 break;
991
992 case 'j':
993 arg_link_journal = LINK_GUEST;
994 arg_link_journal_try = true;
995 arg_settings_mask |= SETTING_LINK_JOURNAL;
996 break;
997
998 case ARG_LINK_JOURNAL:
999 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1000 if (r < 0)
1001 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1002
1003 arg_settings_mask |= SETTING_LINK_JOURNAL;
1004 break;
1005
1006 case ARG_BIND:
1007 case ARG_BIND_RO:
1008 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1009 if (r < 0)
1010 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1011
1012 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1013 break;
1014
1015 case ARG_TMPFS:
1016 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1017 if (r < 0)
1018 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1019
1020 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1021 break;
1022
1023 case ARG_OVERLAY:
1024 case ARG_OVERLAY_RO:
1025 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1026 if (r == -EADDRNOTAVAIL)
1027 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1028 if (r < 0)
1029 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1030
1031 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1032 break;
1033
1034 case ARG_INACCESSIBLE:
1035 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1036 if (r < 0)
1037 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1038
1039 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1040 break;
1041
1042 case 'E': {
1043 char **n;
1044
1045 if (!env_assignment_is_valid(optarg))
1046 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1047 "Environment variable assignment '%s' is not valid.", optarg);
1048
1049 n = strv_env_set(arg_setenv, optarg);
1050 if (!n)
1051 return log_oom();
1052
1053 strv_free_and_replace(arg_setenv, n);
1054 arg_settings_mask |= SETTING_ENVIRONMENT;
1055 break;
1056 }
1057
1058 case 'q':
1059 arg_quiet = true;
1060 break;
1061
1062 case ARG_SHARE_SYSTEM:
1063 /* We don't officially support this anymore, except for compat reasons. People should use the
1064 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1065 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1066 arg_clone_ns_flags = 0;
1067 break;
1068
1069 case ARG_REGISTER:
1070 r = parse_boolean(optarg);
1071 if (r < 0) {
1072 log_error("Failed to parse --register= argument: %s", optarg);
1073 return r;
1074 }
1075
1076 arg_register = r;
1077 break;
1078
1079 case ARG_KEEP_UNIT:
1080 arg_keep_unit = true;
1081 break;
1082
1083 case ARG_PERSONALITY:
1084
1085 arg_personality = personality_from_string(optarg);
1086 if (arg_personality == PERSONALITY_INVALID)
1087 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1088 "Unknown or unsupported personality '%s'.", optarg);
1089
1090 arg_settings_mask |= SETTING_PERSONALITY;
1091 break;
1092
1093 case ARG_VOLATILE:
1094
1095 if (!optarg)
1096 arg_volatile_mode = VOLATILE_YES;
1097 else if (streq(optarg, "help")) {
1098 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1099 return 0;
1100 } else {
1101 VolatileMode m;
1102
1103 m = volatile_mode_from_string(optarg);
1104 if (m < 0)
1105 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1106 "Failed to parse --volatile= argument: %s", optarg);
1107 else
1108 arg_volatile_mode = m;
1109 }
1110
1111 arg_settings_mask |= SETTING_VOLATILE_MODE;
1112 break;
1113
1114 case 'p':
1115 r = expose_port_parse(&arg_expose_ports, optarg);
1116 if (r == -EEXIST)
1117 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1118 if (r < 0)
1119 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1120
1121 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1122 break;
1123
1124 case ARG_PROPERTY:
1125 if (strv_extend(&arg_property, optarg) < 0)
1126 return log_oom();
1127
1128 break;
1129
1130 case ARG_PRIVATE_USERS: {
1131 int boolean = -1;
1132
1133 if (!optarg)
1134 boolean = true;
1135 else if (!in_charset(optarg, DIGITS))
1136 /* do *not* parse numbers as booleans */
1137 boolean = parse_boolean(optarg);
1138
1139 if (boolean == false) {
1140 /* no: User namespacing off */
1141 arg_userns_mode = USER_NAMESPACE_NO;
1142 arg_uid_shift = UID_INVALID;
1143 arg_uid_range = UINT32_C(0x10000);
1144 } else if (boolean == true) {
1145 /* yes: User namespacing on, UID range is read from root dir */
1146 arg_userns_mode = USER_NAMESPACE_FIXED;
1147 arg_uid_shift = UID_INVALID;
1148 arg_uid_range = UINT32_C(0x10000);
1149 } else if (streq(optarg, "pick")) {
1150 /* pick: User namespacing on, UID range is picked randomly */
1151 arg_userns_mode = USER_NAMESPACE_PICK;
1152 arg_uid_shift = UID_INVALID;
1153 arg_uid_range = UINT32_C(0x10000);
1154 } else {
1155 _cleanup_free_ char *buffer = NULL;
1156 const char *range, *shift;
1157
1158 /* anything else: User namespacing on, UID range is explicitly configured */
1159
1160 range = strchr(optarg, ':');
1161 if (range) {
1162 buffer = strndup(optarg, range - optarg);
1163 if (!buffer)
1164 return log_oom();
1165 shift = buffer;
1166
1167 range++;
1168 r = safe_atou32(range, &arg_uid_range);
1169 if (r < 0)
1170 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1171 } else
1172 shift = optarg;
1173
1174 r = parse_uid(shift, &arg_uid_shift);
1175 if (r < 0)
1176 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1177
1178 arg_userns_mode = USER_NAMESPACE_FIXED;
1179 }
1180
1181 if (arg_uid_range <= 0)
1182 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1183 "UID range cannot be 0.");
1184
1185 arg_settings_mask |= SETTING_USERNS;
1186 break;
1187 }
1188
1189 case 'U':
1190 if (userns_supported()) {
1191 arg_userns_mode = USER_NAMESPACE_PICK;
1192 arg_uid_shift = UID_INVALID;
1193 arg_uid_range = UINT32_C(0x10000);
1194
1195 arg_settings_mask |= SETTING_USERNS;
1196 }
1197
1198 break;
1199
1200 case ARG_PRIVATE_USERS_CHOWN:
1201 arg_userns_chown = true;
1202
1203 arg_settings_mask |= SETTING_USERNS;
1204 break;
1205
1206 case ARG_KILL_SIGNAL:
1207 if (streq(optarg, "help")) {
1208 DUMP_STRING_TABLE(signal, int, _NSIG);
1209 return 0;
1210 }
1211
1212 arg_kill_signal = signal_from_string(optarg);
1213 if (arg_kill_signal < 0)
1214 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1215 "Cannot parse signal: %s", optarg);
1216
1217 arg_settings_mask |= SETTING_KILL_SIGNAL;
1218 break;
1219
1220 case ARG_SETTINGS:
1221
1222 /* no → do not read files
1223 * yes → read files, do not override cmdline, trust only subset
1224 * override → read files, override cmdline, trust only subset
1225 * trusted → read files, do not override cmdline, trust all
1226 */
1227
1228 r = parse_boolean(optarg);
1229 if (r < 0) {
1230 if (streq(optarg, "trusted")) {
1231 mask_all_settings = false;
1232 mask_no_settings = false;
1233 arg_settings_trusted = true;
1234
1235 } else if (streq(optarg, "override")) {
1236 mask_all_settings = false;
1237 mask_no_settings = true;
1238 arg_settings_trusted = -1;
1239 } else
1240 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1241 } else if (r > 0) {
1242 /* yes */
1243 mask_all_settings = false;
1244 mask_no_settings = false;
1245 arg_settings_trusted = -1;
1246 } else {
1247 /* no */
1248 mask_all_settings = true;
1249 mask_no_settings = false;
1250 arg_settings_trusted = false;
1251 }
1252
1253 break;
1254
1255 case ARG_CHDIR:
1256 if (!path_is_absolute(optarg))
1257 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1258 "Working directory %s is not an absolute path.", optarg);
1259
1260 r = free_and_strdup(&arg_chdir, optarg);
1261 if (r < 0)
1262 return log_oom();
1263
1264 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1265 break;
1266
1267 case ARG_PIVOT_ROOT:
1268 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1269 if (r < 0)
1270 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1271
1272 arg_settings_mask |= SETTING_PIVOT_ROOT;
1273 break;
1274
1275 case ARG_NOTIFY_READY:
1276 r = parse_boolean(optarg);
1277 if (r < 0)
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1279 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1280 arg_notify_ready = r;
1281 arg_settings_mask |= SETTING_NOTIFY_READY;
1282 break;
1283
1284 case ARG_ROOT_HASH: {
1285 void *k;
1286 size_t l;
1287
1288 r = unhexmem(optarg, strlen(optarg), &k, &l);
1289 if (r < 0)
1290 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1291 if (l < sizeof(sd_id128_t)) {
1292 free(k);
1293 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1294 }
1295
1296 free(arg_root_hash);
1297 arg_root_hash = k;
1298 arg_root_hash_size = l;
1299 break;
1300 }
1301
1302 case ARG_SYSTEM_CALL_FILTER: {
1303 bool negative;
1304 const char *items;
1305
1306 negative = optarg[0] == '~';
1307 items = negative ? optarg + 1 : optarg;
1308
1309 for (;;) {
1310 _cleanup_free_ char *word = NULL;
1311
1312 r = extract_first_word(&items, &word, NULL, 0);
1313 if (r == 0)
1314 break;
1315 if (r == -ENOMEM)
1316 return log_oom();
1317 if (r < 0)
1318 return log_error_errno(r, "Failed to parse system call filter: %m");
1319
1320 if (negative)
1321 r = strv_extend(&arg_syscall_blacklist, word);
1322 else
1323 r = strv_extend(&arg_syscall_whitelist, word);
1324 if (r < 0)
1325 return log_oom();
1326 }
1327
1328 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1329 break;
1330 }
1331
1332 case ARG_RLIMIT: {
1333 const char *eq;
1334 _cleanup_free_ char *name = NULL;
1335 int rl;
1336
1337 if (streq(optarg, "help")) {
1338 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1339 return 0;
1340 }
1341
1342 eq = strchr(optarg, '=');
1343 if (!eq)
1344 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1345 "--rlimit= expects an '=' assignment.");
1346
1347 name = strndup(optarg, eq - optarg);
1348 if (!name)
1349 return log_oom();
1350
1351 rl = rlimit_from_string_harder(name);
1352 if (rl < 0)
1353 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1354 "Unknown resource limit: %s", name);
1355
1356 if (!arg_rlimit[rl]) {
1357 arg_rlimit[rl] = new0(struct rlimit, 1);
1358 if (!arg_rlimit[rl])
1359 return log_oom();
1360 }
1361
1362 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1363 if (r < 0)
1364 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1365
1366 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1367 break;
1368 }
1369
1370 case ARG_OOM_SCORE_ADJUST:
1371 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1372 if (r < 0)
1373 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1374
1375 arg_oom_score_adjust_set = true;
1376 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1377 break;
1378
1379 case ARG_CPU_AFFINITY: {
1380 CPUSet cpuset;
1381
1382 r = parse_cpu_set(optarg, &cpuset);
1383 if (r < 0)
1384 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1385
1386 cpu_set_reset(&arg_cpu_set);
1387 arg_cpu_set = cpuset;
1388 arg_settings_mask |= SETTING_CPU_AFFINITY;
1389 break;
1390 }
1391
1392 case ARG_RESOLV_CONF:
1393 if (streq(optarg, "help")) {
1394 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1395 return 0;
1396 }
1397
1398 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1399 if (arg_resolv_conf < 0)
1400 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1401 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1402
1403 arg_settings_mask |= SETTING_RESOLV_CONF;
1404 break;
1405
1406 case ARG_TIMEZONE:
1407 if (streq(optarg, "help")) {
1408 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1409 return 0;
1410 }
1411
1412 arg_timezone = timezone_mode_from_string(optarg);
1413 if (arg_timezone < 0)
1414 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1415 "Failed to parse /etc/localtime mode: %s", optarg);
1416
1417 arg_settings_mask |= SETTING_TIMEZONE;
1418 break;
1419
1420 case ARG_CONSOLE:
1421 r = handle_arg_console(optarg);
1422 if (r <= 0)
1423 return r;
1424 break;
1425
1426 case 'P':
1427 case ARG_PIPE:
1428 r = handle_arg_console("pipe");
1429 if (r <= 0)
1430 return r;
1431 break;
1432
1433 case ARG_NO_PAGER:
1434 arg_pager_flags |= PAGER_DISABLE;
1435 break;
1436
1437 case '?':
1438 return -EINVAL;
1439
1440 default:
1441 assert_not_reached("Unhandled option");
1442 }
1443
1444 if (argc > optind) {
1445 strv_free(arg_parameters);
1446 arg_parameters = strv_copy(argv + optind);
1447 if (!arg_parameters)
1448 return log_oom();
1449
1450 arg_settings_mask |= SETTING_START_MODE;
1451 }
1452
1453 if (arg_ephemeral && arg_template && !arg_directory)
1454 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1455 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1456 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1457 * --directory=". */
1458 arg_directory = TAKE_PTR(arg_template);
1459
1460 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1461
1462 /* Make sure to parse environment before we reset the settings mask below */
1463 r = parse_environment();
1464 if (r < 0)
1465 return r;
1466
1467 /* Load all settings from .nspawn files */
1468 if (mask_no_settings)
1469 arg_settings_mask = 0;
1470
1471 /* Don't load any settings from .nspawn files */
1472 if (mask_all_settings)
1473 arg_settings_mask = _SETTINGS_MASK_ALL;
1474
1475 return 1;
1476 }
1477
1478 static int verify_arguments(void) {
1479 int r;
1480
1481 if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1482 /* If we are running the stub init in the container, we don't need to look at what the init
1483 * in the container supports, because we are not using it. Let's immediately pick the right
1484 * setting based on the host system configuration.
1485 *
1486 * We only do this, if the user didn't use an environment variable to override the detection.
1487 */
1488
1489 r = cg_all_unified();
1490 if (r < 0)
1491 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1492 if (r > 0)
1493 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1494 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1495 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1496 else
1497 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1498 }
1499
1500 if (arg_userns_mode != USER_NAMESPACE_NO)
1501 arg_mount_settings |= MOUNT_USE_USERNS;
1502
1503 if (arg_private_network)
1504 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1505
1506 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1507 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1508 arg_register = false;
1509 if (arg_start_mode != START_PID1)
1510 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1511 }
1512
1513 if (arg_userns_mode == USER_NAMESPACE_PICK)
1514 arg_userns_chown = true;
1515
1516 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1517 arg_kill_signal = SIGRTMIN+3;
1518
1519 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1520 arg_read_only = true;
1521
1522 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1523 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1524 * The latter is not technically a user session, but we don't need to labour the point. */
1525 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1526
1527 if (arg_directory && arg_image)
1528 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1529
1530 if (arg_template && arg_image)
1531 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1532
1533 if (arg_template && !(arg_directory || arg_machine))
1534 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1535
1536 if (arg_ephemeral && arg_template)
1537 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1538
1539 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1540 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1541
1542 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1543 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1544
1545 if (arg_userns_chown && arg_read_only)
1546 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1547 "--read-only and --private-users-chown may not be combined.");
1548
1549 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1550 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1551 * copy-up (in case of overlay) making the entire exercise pointless. */
1552 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1553 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1554
1555 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1556 * to avoid conflicts between different network options. */
1557 if (arg_network_namespace_path &&
1558 (arg_network_interfaces || arg_network_macvlan ||
1559 arg_network_ipvlan || arg_network_veth_extra ||
1560 arg_network_bridge || arg_network_zone ||
1561 arg_network_veth || arg_private_network))
1562 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1563
1564 if (arg_network_bridge && arg_network_zone)
1565 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1566 "--network-bridge= and --network-zone= may not be combined.");
1567
1568 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1569 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1570
1571 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1572 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1573
1574 if (arg_expose_ports && !arg_private_network)
1575 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1576
1577 #if ! HAVE_LIBIPTC
1578 if (arg_expose_ports)
1579 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1580 #endif
1581
1582 r = custom_mount_check_all();
1583 if (r < 0)
1584 return r;
1585
1586 return 0;
1587 }
1588
1589 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1590 assert(p);
1591
1592 if (arg_userns_mode == USER_NAMESPACE_NO)
1593 return 0;
1594
1595 if (uid == UID_INVALID && gid == GID_INVALID)
1596 return 0;
1597
1598 if (uid != UID_INVALID) {
1599 uid += arg_uid_shift;
1600
1601 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1602 return -EOVERFLOW;
1603 }
1604
1605 if (gid != GID_INVALID) {
1606 gid += (gid_t) arg_uid_shift;
1607
1608 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1609 return -EOVERFLOW;
1610 }
1611
1612 if (lchown(p, uid, gid) < 0)
1613 return -errno;
1614
1615 return 0;
1616 }
1617
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands it
 * to userns_lchown() for the given UID/GID. A directory that already exists is left untouched and
 * treated as success.
 *
 * Returns 0 if the directory already existed, the result of userns_lchown() if it was created, or the
 * negative error from mkdir_errno_wrapper() otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Pre-existing directories are fine, but we don't touch their ownership */
        return r == -EEXIST ? 0 : r;
}
1631
/* Matches 'path' against the two zoneinfo directory prefixes we recognize for /etc/localtime symlink
 * targets (relative and absolute) and returns whatever PATH_STARTSWITH_SET() yields for them. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1638
1639 static bool etc_writable(void) {
1640 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1641 }
1642
1643 static int setup_timezone(const char *dest) {
1644 _cleanup_free_ char *p = NULL, *etc = NULL;
1645 const char *where, *check;
1646 TimezoneMode m;
1647 int r;
1648
1649 assert(dest);
1650
1651 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1652 r = readlink_malloc("/etc/localtime", &p);
1653 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1654 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1655 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1656 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1657 else if (r < 0) {
1658 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1659 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1660 * file.
1661 *
1662 * Example:
1663 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1664 */
1665 return 0;
1666 } else if (arg_timezone == TIMEZONE_AUTO)
1667 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1668 else
1669 m = arg_timezone;
1670 } else
1671 m = arg_timezone;
1672
1673 if (m == TIMEZONE_OFF)
1674 return 0;
1675
1676 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1677 if (r < 0) {
1678 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1679 return 0;
1680 }
1681
1682 where = strjoina(etc, "/localtime");
1683
1684 switch (m) {
1685
1686 case TIMEZONE_DELETE:
1687 if (unlink(where) < 0)
1688 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1689
1690 return 0;
1691
1692 case TIMEZONE_SYMLINK: {
1693 _cleanup_free_ char *q = NULL;
1694 const char *z, *what;
1695
1696 z = timezone_from_path(p);
1697 if (!z) {
1698 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1699 return 0;
1700 }
1701
1702 r = readlink_malloc(where, &q);
1703 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1704 return 0; /* Already pointing to the right place? Then do nothing .. */
1705
1706 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1707 r = chase_symlinks(check, dest, 0, NULL, NULL);
1708 if (r < 0)
1709 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1710 else {
1711 if (unlink(where) < 0 && errno != ENOENT) {
1712 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1713 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1714 return 0;
1715 }
1716
1717 what = strjoina("../usr/share/zoneinfo/", z);
1718 if (symlink(what, where) < 0) {
1719 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1720 errno, "Failed to correct timezone of container, ignoring: %m");
1721 return 0;
1722 }
1723
1724 break;
1725 }
1726
1727 _fallthrough_;
1728 }
1729
1730 case TIMEZONE_BIND: {
1731 _cleanup_free_ char *resolved = NULL;
1732 int found;
1733
1734 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1735 if (found < 0) {
1736 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1737 return 0;
1738 }
1739
1740 if (found == 0) /* missing? */
1741 (void) touch(resolved);
1742
1743 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1744 if (r >= 0)
1745 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1746
1747 _fallthrough_;
1748 }
1749
1750 case TIMEZONE_COPY:
1751 /* If mounting failed, try to copy */
1752 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1753 if (r < 0) {
1754 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1755 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1756 return 0;
1757 }
1758
1759 break;
1760
1761 default:
1762 assert_not_reached("unexpected mode");
1763 }
1764
1765 /* Fix permissions of the symlink or file copy we just created */
1766 r = userns_lchown(where, 0, 0);
1767 if (r < 0)
1768 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1769
1770 return 0;
1771 }
1772
/* Checks via access(F_OK) whether 'path' exists.
 *
 * Returns 1 if it does, 0 if it is missing (ENOENT), and the negative errno (logged at debug level)
 * for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1785
1786 static int resolved_listening(void) {
1787 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1788 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1789 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1790 int r;
1791
1792 /* Check if resolved is listening */
1793
1794 r = sd_bus_open_system(&bus);
1795 if (r < 0)
1796 return log_debug_errno(r, "Failed to open system bus: %m");
1797
1798 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1799 if (r < 0)
1800 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1801 if (r == 0)
1802 return 0;
1803
1804 r = sd_bus_get_property_string(bus,
1805 "org.freedesktop.resolve1",
1806 "/org/freedesktop/resolve1",
1807 "org.freedesktop.resolve1.Manager",
1808 "DNSStubListener",
1809 &error,
1810 &dns_stub_listener_mode);
1811 if (r < 0)
1812 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1813
1814 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1815 }
1816
1817 static int setup_resolv_conf(const char *dest) {
1818 _cleanup_free_ char *etc = NULL;
1819 const char *where, *what;
1820 ResolvConfMode m;
1821 int r;
1822
1823 assert(dest);
1824
1825 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1826 if (arg_private_network)
1827 m = RESOLV_CONF_OFF;
1828 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1829 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
1830 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1831 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
1832 else
1833 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
1834 } else
1835 m = arg_resolv_conf;
1836
1837 if (m == RESOLV_CONF_OFF)
1838 return 0;
1839
1840 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
1841 if (r < 0) {
1842 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1843 return 0;
1844 }
1845
1846 where = strjoina(etc, "/resolv.conf");
1847
1848 if (m == RESOLV_CONF_DELETE) {
1849 if (unlink(where) < 0)
1850 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1851
1852 return 0;
1853 }
1854
1855 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1856 what = STATIC_RESOLV_CONF;
1857 else
1858 what = "/etc/resolv.conf";
1859
1860 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1861 _cleanup_free_ char *resolved = NULL;
1862 int found;
1863
1864 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
1865 if (found < 0) {
1866 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1867 return 0;
1868 }
1869
1870 if (found == 0) /* missing? */
1871 (void) touch(resolved);
1872
1873 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1874 if (r >= 0)
1875 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1876 }
1877
1878 /* If that didn't work, let's copy the file */
1879 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
1880 if (r < 0) {
1881 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1882 * resolved or something similar runs inside and the symlink points there.
1883 *
1884 * If the disk image is read-only, there's also no point in complaining.
1885 */
1886 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1887 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1888 return 0;
1889 }
1890
1891 r = userns_lchown(where, 0, 0);
1892 if (r < 0)
1893 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1894
1895 return 0;
1896 }
1897
1898 static int setup_boot_id(void) {
1899 _cleanup_(unlink_and_freep) char *from = NULL;
1900 _cleanup_free_ char *path = NULL;
1901 sd_id128_t rnd = SD_ID128_NULL;
1902 const char *to;
1903 int r;
1904
1905 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
1906
1907 r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
1908 if (r < 0)
1909 return log_error_errno(r, "Failed to generate random boot ID path: %m");
1910
1911 r = sd_id128_randomize(&rnd);
1912 if (r < 0)
1913 return log_error_errno(r, "Failed to generate random boot id: %m");
1914
1915 r = id128_write(path, ID128_UUID, rnd, false);
1916 if (r < 0)
1917 return log_error_errno(r, "Failed to write boot id: %m");
1918
1919 from = TAKE_PTR(path);
1920 to = "/proc/sys/kernel/random/boot_id";
1921
1922 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1923 if (r < 0)
1924 return r;
1925
1926 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1927 }
1928
1929 static int copy_devnodes(const char *dest) {
1930 static const char devnodes[] =
1931 "null\0"
1932 "zero\0"
1933 "full\0"
1934 "random\0"
1935 "urandom\0"
1936 "tty\0"
1937 "net/tun\0";
1938
1939 _cleanup_umask_ mode_t u;
1940 const char *d;
1941 int r = 0;
1942
1943 assert(dest);
1944
1945 u = umask(0000);
1946
1947 /* Create /dev/net, so that we can create /dev/net/tun in it */
1948 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1949 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1950
1951 NULSTR_FOREACH(d, devnodes) {
1952 _cleanup_free_ char *from = NULL, *to = NULL;
1953 struct stat st;
1954
1955 from = path_join("/dev/", d);
1956 if (!from)
1957 return log_oom();
1958
1959 to = path_join(dest, from);
1960 if (!to)
1961 return log_oom();
1962
1963 if (stat(from, &st) < 0) {
1964
1965 if (errno != ENOENT)
1966 return log_error_errno(errno, "Failed to stat %s: %m", from);
1967
1968 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1969 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1970 "%s is not a char or block device, cannot copy.", from);
1971 else {
1972 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1973
1974 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1975 /* Explicitly warn the user when /dev is already populated. */
1976 if (errno == EEXIST)
1977 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1978 if (errno != EPERM)
1979 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1980
1981 /* Some systems abusively restrict mknod but allow bind mounts. */
1982 r = touch(to);
1983 if (r < 0)
1984 return log_error_errno(r, "touch (%s) failed: %m", to);
1985 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1986 if (r < 0)
1987 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1988 }
1989
1990 r = userns_lchown(to, 0, 0);
1991 if (r < 0)
1992 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1993
1994 dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
1995 if (!dn)
1996 return log_oom();
1997
1998 r = userns_mkdir(dest, dn, 0755, 0, 0);
1999 if (r < 0)
2000 return log_error_errno(r, "Failed to create '%s': %m", dn);
2001
2002 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2003 return log_oom();
2004
2005 prefixed = path_join(dest, sl);
2006 if (!prefixed)
2007 return log_oom();
2008
2009 t = path_join("..", d);
2010 if (!t)
2011 return log_oom();
2012
2013 if (symlink(t, prefixed) < 0)
2014 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2015 }
2016 }
2017
2018 return r;
2019 }
2020
2021 static int make_extra_nodes(const char *dest) {
2022 _cleanup_umask_ mode_t u;
2023 size_t i;
2024 int r;
2025
2026 u = umask(0000);
2027
2028 for (i = 0; i < arg_n_extra_nodes; i++) {
2029 _cleanup_free_ char *path = NULL;
2030 DeviceNode *n = arg_extra_nodes + i;
2031
2032 path = path_join(dest, n->path);
2033 if (!path)
2034 return log_oom();
2035
2036 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2037 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2038
2039 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2040 if (r < 0)
2041 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2042 }
2043
2044 return 0;
2045 }
2046
2047 static int setup_pts(const char *dest) {
2048 _cleanup_free_ char *options = NULL;
2049 const char *p;
2050 int r;
2051
2052 #if HAVE_SELINUX
2053 if (arg_selinux_apifs_context)
2054 (void) asprintf(&options,
2055 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2056 arg_uid_shift + TTY_GID,
2057 arg_selinux_apifs_context);
2058 else
2059 #endif
2060 (void) asprintf(&options,
2061 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2062 arg_uid_shift + TTY_GID);
2063
2064 if (!options)
2065 return log_oom();
2066
2067 /* Mount /dev/pts itself */
2068 p = prefix_roota(dest, "/dev/pts");
2069 r = mkdir_errno_wrapper(p, 0755);
2070 if (r < 0)
2071 return log_error_errno(r, "Failed to create /dev/pts: %m");
2072
2073 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2074 if (r < 0)
2075 return r;
2076 r = userns_lchown(p, 0, 0);
2077 if (r < 0)
2078 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2079
2080 /* Create /dev/ptmx symlink */
2081 p = prefix_roota(dest, "/dev/ptmx");
2082 if (symlink("pts/ptmx", p) < 0)
2083 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2084 r = userns_lchown(p, 0, 0);
2085 if (r < 0)
2086 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2087
2088 /* And fix /dev/pts/ptmx ownership */
2089 p = prefix_roota(dest, "/dev/pts/ptmx");
2090 r = userns_lchown(p, 0, 0);
2091 if (r < 0)
2092 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2093
2094 return 0;
2095 }
2096
/* Opens /dev/console and installs it as stdin, stdout and stderr of the calling process, after
 * duplicating the current console for logging via log_dup_console(). Returns 0 on success, a
 * negative errno-style value on failure. */
static int setup_stdio_as_dev_console(void) {
        int console_fd, r;

        console_fd = open_terminal("/dev/console", O_RDWR);
        if (console_fd < 0)
                return log_error_errno(console_fd, "Failed to open console: %m");

        /* Logging must keep going to the original stderr even once stderr has been redirected */
        r = log_dup_console();
        if (r < 0)
                return log_error_errno(r, "Failed to duplicate stderr: %m");

        /* rearrange_stdio() invalidates the fd passed in, both on success and on failure */
        r = rearrange_stdio(console_fd, console_fd, console_fd);
        if (r < 0)
                return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");

        return 0;
}
2118
2119 static int setup_dev_console(const char *console) {
2120 _cleanup_free_ char *p = NULL;
2121 int r;
2122
2123 /* Create /dev/console symlink */
2124 r = path_make_relative("/dev", console, &p);
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to create relative path: %m");
2127
2128 if (symlink(p, "/dev/console") < 0)
2129 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2130
2131 return 0;
2132 }
2133
2134 static int setup_keyring(void) {
2135 key_serial_t keyring;
2136
2137 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2138 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2139 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2140 * these system calls let's make sure we don't leak anything into the container. */
2141
2142 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2143 if (keyring == -1) {
2144 if (errno == ENOSYS)
2145 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2146 else if (IN_SET(errno, EACCES, EPERM))
2147 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2148 else
2149 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2150 }
2151
2152 return 0;
2153 }
2154
2155 static int setup_kmsg(int kmsg_socket) {
2156 _cleanup_(unlink_and_freep) char *from = NULL;
2157 _cleanup_free_ char *fifo = NULL;
2158 _cleanup_close_ int fd = -1;
2159 _cleanup_umask_ mode_t u;
2160 int r;
2161
2162 assert(kmsg_socket >= 0);
2163
2164 u = umask(0000);
2165
2166 /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2167 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2168 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2169 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2170
2171 r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2172 if (r < 0)
2173 return log_error_errno(r, "Failed to generate kmsg path: %m");
2174
2175 if (mkfifo(fifo, 0600) < 0)
2176 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2177
2178 from = TAKE_PTR(fifo);
2179
2180 r = mount_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2181 if (r < 0)
2182 return r;
2183
2184 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2185 if (fd < 0)
2186 return log_error_errno(errno, "Failed to open fifo: %m");
2187
2188 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2189 r = send_one_fd(kmsg_socket, fd, 0);
2190 if (r < 0)
2191 return log_error_errno(r, "Failed to send FIFO fd: %m");
2192
2193 return 0;
2194 }
2195
2196 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2197 union in_addr_union *exposed = userdata;
2198
2199 assert(rtnl);
2200 assert(m);
2201 assert(exposed);
2202
2203 expose_port_execute(rtnl, arg_expose_ports, exposed);
2204 return 0;
2205 }
2206
2207 static int setup_hostname(void) {
2208 int r;
2209
2210 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2211 return 0;
2212
2213 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to set hostname: %m");
2216
2217 return 0;
2218 }
2219
2220 static int setup_journal(const char *directory) {
2221 _cleanup_free_ char *d = NULL;
2222 const char *dirname, *p, *q;
2223 sd_id128_t this_id;
2224 char id[33];
2225 bool try;
2226 int r;
2227
2228 /* Don't link journals in ephemeral mode */
2229 if (arg_ephemeral)
2230 return 0;
2231
2232 if (arg_link_journal == LINK_NO)
2233 return 0;
2234
2235 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2236
2237 r = sd_id128_get_machine(&this_id);
2238 if (r < 0)
2239 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2240
2241 if (sd_id128_equal(arg_uuid, this_id)) {
2242 log_full(try ? LOG_WARNING : LOG_ERR,
2243 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2244 if (try)
2245 return 0;
2246 return -EEXIST;
2247 }
2248
2249 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2250 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2251 if (r < 0) {
2252 bool ignore = r == -EROFS && try;
2253 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2254 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2255 return ignore ? 0 : r;
2256 }
2257 }
2258
2259 (void) sd_id128_to_string(arg_uuid, id);
2260
2261 p = strjoina("/var/log/journal/", id);
2262 q = prefix_roota(directory, p);
2263
2264 if (path_is_mount_point(p, NULL, 0) > 0) {
2265 if (try)
2266 return 0;
2267
2268 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2269 "%s: already a mount point, refusing to use for journal", p);
2270 }
2271
2272 if (path_is_mount_point(q, NULL, 0) > 0) {
2273 if (try)
2274 return 0;
2275
2276 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2277 "%s: already a mount point, refusing to use for journal", q);
2278 }
2279
2280 r = readlink_and_make_absolute(p, &d);
2281 if (r >= 0) {
2282 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2283 path_equal(d, q)) {
2284
2285 r = userns_mkdir(directory, p, 0755, 0, 0);
2286 if (r < 0)
2287 log_warning_errno(r, "Failed to create directory %s: %m", q);
2288 return 0;
2289 }
2290
2291 if (unlink(p) < 0)
2292 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2293 } else if (r == -EINVAL) {
2294
2295 if (arg_link_journal == LINK_GUEST &&
2296 rmdir(p) < 0) {
2297
2298 if (errno == ENOTDIR) {
2299 log_error("%s already exists and is neither a symlink nor a directory", p);
2300 return r;
2301 } else
2302 return log_error_errno(errno, "Failed to remove %s: %m", p);
2303 }
2304 } else if (r != -ENOENT)
2305 return log_error_errno(r, "readlink(%s) failed: %m", p);
2306
2307 if (arg_link_journal == LINK_GUEST) {
2308
2309 if (symlink(q, p) < 0) {
2310 if (try) {
2311 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2312 return 0;
2313 } else
2314 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2315 }
2316
2317 r = userns_mkdir(directory, p, 0755, 0, 0);
2318 if (r < 0)
2319 log_warning_errno(r, "Failed to create directory %s: %m", q);
2320 return 0;
2321 }
2322
2323 if (arg_link_journal == LINK_HOST) {
2324 /* don't create parents here — if the host doesn't have
2325 * permanent journal set up, don't force it here */
2326
2327 r = mkdir_errno_wrapper(p, 0755);
2328 if (r < 0 && r != -EEXIST) {
2329 if (try) {
2330 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2331 return 0;
2332 } else
2333 return log_error_errno(r, "Failed to create %s: %m", p);
2334 }
2335
2336 } else if (access(p, F_OK) < 0)
2337 return 0;
2338
2339 if (dir_is_empty(q) == 0)
2340 log_warning("%s is not empty, proceeding anyway.", q);
2341
2342 r = userns_mkdir(directory, p, 0755, 0, 0);
2343 if (r < 0)
2344 return log_error_errno(r, "Failed to create %s: %m", q);
2345
2346 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2347 if (r < 0)
2348 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2349
2350 return 0;
2351 }
2352
2353 static int drop_capabilities(uid_t uid) {
2354 CapabilityQuintet q;
2355
2356 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2357 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2358 * arg_caps_retain. */
2359
2360 if (capability_quintet_is_set(&arg_full_capabilities)) {
2361 q = arg_full_capabilities;
2362
2363 if (q.bounding == (uint64_t) -1)
2364 q.bounding = uid == 0 ? arg_caps_retain : 0;
2365
2366 if (q.effective == (uint64_t) -1)
2367 q.effective = uid == 0 ? q.bounding : 0;
2368
2369 if (q.inheritable == (uint64_t) -1)
2370 q.inheritable = uid == 0 ? q.bounding : 0;
2371
2372 if (q.permitted == (uint64_t) -1)
2373 q.permitted = uid == 0 ? q.bounding : 0;
2374
2375 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2376 q.ambient = 0;
2377
2378 if (capability_quintet_mangle(&q))
2379 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2380
2381 } else {
2382 q = (CapabilityQuintet) {
2383 .bounding = arg_caps_retain,
2384 .effective = uid == 0 ? arg_caps_retain : 0,
2385 .inheritable = uid == 0 ? arg_caps_retain : 0,
2386 .permitted = uid == 0 ? arg_caps_retain : 0,
2387 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2388 };
2389
2390 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2391 * in order to maintain the same behavior as systemd < 242. */
2392 if (capability_quintet_mangle(&q))
2393 log_warning("Some capabilities will not be set because they are not in the current bounding set.");
2394
2395 }
2396
2397 return capability_quintet_enforce(&q);
2398 }
2399
2400 static int reset_audit_loginuid(void) {
2401 _cleanup_free_ char *p = NULL;
2402 int r;
2403
2404 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2405 return 0;
2406
2407 r = read_one_line_file("/proc/self/loginuid", &p);
2408 if (r == -ENOENT)
2409 return 0;
2410 if (r < 0)
2411 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2412
2413 /* Already reset? */
2414 if (streq(p, "4294967295"))
2415 return 0;
2416
2417 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2418 if (r < 0) {
2419 log_error_errno(r,
2420 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2421 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2422 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2423 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2424 "using systemd-nspawn. Sleeping for 5s... (%m)");
2425
2426 sleep(5);
2427 }
2428
2429 return 0;
2430 }
2431
2432 static int setup_propagate(const char *root) {
2433 const char *p, *q;
2434 int r;
2435
2436 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2437 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2438 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2439 (void) mkdir_p(p, 0600);
2440
2441 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2442 if (r < 0)
2443 return log_error_errno(r, "Failed to create /run/systemd: %m");
2444
2445 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2446 if (r < 0)
2447 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2448
2449 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2450 if (r < 0)
2451 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2452
2453 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
2454 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2455 if (r < 0)
2456 return r;
2457
2458 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2459 if (r < 0)
2460 return r;
2461
2462 /* machined will MS_MOVE into that directory, and that's only
2463 * supported for non-shared mounts. */
2464 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2465 }
2466
2467 static int setup_machine_id(const char *directory) {
2468 const char *etc_machine_id;
2469 sd_id128_t id;
2470 int r;
2471
2472 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2473 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2474 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2475 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2476 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2477 * container behaves nicely). */
2478
2479 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2480
2481 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2482 if (r < 0) {
2483 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2484 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2485
2486 if (sd_id128_is_null(arg_uuid)) {
2487 r = sd_id128_randomize(&arg_uuid);
2488 if (r < 0)
2489 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2490 }
2491 } else {
2492 if (sd_id128_is_null(id))
2493 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2494 "Machine ID in container image is zero, refusing.");
2495
2496 arg_uuid = id;
2497 }
2498
2499 return 0;
2500 }
2501
2502 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2503 int r;
2504
2505 assert(directory);
2506
2507 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2508 return 0;
2509
2510 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2511 if (r == -EOPNOTSUPP)
2512 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2513 if (r == -EBADE)
2514 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2515 if (r < 0)
2516 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2517 if (r == 0)
2518 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2519 else
2520 log_debug("Patched directory tree to match UID/GID range.");
2521
2522 return r;
2523 }
2524
2525 /*
2526 * Return values:
2527 * < 0 : wait_for_terminate() failed to get the state of the
2528 * container, the container was terminated by a signal, or
2529 * failed for an unknown reason. No change is made to the
2530 * container argument.
2531 * > 0 : The program executed in the container terminated with an
2532 * error. The exit code of the program executed in the
2533 * container is returned. The container argument has been set
2534 * to CONTAINER_TERMINATED.
2535 * 0 : The container is being rebooted, has been shut down or exited
2536 * successfully. The container argument has been set to either
2537 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2538 *
2539 * That is, success is indicated by a return value of zero, and an
2540 * error is indicated by a non-zero value.
2541 */
2542 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2543 siginfo_t status;
2544 int r;
2545
2546 r = wait_for_terminate(pid, &status);
2547 if (r < 0)
2548 return log_warning_errno(r, "Failed to wait for container: %m");
2549
2550 switch (status.si_code) {
2551
2552 case CLD_EXITED:
2553 if (status.si_status == 0)
2554 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2555 else
2556 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2557
2558 *container = CONTAINER_TERMINATED;
2559 return status.si_status;
2560
2561 case CLD_KILLED:
2562 if (status.si_status == SIGINT) {
2563 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2564 *container = CONTAINER_TERMINATED;
2565 return 0;
2566
2567 } else if (status.si_status == SIGHUP) {
2568 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2569 *container = CONTAINER_REBOOTED;
2570 return 0;
2571 }
2572
2573 _fallthrough_;
2574 case CLD_DUMPED:
2575 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2576 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2577
2578 default:
2579 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2580 "Container %s failed due to unknown reason.", arg_machine);
2581 }
2582 }
2583
2584 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2585 pid_t pid;
2586
2587 pid = PTR_TO_PID(userdata);
2588 if (pid > 0) {
2589 if (kill(pid, arg_kill_signal) >= 0) {
2590 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2591 sd_event_source_set_userdata(s, NULL);
2592 return 0;
2593 }
2594 }
2595
2596 sd_event_exit(sd_event_source_get_event(s), 0);
2597 return 0;
2598 }
2599
2600 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2601 pid_t pid;
2602
2603 assert(s);
2604 assert(ssi);
2605
2606 pid = PTR_TO_PID(userdata);
2607
2608 for (;;) {
2609 siginfo_t si = {};
2610
2611 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2612 return log_error_errno(errno, "Failed to waitid(): %m");
2613 if (si.si_pid == 0) /* No pending children. */
2614 break;
2615 if (si.si_pid == pid) {
2616 /* The main process we care for has exited. Return from
2617 * signal handler but leave the zombie. */
2618 sd_event_exit(sd_event_source_get_event(s), 0);
2619 break;
2620 }
2621
2622 /* Reap all other children. */
2623 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2624 }
2625
2626 return 0;
2627 }
2628
2629 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2630 pid_t pid;
2631
2632 assert(m);
2633
2634 pid = PTR_TO_PID(userdata);
2635
2636 if (arg_kill_signal > 0) {
2637 log_info("Container termination requested. Attempting to halt container.");
2638 (void) kill(pid, arg_kill_signal);
2639 } else {
2640 log_info("Container termination requested. Exiting.");
2641 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2642 }
2643
2644 return 0;
2645 }
2646
2647 static int determine_names(void) {
2648 int r;
2649
2650 if (arg_template && !arg_directory && arg_machine) {
2651
2652 /* If --template= was specified then we should not
2653 * search for a machine, but instead create a new one
2654 * in /var/lib/machine. */
2655
2656 arg_directory = path_join("/var/lib/machines", arg_machine);
2657 if (!arg_directory)
2658 return log_oom();
2659 }
2660
2661 if (!arg_image && !arg_directory) {
2662 if (arg_machine) {
2663 _cleanup_(image_unrefp) Image *i = NULL;
2664
2665 r = image_find(IMAGE_MACHINE, arg_machine, &i);
2666 if (r == -ENOENT)
2667 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2668 if (r < 0)
2669 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2670
2671 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2672 r = free_and_strdup(&arg_image, i->path);
2673 else
2674 r = free_and_strdup(&arg_directory, i->path);
2675 if (r < 0)
2676 return log_oom();
2677
2678 if (!arg_ephemeral)
2679 arg_read_only = arg_read_only || i->read_only;
2680 } else {
2681 r = safe_getcwd(&arg_directory);
2682 if (r < 0)
2683 return log_error_errno(r, "Failed to determine current directory: %m");
2684 }
2685
2686 if (!arg_directory && !arg_image)
2687 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
2688 }
2689
2690 if (!arg_machine) {
2691 if (arg_directory && path_equal(arg_directory, "/"))
2692 arg_machine = gethostname_malloc();
2693 else {
2694 if (arg_image) {
2695 char *e;
2696
2697 arg_machine = strdup(basename(arg_image));
2698
2699 /* Truncate suffix if there is one */
2700 e = endswith(arg_machine, ".raw");
2701 if (e)
2702 *e = 0;
2703 } else
2704 arg_machine = strdup(basename(arg_directory));
2705 }
2706 if (!arg_machine)
2707 return log_oom();
2708
2709 hostname_cleanup(arg_machine);
2710 if (!machine_name_is_valid(arg_machine))
2711 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
2712
2713 if (arg_ephemeral) {
2714 char *b;
2715
2716 /* Add a random suffix when this is an
2717 * ephemeral machine, so that we can run many
2718 * instances at once without manually having
2719 * to specify -M each time. */
2720
2721 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2722 return log_oom();
2723
2724 free(arg_machine);
2725 arg_machine = b;
2726 }
2727 }
2728
2729 return 0;
2730 }
2731
/* Resolves the path stored in '*p' with chase_symlinks() and replaces '*p' with the result, freeing
 * the previous string. A NULL '*p' is left alone. Returns a negative errno-style value if resolving
 * fails, otherwise the value of free_and_replace() (0 if nothing had to be done). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing stored, nothing to resolve */
        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        return free_and_replace(*p, resolved);
}
2747
2748 static int determine_uid_shift(const char *directory) {
2749 int r;
2750
2751 if (arg_userns_mode == USER_NAMESPACE_NO) {
2752 arg_uid_shift = 0;
2753 return 0;
2754 }
2755
2756 if (arg_uid_shift == UID_INVALID) {
2757 struct stat st;
2758
2759 r = stat(directory, &st);
2760 if (r < 0)
2761 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2762
2763 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2764
2765 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2766 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2767 "UID and GID base of %s don't match.", directory);
2768
2769 arg_uid_range = UINT32_C(0x10000);
2770 }
2771
2772 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2773 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2774 "UID base too high for UID range.");
2775
2776 return 0;
2777 }
2778
2779 static unsigned long effective_clone_ns_flags(void) {
2780 unsigned long flags = arg_clone_ns_flags;
2781
2782 if (arg_private_network)
2783 flags |= CLONE_NEWNET;
2784 if (arg_use_cgns)
2785 flags |= CLONE_NEWCGROUP;
2786 if (arg_userns_mode != USER_NAMESPACE_NO)
2787 flags |= CLONE_NEWUSER;
2788
2789 return flags;
2790 }
2791
2792 static int patch_sysctl(void) {
2793
2794 /* This table is inspired by runc's sysctl() function */
2795 static const struct {
2796 const char *key;
2797 bool prefix;
2798 unsigned long clone_flags;
2799 } safe_sysctl[] = {
2800 { "kernel.hostname", false, CLONE_NEWUTS },
2801 { "kernel.domainname", false, CLONE_NEWUTS },
2802 { "kernel.msgmax", false, CLONE_NEWIPC },
2803 { "kernel.msgmnb", false, CLONE_NEWIPC },
2804 { "kernel.msgmni", false, CLONE_NEWIPC },
2805 { "kernel.sem", false, CLONE_NEWIPC },
2806 { "kernel.shmall", false, CLONE_NEWIPC },
2807 { "kernel.shmmax", false, CLONE_NEWIPC },
2808 { "kernel.shmmni", false, CLONE_NEWIPC },
2809 { "fs.mqueue.", true, CLONE_NEWIPC },
2810 { "net.", true, CLONE_NEWNET },
2811 };
2812
2813 unsigned long flags;
2814 char **k, **v;
2815 int r;
2816
2817 flags = effective_clone_ns_flags();
2818
2819 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2820 bool good = false;
2821 size_t i;
2822
2823 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2824
2825 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2826 continue;
2827
2828 if (safe_sysctl[i].prefix)
2829 good = startswith(*k, safe_sysctl[i].key);
2830 else
2831 good = streq(*k, safe_sysctl[i].key);
2832
2833 if (good)
2834 break;
2835 }
2836
2837 if (!good)
2838 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2839
2840 r = sysctl_write(*k, *v);
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2843 }
2844
2845 return 0;
2846 }
2847
2848 static int inner_child(
2849 Barrier *barrier,
2850 const char *directory,
2851 bool secondary,
2852 int kmsg_socket,
2853 int rtnl_socket,
2854 int master_pty_socket,
2855 FDSet *fds) {
2856
2857 _cleanup_free_ char *home = NULL;
2858 char as_uuid[37];
2859 size_t n_env = 1;
2860 const char *envp[] = {
2861 "PATH=" DEFAULT_PATH_COMPAT,
2862 NULL, /* container */
2863 NULL, /* TERM */
2864 NULL, /* HOME */
2865 NULL, /* USER */
2866 NULL, /* LOGNAME */
2867 NULL, /* container_uuid */
2868 NULL, /* LISTEN_FDS */
2869 NULL, /* LISTEN_PID */
2870 NULL, /* NOTIFY_SOCKET */
2871 NULL
2872 };
2873 const char *exec_target;
2874 _cleanup_strv_free_ char **env_use = NULL;
2875 int r, which_failed;
2876
2877 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2878 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2879 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2880 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2881 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2882 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2883 * namespace.
2884 *
2885 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2886 * unshare(). See below. */
2887
2888 assert(barrier);
2889 assert(directory);
2890 assert(kmsg_socket >= 0);
2891
2892 log_debug("Inner child is initializing.");
2893
2894 if (arg_userns_mode != USER_NAMESPACE_NO) {
2895 /* Tell the parent, that it now can write the UID map. */
2896 (void) barrier_place(barrier); /* #1 */
2897
2898 /* Wait until the parent wrote the UID map */
2899 if (!barrier_place_and_sync(barrier)) /* #2 */
2900 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2901 "Parent died too early");
2902 }
2903
2904 r = reset_uid_gid();
2905 if (r < 0)
2906 return log_error_errno(r, "Couldn't become new root: %m");
2907
2908 r = mount_all(NULL,
2909 arg_mount_settings | MOUNT_IN_USERNS,
2910 arg_uid_shift,
2911 arg_selinux_apifs_context);
2912 if (r < 0)
2913 return r;
2914
2915 if (!arg_network_namespace_path && arg_private_network) {
2916 r = unshare(CLONE_NEWNET);
2917 if (r < 0)
2918 return log_error_errno(errno, "Failed to unshare network namespace: %m");
2919
2920 /* Tell the parent that it can setup network interfaces. */
2921 (void) barrier_place(barrier); /* #3 */
2922 }
2923
2924 r = mount_sysfs(NULL, arg_mount_settings);
2925 if (r < 0)
2926 return r;
2927
2928 /* Wait until we are cgroup-ified, so that we
2929 * can mount the right cgroup path writable */
2930 if (!barrier_place_and_sync(barrier)) /* #4 */
2931 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2932 "Parent died too early");
2933
2934 if (arg_use_cgns) {
2935 r = unshare(CLONE_NEWCGROUP);
2936 if (r < 0)
2937 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2938 r = mount_cgroups(
2939 "",
2940 arg_unified_cgroup_hierarchy,
2941 arg_userns_mode != USER_NAMESPACE_NO,
2942 arg_uid_shift,
2943 arg_uid_range,
2944 arg_selinux_apifs_context,
2945 true);
2946 if (r < 0)
2947 return r;
2948 } else {
2949 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2950 if (r < 0)
2951 return r;
2952 }
2953
2954 r = setup_boot_id();
2955 if (r < 0)
2956 return r;
2957
2958 r = setup_kmsg(kmsg_socket);
2959 if (r < 0)
2960 return r;
2961 kmsg_socket = safe_close(kmsg_socket);
2962
2963 r = mount_custom(
2964 "/",
2965 arg_custom_mounts,
2966 arg_n_custom_mounts,
2967 false,
2968 0,
2969 0,
2970 arg_selinux_apifs_context,
2971 true);
2972 if (r < 0)
2973 return r;
2974
2975 if (setsid() < 0)
2976 return log_error_errno(errno, "setsid() failed: %m");
2977
2978 if (arg_private_network)
2979 loopback_setup();
2980
2981 if (arg_expose_ports) {
2982 r = expose_port_send_rtnl(rtnl_socket);
2983 if (r < 0)
2984 return r;
2985 rtnl_socket = safe_close(rtnl_socket);
2986 }
2987
2988 if (arg_console_mode != CONSOLE_PIPE) {
2989 _cleanup_close_ int master = -1;
2990 _cleanup_free_ char *console = NULL;
2991
2992 /* Allocate a pty and make it available as /dev/console. */
2993 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
2994 if (master < 0)
2995 return log_error_errno(master, "Failed to allocate a pty: %m");
2996
2997 r = setup_dev_console(console);
2998 if (r < 0)
2999 return log_error_errno(r, "Failed to setup /dev/console: %m");
3000
3001 r = send_one_fd(master_pty_socket, master, 0);
3002 if (r < 0)
3003 return log_error_errno(r, "Failed to send master fd: %m");
3004 master_pty_socket = safe_close(master_pty_socket);
3005
3006 r = setup_stdio_as_dev_console();
3007 if (r < 0)
3008 return r;
3009 }
3010
3011 r = patch_sysctl();
3012 if (r < 0)
3013 return r;
3014
3015 if (arg_oom_score_adjust_set) {
3016 r = set_oom_score_adjust(arg_oom_score_adjust);
3017 if (r < 0)
3018 return log_error_errno(r, "Failed to adjust OOM score: %m");
3019 }
3020
3021 if (arg_cpu_set.set)
3022 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3023 return log_error_errno(errno, "Failed to set CPU affinity: %m");
3024
3025 (void) setup_hostname();
3026
3027 if (arg_personality != PERSONALITY_INVALID) {
3028 r = safe_personality(arg_personality);
3029 if (r < 0)
3030 return log_error_errno(r, "personality() failed: %m");
3031 } else if (secondary) {
3032 r = safe_personality(PER_LINUX32);
3033 if (r < 0)
3034 return log_error_errno(r, "personality() failed: %m");
3035 }
3036
3037 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3038 if (r < 0)
3039 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3040
3041 #if HAVE_SECCOMP
3042 if (arg_seccomp) {
3043
3044 if (is_seccomp_available()) {
3045
3046 r = seccomp_load(arg_seccomp);
3047 if (ERRNO_IS_SECCOMP_FATAL(r))
3048 return log_error_errno(r, "Failed to install seccomp filter: %m");
3049 if (r < 0)
3050 log_debug_errno(r, "Failed to install seccomp filter: %m");
3051 }
3052 } else
3053 #endif
3054 {
3055 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3056 if (r < 0)
3057 return r;
3058 }
3059
3060 #if HAVE_SELINUX
3061 if (arg_selinux_context)
3062 if (setexeccon(arg_selinux_context) < 0)
3063 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3064 #endif
3065
3066 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3067 * if we need to later on. */
3068 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3069 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3070
3071 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3072 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
3073 else
3074 r = change_uid_gid(arg_user, &home);
3075 if (r < 0)
3076 return r;
3077
3078 r = drop_capabilities(getuid());
3079 if (r < 0)
3080 return log_error_errno(r, "Dropping capabilities failed: %m");
3081
3082 if (arg_no_new_privileges)
3083 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3084 return log_error_errno(errno, "Failed to disable new privileges: %m");
3085
3086 /* LXC sets container=lxc, so follow the scheme here */
3087 envp[n_env++] = strjoina("container=", arg_container_service_name);
3088
3089 envp[n_env] = strv_find_prefix(environ, "TERM=");
3090 if (envp[n_env])
3091 n_env++;
3092
3093 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3094 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3095 return log_oom();
3096
3097 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3098 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3099 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3100 return log_oom();
3101
3102 assert(!sd_id128_is_null(arg_uuid));
3103
3104 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3105 return log_oom();
3106
3107 if (fdset_size(fds) > 0) {
3108 r = fdset_cloexec(fds, false);
3109 if (r < 0)
3110 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3111
3112 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3113 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3114 return log_oom();
3115 }
3116 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3117 return log_oom();
3118
3119 env_use = strv_env_merge(2, envp, arg_setenv);
3120 if (!env_use)
3121 return log_oom();
3122
3123 /* Let the parent know that we are ready and
3124 * wait until the parent is ready with the
3125 * setup, too... */
3126 if (!barrier_place_and_sync(barrier)) /* #5 */
3127 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3128 "Parent died too early");
3129
3130 if (arg_chdir)
3131 if (chdir(arg_chdir) < 0)
3132 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3133
3134 if (arg_start_mode == START_PID2) {
3135 r = stub_pid1(arg_uuid);
3136 if (r < 0)
3137 return r;
3138 }
3139
3140 log_debug("Inner child completed, invoking payload.");
3141
3142 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3143 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3144 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3145 log_close();
3146 log_set_open_when_needed(true);
3147
3148 (void) fdset_close_others(fds);
3149
3150 if (arg_start_mode == START_BOOT) {
3151 char **a;
3152 size_t m;
3153
3154 /* Automatically search for the init system */
3155
3156 m = strv_length(arg_parameters);
3157 a = newa(char*, m + 2);
3158 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3159 a[1 + m] = NULL;
3160
3161 a[0] = (char*) "/usr/lib/systemd/systemd";
3162 execve(a[0], a, env_use);
3163
3164 a[0] = (char*) "/lib/systemd/systemd";
3165 execve(a[0], a, env_use);
3166
3167 a[0] = (char*) "/sbin/init";
3168 execve(a[0], a, env_use);
3169
3170 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3171 } else if (!strv_isempty(arg_parameters)) {
3172 const char *dollar_path;
3173
3174 exec_target = arg_parameters[0];
3175
3176 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3177 * binary. */
3178 dollar_path = strv_env_get(env_use, "PATH");
3179 if (dollar_path) {
3180 if (putenv((char*) dollar_path) != 0)
3181 return log_error_errno(errno, "Failed to update $PATH: %m");
3182 }
3183
3184 execvpe(arg_parameters[0], arg_parameters, env_use);
3185 } else {
3186 if (!arg_chdir)
3187 /* If we cannot change the directory, we'll end up in /, that is expected. */
3188 (void) chdir(home ?: "/root");
3189
3190 execle("/bin/bash", "-bash", NULL, env_use);
3191 execle("/bin/sh", "-sh", NULL, env_use);
3192
3193 exec_target = "/bin/bash, /bin/sh";
3194 }
3195
3196 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3197 }
3198
3199 static int setup_sd_notify_child(void) {
3200 _cleanup_close_ int fd = -1;
3201 union sockaddr_union sa = {
3202 .un.sun_family = AF_UNIX,
3203 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3204 };
3205 int r;
3206
3207 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3208 if (fd < 0)
3209 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3210
3211 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3212 (void) sockaddr_un_unlink(&sa.un);
3213
3214 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3215 if (r < 0)
3216 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3217
3218 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3219 if (r < 0)
3220 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3221
3222 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3223 if (r < 0)
3224 return log_error_errno(r, "SO_PASSCRED failed: %m");
3225
3226 return TAKE_FD(fd);
3227 }
3228
/* Body of the "outer" child process.
 *
 * Prepares the container tree at 'directory': mounts the dissected image (if any), settles the UID shift
 * with the parent over uid_shift_socket, applies pivot root and volatile mode, reports the cgroup mode
 * over unified_cgroup_hierarchy_socket when it was still unknown, runs the mount and setup_*() steps and
 * finally calls mount_move_root(). It then binds the notification socket and clones the "inner" child,
 * which carries on in inner_child().
 *
 * On the outer side of the clone the inner child's PID is sent over pid_socket, arg_uuid over uuid_socket
 * and the notification socket fd over notify_socket. Afterwards pid_socket, uuid_socket, notify_socket,
 * master_pty_socket, kmsg_socket, rtnl_socket and netns_fd are closed.
 *
 * Returns 0 on success, and the error of the failing step otherwise (most of them are logged where they
 * occur). In the cloned child the function normally does not return: it _exit()s with EXIT_SUCCESS or
 * EXIT_FAILURE depending on the result of inner_child(). */
3229 static int outer_child(
3230 Barrier *barrier,
3231 const char *directory,
3232 DissectedImage *dissected_image,
3233 bool secondary,
3234 int pid_socket,
3235 int uuid_socket,
3236 int notify_socket,
3237 int kmsg_socket,
3238 int rtnl_socket,
3239 int uid_shift_socket,
3240 int master_pty_socket,
3241 int unified_cgroup_hierarchy_socket,
3242 FDSet *fds,
3243 int netns_fd) {
3244
3245 _cleanup_close_ int fd = -1; /* notification socket from setup_sd_notify_child(), sent over notify_socket below */
3246 pid_t pid; /* result of raw_clone(): 0 in the inner child, the child's PID on our side */
3247 ssize_t l; /* byte count returned by send()/recv(), checked against the expected size */
3248 int r;
3249
3250 /* This is the "outer" child process, i.e. the one forked off by the container manager itself. It already has
3251 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3252 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3253 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3254
3255 assert(barrier);
3256 assert(directory);
3257 assert(pid_socket >= 0);
3258 assert(uuid_socket >= 0);
3259 assert(notify_socket >= 0);
3260 assert(master_pty_socket >= 0);
3261 assert(kmsg_socket >= 0);
3262
3263 log_debug("Outer child is initializing.");
3264
3265 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3266 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3267
3268 r = reset_audit_loginuid();
3269 if (r < 0)
3270 return r;
3271
3272 /* Mark everything as slave, so that we still
3273 * receive mounts from the real root, but don't
3274 * propagate mounts to the real root. */
3275 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3276 if (r < 0)
3277 return r;
3278
3279 if (dissected_image) {
3280 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3281 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3282 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3283 * makes sure ESP partitions and userns are compatible. */
3284
3285 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3286 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3287 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3288 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3289 if (r < 0)
3290 return r;
3291 }
3292
3293 r = determine_uid_shift(directory);
3294 if (r < 0)
3295 return r;
3296
3297 if (arg_userns_mode != USER_NAMESPACE_NO) {
3298 /* Let the parent know which UID shift we read from the image */
3299 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3300 if (l < 0)
3301 return log_error_errno(errno, "Failed to send UID shift: %m");
3302 if (l != sizeof(arg_uid_shift))
3303 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3304 "Short write while sending UID shift.");
3305
3306 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3307 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3308 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3309 * not it will pick a different one, and send it back to us. */
3310
3311 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3312 if (l < 0)
3313 return log_error_errno(errno, "Failed to recv UID shift: %m");
3314 if (l != sizeof(arg_uid_shift))
3315 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3316 "Short read while receiving UID shift.");
3317 }
3318
3319 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3320 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3321 }
3322
3323 if (path_equal(directory, "/")) {
3324 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3325 * place, so that we can make changes to its mount structure (for example, to implement
3326 * --volatile=) without this interfering with our ability to access files such as
3327 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3328 * (instead of a temporary directory, since we are living in our own mount namespace here
3329 * already, and thus don't need to be afraid of colliding with anyone else's mounts).*/
3330 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3331
3332 r = mount_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3333 if (r < 0)
3334 return r;
3335
/* From here on every step below operates on the bind mount rather than on "/" itself. */
3336 directory = "/run/systemd/nspawn-root";
3337
3338 } else if (!dissected_image) {
3339 /* Turn directory into bind mount (we need that so that we can move the bind mount to root
3340 * later on). */
3341 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3342 if (r < 0)
3343 return r;
3344 }
3345
3346 r = setup_pivot_root(
3347 directory,
3348 arg_pivot_root_new,
3349 arg_pivot_root_old);
3350 if (r < 0)
3351 return r;
3352
3353 r = setup_volatile_mode(
3354 directory,
3355 arg_volatile_mode,
3356 arg_userns_mode != USER_NAMESPACE_NO,
3357 arg_uid_shift,
3358 arg_uid_range,
3359 arg_selinux_apifs_context);
3360 if (r < 0)
3361 return r;
3362
3363 if (dissected_image) {
3364 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3365 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3366 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3367 if (r < 0)
3368 return r;
3369 }
3370
3371 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3372 /* OK, we don't know which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3373
3374 r = detect_unified_cgroup_hierarchy_from_image(directory);
3375 if (r < 0)
3376 return r;
3377
3378 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3379 if (l < 0)
3380 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3381 if (l != sizeof(arg_unified_cgroup_hierarchy))
3382 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3383 "Short write while sending cgroup mode.");
3384
3385 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3386 }
3387
3388 /* Mark everything as shared so our mounts get propagated down. This is
3389 * required to make new bind mounts available in systemd services
3390 * inside the container that create a new mount namespace.
3391 * See https://github.com/systemd/systemd/issues/3860
3392 * Further submounts (such as /dev) done after this will inherit the
3393 * shared propagation mode. */
3394 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3395 if (r < 0)
3396 return r;
3397
3398 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3399 if (r < 0)
3400 return r;
3401
3402 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3403 if (r < 0)
3404 return r;
3405
3406 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
3407 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3408 if (r < 0)
3409 return log_error_errno(r, "Failed to make tree read-only: %m");
3410 }
3411
3412 r = mount_all(directory,
3413 arg_mount_settings,
3414 arg_uid_shift,
3415 arg_selinux_apifs_context);
3416 if (r < 0)
3417 return r;
3418
3419 r = copy_devnodes(directory);
3420 if (r < 0)
3421 return r;
3422
3423 r = make_extra_nodes(directory);
3424 if (r < 0)
3425 return r;
3426
/* Unlike the surrounding steps these two are best effort: their results are ignored. */
3427 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3428 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
3429
3430 r = setup_pts(directory);
3431 if (r < 0)
3432 return r;
3433
3434 r = setup_propagate(directory);
3435 if (r < 0)
3436 return r;
3437
3438 r = setup_keyring();
3439 if (r < 0)
3440 return r;
3441
3442 r = setup_timezone(directory);
3443 if (r < 0)
3444 return r;
3445
3446 r = setup_resolv_conf(directory);
3447 if (r < 0)
3448 return r;
3449
3450 r = setup_machine_id(directory);
3451 if (r < 0)
3452 return r;
3453
3454 r = setup_journal(directory);
3455 if (r < 0)
3456 return r;
3457
3458 r = mount_custom(
3459 directory,
3460 arg_custom_mounts,
3461 arg_n_custom_mounts,
3462 arg_userns_mode != USER_NAMESPACE_NO,
3463 arg_uid_shift,
3464 arg_uid_range,
3465 arg_selinux_apifs_context,
3466 false);
3467 if (r < 0)
3468 return r;
3469
/* Only mount the cgroup trees here when arg_use_cgns is off; no cgroup mount happens in this function otherwise. */
3470 if (!arg_use_cgns) {
3471 r = mount_cgroups(
3472 directory,
3473 arg_unified_cgroup_hierarchy,
3474 arg_userns_mode != USER_NAMESPACE_NO,
3475 arg_uid_shift,
3476 arg_uid_range,
3477 arg_selinux_apifs_context,
3478 false);
3479 if (r < 0)
3480 return r;
3481 }
3482
3483 r = mount_move_root(directory);
3484 if (r < 0)
3485 return log_error_errno(r, "Failed to move root directory: %m");
3486
/* Create and bind the notification socket now; its fd is passed on over notify_socket once the inner
 * child has been cloned. */
3487 fd = setup_sd_notify_child();
3488 if (fd < 0)
3489 return fd;
3490
/* Clone the inner child: always with a new mount namespace, plus whatever arg_clone_ns_flags requests,
 * plus a new user namespace if user namespacing is enabled. */
3491 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3492 arg_clone_ns_flags |
3493 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3494 if (pid < 0)
3495 return log_error_errno(errno, "Failed to fork inner child: %m");
3496 if (pid == 0) {
/* Inner child: drop our copies of the sockets that only the outer child uses in this function. */
3497 pid_socket = safe_close(pid_socket);
3498 uuid_socket = safe_close(uuid_socket);
3499 notify_socket = safe_close(notify_socket);
3500 uid_shift_socket = safe_close(uid_shift_socket);
3501
3502 /* The inner child has all namespaces that are
3503 * requested, so that we all are owned by the user if
3504 * user namespaces are turned on. */
3505
3506 if (arg_network_namespace_path) {
3507 r = namespace_enter(-1, -1, netns_fd, -1, -1);
/* NOTE(review): this path returns from the cloned child instead of calling _exit() like the code
 * below does; presumably the caller exits based on our return value — confirm. */
3508 if (r < 0)
3509 return log_error_errno(r, "Failed to join network namespace: %m");
3510 }
3511
3512 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds);
3513 if (r < 0)
3514 _exit(EXIT_FAILURE);
3515
3516 _exit(EXIT_SUCCESS);
3517 }
3518
/* Outer side of the clone: report the inner child's PID, the machine ID and the notification socket
 * over the sockets we were handed. */
3519 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3520 if (l < 0)
3521 return log_error_errno(errno, "Failed to send PID: %m");
3522 if (l != sizeof(pid))
3523 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3524 "Short write while sending PID.");
3525
3526 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3527 if (l < 0)
3528 return log_error_errno(errno, "Failed to send machine ID: %m");
3529 if (l != sizeof(arg_uuid))
3530 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3531 "Short write while sending machine ID.");
3532
3533 l = send_one_fd(notify_socket, fd, 0);
3534 if (l < 0)
3535 return log_error_errno(l, "Failed to send notify fd: %m");
3536
3537 pid_socket = safe_close(pid_socket);
3538 uuid_socket = safe_close(uuid_socket);
3539 notify_socket = safe_close(notify_socket);
3540 master_pty_socket = safe_close(master_pty_socket);
3541 kmsg_socket = safe_close(kmsg_socket);
3542 rtnl_socket = safe_close(rtnl_socket);
3543 netns_fd = safe_close(netns_fd);
3544
3545 return 0;
3546 }
3547
3548 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3549 bool tried_hashed = false;
3550 unsigned n_tries = 100;
3551 uid_t candidate;
3552 int r;
3553
3554 assert(shift);
3555 assert(ret_lock_file);
3556 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3557 assert(arg_uid_range == 0x10000U);
3558
3559 candidate = *shift;
3560
3561 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3562
3563 for (;;) {
3564 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3565 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3566
3567 if (--n_tries <= 0)
3568 return -EBUSY;
3569
3570 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3571 goto next;
3572 if ((candidate & UINT32_C(0xFFFF)) != 0)
3573 goto next;
3574
3575 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3576 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3577 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3578 goto next;
3579 if (r < 0)
3580 return r;
3581
3582 /* Make some superficial checks whether the range is currently known in the user database */
3583 if (getpwuid(candidate))
3584 goto next;
3585 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3586 goto next;
3587 if (getgrgid(candidate))
3588 goto next;
3589 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3590 goto next;
3591
3592 *ret_lock_file = lf;
3593 lf = (struct LockFile) LOCK_FILE_INIT;
3594 *shift = candidate;
3595 return 0;
3596
3597 next:
3598 if (arg_machine && !tried_hashed) {
3599 /* Try to hash the base from the container name */
3600
3601 static const uint8_t hash_key[] = {
3602 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3603 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3604 };
3605
3606 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3607
3608 tried_hashed = true;
3609 } else
3610 random_bytes(&candidate, sizeof(candidate));
3611
3612 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3613 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3614 }
3615 }
3616
3617 static int setup_uid_map(pid_t pid) {
3618 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3619 int r;
3620
3621 assert(pid > 1);
3622
3623 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3624 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3625 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3626 if (r < 0)
3627 return log_error_errno(r, "Failed to write UID map: %m");
3628
3629 /* We always assign the same UID and GID ranges */
3630 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3631 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3632 if (r < 0)
3633 return log_error_errno(r, "Failed to write GID map: %m");
3634
3635 return 0;
3636 }
3637
3638 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3639 char buf[NOTIFY_BUFFER_MAX+1];
3640 char *p = NULL;
3641 struct iovec iovec = {
3642 .iov_base = buf,
3643 .iov_len = sizeof(buf)-1,
3644 };
3645 union {
3646 struct cmsghdr cmsghdr;
3647 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3648 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3649 } control = {};
3650 struct msghdr msghdr = {
3651 .msg_iov = &iovec,
3652 .msg_iovlen = 1,
3653 .msg_control = &control,
3654 .msg_controllen = sizeof(control),
3655 };
3656 struct cmsghdr *cmsg;
3657 struct ucred *ucred = NULL;
3658 ssize_t n;
3659 pid_t inner_child_pid;
3660 _cleanup_strv_free_ char **tags = NULL;
3661
3662 assert(userdata);
3663
3664 inner_child_pid = PTR_TO_PID(userdata);
3665
3666 if (revents != EPOLLIN) {
3667 log_warning("Got unexpected poll event for notify fd.");
3668 return 0;
3669 }
3670
3671 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3672 if (n < 0) {
3673 if (IN_SET(errno, EAGAIN, EINTR))
3674 return 0;
3675
3676 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3677 }
3678 cmsg_close_all(&msghdr);
3679
3680 CMSG_FOREACH(cmsg, &msghdr) {
3681 if (cmsg->cmsg_level == SOL_SOCKET &&
3682 cmsg->cmsg_type == SCM_CREDENTIALS &&
3683 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3684
3685 ucred = (struct ucred*) CMSG_DATA(cmsg);
3686 }
3687 }
3688
3689 if (!ucred || ucred->pid != inner_child_pid) {
3690 log_debug("Received notify message without valid credentials. Ignoring.");
3691 return 0;
3692 }
3693
3694 if ((size_t) n >= sizeof(buf)) {
3695 log_warning("Received notify message exceeded maximum size. Ignoring.");
3696 return 0;
3697 }
3698
3699 buf[n] = 0;
3700 tags = strv_split(buf, "\n\r");
3701 if (!tags)
3702 return log_oom();
3703
3704 if (strv_find(tags, "READY=1"))
3705 (void) sd_notifyf(false, "READY=1\n");
3706
3707 p = strv_find_startswith(tags, "STATUS=");
3708 if (p)
3709 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
3710
3711 return 0;
3712 }
3713
3714 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3715 int r;
3716
3717 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3718 if (r < 0)
3719 return log_error_errno(r, "Failed to allocate notify event source: %m");
3720
3721 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3722
3723 return 0;
3724 }
3725
3726 static int merge_settings(Settings *settings, const char *path) {
3727 int rl;
3728
3729 assert(settings);
3730 assert(path);
3731
3732 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3733 * that this steals the fields of the Settings* structure, and hence modifies it. */
3734
3735 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3736 settings->start_mode >= 0) {
3737 arg_start_mode = settings->start_mode;
3738 strv_free_and_replace(arg_parameters, settings->parameters);
3739 }
3740
3741 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3742 arg_ephemeral = settings->ephemeral;
3743
3744 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3745 settings->root) {
3746
3747 if (!arg_settings_trusted)
3748 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3749 else
3750 free_and_replace(arg_directory, settings->root);
3751 }
3752
3753 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3754 settings->pivot_root_new) {
3755 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3756 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3757 }
3758
3759 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3760 settings->working_directory)
3761 free_and_replace(arg_chdir, settings->working_directory);
3762
3763 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3764 settings->environment)
3765 strv_free_and_replace(arg_setenv, settings->environment);
3766
3767 if ((arg_settings_mask & SETTING_USER) == 0) {
3768
3769 if (settings->user)
3770 free_and_replace(arg_user, settings->user);
3771
3772 if (uid_is_valid(settings->uid))
3773 arg_uid = settings->uid;
3774 if (gid_is_valid(settings->gid))
3775 arg_gid = settings->gid;
3776 if (settings->n_supplementary_gids > 0) {
3777 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3778 arg_n_supplementary_gids = settings->n_supplementary_gids;
3779 }
3780 }
3781
3782 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3783 uint64_t plus, minus;
3784
3785 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3786 * Settings structure */
3787
3788 plus = settings->capability;
3789 minus = settings->drop_capability;
3790
3791 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3792 if (settings_private_network(settings))
3793 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3794 else
3795 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3796 }
3797
3798 if (!arg_settings_trusted && plus != 0) {
3799 if (settings->capability != 0)
3800 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3801 } else
3802 arg_caps_retain |= plus;
3803
3804 arg_caps_retain &= ~minus;
3805
3806 /* Copy the full capabilities over too */
3807 if (capability_quintet_is_set(&settings->full_capabilities)) {
3808 if (!arg_settings_trusted)
3809 log_warning("Ignoring capability settings, file %s is not trusted.", path);
3810 else
3811 arg_full_capabilities = settings->full_capabilities;
3812 }
3813 }
3814
3815 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3816 settings->kill_signal > 0)
3817 arg_kill_signal = settings->kill_signal;
3818
3819 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3820 settings->personality != PERSONALITY_INVALID)
3821 arg_personality = settings->personality;
3822
3823 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3824 !sd_id128_is_null(settings->machine_id)) {
3825
3826 if (!arg_settings_trusted)
3827 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3828 else
3829 arg_uuid = settings->machine_id;
3830 }
3831
3832 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3833 settings->read_only >= 0)
3834 arg_read_only = settings->read_only;
3835
3836 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3837 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3838 arg_volatile_mode = settings->volatile_mode;
3839
3840 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3841 settings->n_custom_mounts > 0) {
3842
3843 if (!arg_settings_trusted)
3844 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3845 else {
3846 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3847 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3848 arg_n_custom_mounts = settings->n_custom_mounts;
3849 settings->n_custom_mounts = 0;
3850 }
3851 }
3852
3853 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3854 (settings->private_network >= 0 ||
3855 settings->network_veth >= 0 ||
3856 settings->network_bridge ||
3857 settings->network_zone ||
3858 settings->network_interfaces ||
3859 settings->network_macvlan ||
3860 settings->network_ipvlan ||
3861 settings->network_veth_extra ||
3862 settings->network_namespace_path)) {
3863
3864 if (!arg_settings_trusted)
3865 log_warning("Ignoring network settings, file %s is not trusted.", path);
3866 else {
3867 arg_network_veth = settings_network_veth(settings);
3868 arg_private_network = settings_private_network(settings);
3869
3870 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3871 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3872 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3873 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3874
3875 free_and_replace(arg_network_bridge, settings->network_bridge);
3876 free_and_replace(arg_network_zone, settings->network_zone);
3877
3878 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
3879 }
3880 }
3881
3882 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3883 settings->expose_ports) {
3884
3885 if (!arg_settings_trusted)
3886 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3887 else {
3888 expose_port_free_all(arg_expose_ports);
3889 arg_expose_ports = TAKE_PTR(settings->expose_ports);
3890 }
3891 }
3892
3893 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3894 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3895
3896 if (!arg_settings_trusted)
3897 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3898 else {
3899 arg_userns_mode = settings->userns_mode;
3900 arg_uid_shift = settings->uid_shift;
3901 arg_uid_range = settings->uid_range;
3902 arg_userns_chown = settings->userns_chown;
3903 }
3904 }
3905
3906 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3907 arg_notify_ready = settings->notify_ready;
3908
3909 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3910
3911 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
3912 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3913 else {
3914 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3915 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3916 }
3917
3918 #if HAVE_SECCOMP
3919 if (!arg_settings_trusted && settings->seccomp)
3920 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3921 else {
3922 seccomp_release(arg_seccomp);
3923 arg_seccomp = TAKE_PTR(settings->seccomp);
3924 }
3925 #endif
3926 }
3927
3928 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3929 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3930 continue;
3931
3932 if (!settings->rlimit[rl])
3933 continue;
3934
3935 if (!arg_settings_trusted) {
3936 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3937 continue;
3938 }
3939
3940 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3941 }
3942
3943 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3944 settings->hostname)
3945 free_and_replace(arg_hostname, settings->hostname);
3946
3947 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3948 settings->no_new_privileges >= 0)
3949 arg_no_new_privileges = settings->no_new_privileges;
3950
3951 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3952 settings->oom_score_adjust_set) {
3953
3954 if (!arg_settings_trusted)
3955 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3956 else {
3957 arg_oom_score_adjust = settings->oom_score_adjust;
3958 arg_oom_score_adjust_set = true;
3959 }
3960 }
3961
3962 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3963 settings->cpu_set.set) {
3964
3965 if (!arg_settings_trusted)
3966 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3967 else {
3968 cpu_set_reset(&arg_cpu_set);
3969 arg_cpu_set = settings->cpu_set;
3970 settings->cpu_set = (CPUSet) {};
3971 }
3972 }
3973
3974 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3975 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3976 arg_resolv_conf = settings->resolv_conf;
3977
3978 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3979 settings->link_journal != _LINK_JOURNAL_INVALID) {
3980
3981 if (!arg_settings_trusted)
3982 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3983 else {
3984 arg_link_journal = settings->link_journal;
3985 arg_link_journal_try = settings->link_journal_try;
3986 }
3987 }
3988
3989 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3990 settings->timezone != _TIMEZONE_MODE_INVALID)
3991 arg_timezone = settings->timezone;
3992
3993 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3994 settings->slice) {
3995
3996 if (!arg_settings_trusted)
3997 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3998 else
3999 free_and_replace(arg_slice, settings->slice);
4000 }
4001
4002 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
4003 settings->use_cgns >= 0) {
4004
4005 if (!arg_settings_trusted)
4006 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
4007 else
4008 arg_use_cgns = settings->use_cgns;
4009 }
4010
4011 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
4012 settings->clone_ns_flags != (unsigned long) -1) {
4013
4014 if (!arg_settings_trusted)
4015 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
4016 else
4017 arg_clone_ns_flags = settings->clone_ns_flags;
4018 }
4019
4020 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
4021 settings->console_mode >= 0) {
4022
4023 if (!arg_settings_trusted)
4024 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
4025 else
4026 arg_console_mode = settings->console_mode;
4027 }
4028
4029 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4030 * don't consult arg_settings_mask for them. */
4031
4032 sd_bus_message_unref(arg_property_message);
4033 arg_property_message = TAKE_PTR(settings->properties);
4034
4035 arg_console_width = settings->console_width;
4036 arg_console_height = settings->console_height;
4037
4038 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
4039 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
4040 arg_n_extra_nodes = settings->n_extra_nodes;
4041
4042 return 0;
4043 }
4044
4045 static int load_settings(void) {
4046 _cleanup_(settings_freep) Settings *settings = NULL;
4047 _cleanup_fclose_ FILE *f = NULL;
4048 _cleanup_free_ char *p = NULL;
4049 const char *fn, *i;
4050 int r;
4051
4052 if (arg_oci_bundle)
4053 return 0;
4054
4055 /* If all settings are masked, there's no point in looking for
4056 * the settings file */
4057 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4058 return 0;
4059
4060 fn = strjoina(arg_machine, ".nspawn");
4061
4062 /* We first look in the admin's directories in /etc and /run */
4063 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4064 _cleanup_free_ char *j = NULL;
4065
4066 j = path_join(i, fn);
4067 if (!j)
4068 return log_oom();
4069
4070 f = fopen(j, "re");
4071 if (f) {
4072 p = TAKE_PTR(j);
4073
4074 /* By default, we trust configuration from /etc and /run */
4075 if (arg_settings_trusted < 0)
4076 arg_settings_trusted = true;
4077
4078 break;
4079 }
4080
4081 if (errno != ENOENT)
4082 return log_error_errno(errno, "Failed to open %s: %m", j);
4083 }
4084
4085 if (!f) {
4086 /* After that, let's look for a file next to the
4087 * actual image we shall boot. */
4088
4089 if (arg_image) {
4090 p = file_in_same_dir(arg_image, fn);
4091 if (!p)
4092 return log_oom();
4093 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4094 p = file_in_same_dir(arg_directory, fn);
4095 if (!p)
4096 return log_oom();
4097 }
4098
4099 if (p) {
4100 f = fopen(p, "re");
4101 if (!f && errno != ENOENT)
4102 return log_error_errno(errno, "Failed to open %s: %m", p);
4103
4104 /* By default, we do not trust configuration from /var/lib/machines */
4105 if (arg_settings_trusted < 0)
4106 arg_settings_trusted = false;
4107 }
4108 }
4109
4110 if (!f)
4111 return 0;
4112
4113 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4114
4115 r = settings_load(f, p, &settings);
4116 if (r < 0)
4117 return r;
4118
4119 return merge_settings(settings, p);
4120 }
4121
4122 static int load_oci_bundle(void) {
4123 _cleanup_(settings_freep) Settings *settings = NULL;
4124 int r;
4125
4126 if (!arg_oci_bundle)
4127 return 0;
4128
4129 /* By default let's trust OCI bundles */
4130 if (arg_settings_trusted < 0)
4131 arg_settings_trusted = true;
4132
4133 r = oci_load(NULL, arg_oci_bundle, &settings);
4134 if (r < 0)
4135 return r;
4136
4137 return merge_settings(settings, arg_oci_bundle);
4138 }
4139
4140 static int run_container(
4141 DissectedImage *dissected_image,
4142 bool secondary,
4143 FDSet *fds,
4144 char veth_name[IFNAMSIZ], bool *veth_created,
4145 union in_addr_union *exposed,
4146 int *master, pid_t *pid, int *ret) {
4147
4148 static const struct sigaction sa = {
4149 .sa_handler = nop_signal_handler,
4150 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4151 };
4152
4153 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4154 _cleanup_close_ int etc_passwd_lock = -1;
4155 _cleanup_close_pair_ int
4156 kmsg_socket_pair[2] = { -1, -1 },
4157 rtnl_socket_pair[2] = { -1, -1 },
4158 pid_socket_pair[2] = { -1, -1 },
4159 uuid_socket_pair[2] = { -1, -1 },
4160 notify_socket_pair[2] = { -1, -1 },
4161 uid_shift_socket_pair[2] = { -1, -1 },
4162 master_pty_socket_pair[2] = { -1, -1 },
4163 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4164
4165 _cleanup_close_ int notify_socket = -1;
4166 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4167 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4168 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4169 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4170 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4171 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4172 ContainerStatus container_status = 0;
4173 int ifi = 0, r;
4174 ssize_t l;
4175 sigset_t mask_chld;
4176 _cleanup_close_ int netns_fd = -1;
4177
4178 assert_se(sigemptyset(&mask_chld) == 0);
4179 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4180
4181 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4182 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4183 * check with getpwuid() if the specific user already exists. Note that /etc might be
4184 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4185 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4186 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4187 * really ours. */
4188
4189 etc_passwd_lock = take_etc_passwd_lock(NULL);
4190 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4191 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4192 }
4193
4194 r = barrier_create(&barrier);
4195 if (r < 0)
4196 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4197
4198 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4199 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4200
4201 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4202 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4203
4204 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4205 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4206
4207 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4208 return log_error_errno(errno, "Failed to create id socket pair: %m");
4209
4210 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4211 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4212
4213 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4214 return log_error_errno(errno, "Failed to create console socket pair: %m");
4215
4216 if (arg_userns_mode != USER_NAMESPACE_NO)
4217 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4218 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4219
4220 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4221 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4222 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4223
4224 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4225 * parent's blocking calls and give it a chance to call wait() and terminate. */
4226 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4227 if (r < 0)
4228 return log_error_errno(errno, "Failed to change the signal mask: %m");
4229
4230 r = sigaction(SIGCHLD, &sa, NULL);
4231 if (r < 0)
4232 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4233
4234 if (arg_network_namespace_path) {
4235 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4236 if (netns_fd < 0)
4237 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4238
4239 r = fd_is_network_ns(netns_fd);
4240 if (r == -EUCLEAN)
4241 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4242 else if (r < 0)
4243 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4244 else if (r == 0)
4245 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4246 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4247 }
4248
4249 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4250 if (*pid < 0)
4251 return log_error_errno(errno, "clone() failed%s: %m",
4252 errno == EINVAL ?
4253 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4254
4255 if (*pid == 0) {
4256 /* The outer child only has a file system namespace. */
4257 barrier_set_role(&barrier, BARRIER_CHILD);
4258
4259 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4260 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4261 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4262 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4263 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4264 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4265 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4266 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4267
4268 (void) reset_all_signal_handlers();
4269 (void) reset_signal_mask();
4270
4271 r = outer_child(&barrier,
4272 arg_directory,
4273 dissected_image,
4274 secondary,
4275 pid_socket_pair[1],
4276 uuid_socket_pair[1],
4277 notify_socket_pair[1],
4278 kmsg_socket_pair[1],
4279 rtnl_socket_pair[1],
4280 uid_shift_socket_pair[1],
4281 master_pty_socket_pair[1],
4282 unified_cgroup_hierarchy_socket_pair[1],
4283 fds,
4284 netns_fd);
4285 if (r < 0)
4286 _exit(EXIT_FAILURE);
4287
4288 _exit(EXIT_SUCCESS);
4289 }
4290
4291 barrier_set_role(&barrier, BARRIER_PARENT);
4292
4293 fdset_close(fds);
4294
4295 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4296 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4297 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4298 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4299 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4300 master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4301 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4302 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4303
4304 if (arg_userns_mode != USER_NAMESPACE_NO) {
4305 /* The child just let us know the UID shift it might have read from the image. */
4306 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4307 if (l < 0)
4308 return log_error_errno(errno, "Failed to read UID shift: %m");
4309 if (l != sizeof arg_uid_shift)
4310 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4311
4312 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4313 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4314 * image, but if that's already in use, pick a new one, and report back to the child,
4315 * which one we now picked. */
4316
4317 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4318 if (r < 0)
4319 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4320
4321 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4322 if (l < 0)
4323 return log_error_errno(errno, "Failed to send UID shift: %m");
4324 if (l != sizeof arg_uid_shift)
4325 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4326 }
4327 }
4328
4329 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4330 /* The child let us know the support cgroup mode it might have read from the image. */
4331 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4332 if (l < 0)
4333 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4334 if (l != sizeof(arg_unified_cgroup_hierarchy))
4335 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4336 l, l == 0 ? " The child is most likely dead." : "");
4337 }
4338
4339 /* Wait for the outer child. */
4340 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4341 if (r < 0)
4342 return r;
4343 if (r != EXIT_SUCCESS)
4344 return -EIO;
4345
4346 /* And now retrieve the PID of the inner child. */
4347 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4348 if (l < 0)
4349 return log_error_errno(errno, "Failed to read inner child PID: %m");
4350 if (l != sizeof *pid)
4351 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4352
4353 /* We also retrieve container UUID in case it was generated by outer child */
4354 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4355 if (l < 0)
4356 return log_error_errno(errno, "Failed to read container machine ID: %m");
4357 if (l != sizeof(arg_uuid))
4358 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4359
4360 /* We also retrieve the socket used for notifications generated by outer child */
4361 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4362 if (notify_socket < 0)
4363 return log_error_errno(notify_socket,
4364 "Failed to receive notification socket from the outer child: %m");
4365
4366 log_debug("Init process invoked as PID "PID_FMT, *pid);
4367
4368 if (arg_userns_mode != USER_NAMESPACE_NO) {
4369 if (!barrier_place_and_sync(&barrier)) /* #1 */
4370 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4371
4372 r = setup_uid_map(*pid);
4373 if (r < 0)
4374 return r;
4375
4376 (void) barrier_place(&barrier); /* #2 */
4377 }
4378
4379 if (arg_private_network) {
4380 if (!arg_network_namespace_path) {
4381 /* Wait until the child has unshared its network namespace. */
4382 if (!barrier_place_and_sync(&barrier)) /* #3 */
4383 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4384 }
4385
4386 r = move_network_interfaces(*pid, arg_network_interfaces);
4387 if (r < 0)
4388 return r;
4389
4390 if (arg_network_veth) {
4391 r = setup_veth(arg_machine, *pid, veth_name,
4392 arg_network_bridge || arg_network_zone);
4393 if (r < 0)
4394 return r;
4395 else if (r > 0)
4396 ifi = r;
4397
4398 if (arg_network_bridge) {
4399 /* Add the interface to a bridge */
4400 r = setup_bridge(veth_name, arg_network_bridge, false);
4401 if (r < 0)
4402 return r;
4403 if (r > 0)
4404 ifi = r;
4405 } else if (arg_network_zone) {
4406 /* Add the interface to a bridge, possibly creating it */
4407 r = setup_bridge(veth_name, arg_network_zone, true);
4408 if (r < 0)
4409 return r;
4410 if (r > 0)
4411 ifi = r;
4412 }
4413 }
4414
4415 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4416 if (r < 0)
4417 return r;
4418
4419 /* We created the primary and extra veth links now; let's remember this, so that we know to
4420 remove them later on. Note that we don't bother with removing veth links that were created
4421 here when their setup failed half-way, because in that case the kernel should be able to
4422 remove them on its own, since they cannot be referenced by anything yet. */
4423 *veth_created = true;
4424
4425 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4426 if (r < 0)
4427 return r;
4428
4429 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4430 if (r < 0)
4431 return r;
4432 }
4433
4434 if (arg_register || !arg_keep_unit) {
4435 r = sd_bus_default_system(&bus);
4436 if (r < 0)
4437 return log_error_errno(r, "Failed to open system bus: %m");
4438
4439 r = sd_bus_set_close_on_exit(bus, false);
4440 if (r < 0)
4441 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4442 }
4443
4444 if (!arg_keep_unit) {
4445 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4446 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4447 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4448
4449 r = sd_bus_match_signal_async(
4450 bus,
4451 NULL,
4452 "org.freedesktop.systemd1",
4453 NULL,
4454 "org.freedesktop.systemd1.Scope",
4455 "RequestStop",
4456 on_request_stop, NULL, PID_TO_PTR(*pid));
4457 if (r < 0)
4458 return log_error_errno(r, "Failed to request RequestStop match: %m");
4459 }
4460
4461 if (arg_register) {
4462 r = register_machine(
4463 bus,
4464 arg_machine,
4465 *pid,
4466 arg_directory,
4467 arg_uuid,
4468 ifi,
4469 arg_slice,
4470 arg_custom_mounts, arg_n_custom_mounts,
4471 arg_kill_signal,
4472 arg_property,
4473 arg_property_message,
4474 arg_keep_unit,
4475 arg_container_service_name);
4476 if (r < 0)
4477 return r;
4478
4479 } else if (!arg_keep_unit) {
4480 r = allocate_scope(
4481 bus,
4482 arg_machine,
4483 *pid,
4484 arg_slice,
4485 arg_custom_mounts, arg_n_custom_mounts,
4486 arg_kill_signal,
4487 arg_property,
4488 arg_property_message);
4489 if (r < 0)
4490 return r;
4491
4492 } else if (arg_slice || arg_property)
4493 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
4494
4495 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
4496 if (r < 0)
4497 return r;
4498
4499 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4500 if (r < 0)
4501 return r;
4502
4503 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4504 if (r < 0)
4505 return r;
4506
4507 /* Notify the child that the parent is ready with all
4508 * its setup (including cgroup-ification), and that
4509 * the child can now hand over control to the code to
4510 * run inside the container. */
4511 (void) barrier_place(&barrier); /* #4 */
4512
4513 /* Block SIGCHLD here, before notifying child.
4514 * process_pty() will handle it with the other signals. */
4515 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4516
4517 /* Reset signal to default */
4518 r = default_signals(SIGCHLD, -1);
4519 if (r < 0)
4520 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4521
4522 r = sd_event_new(&event);
4523 if (r < 0)
4524 return log_error_errno(r, "Failed to get default event source: %m");
4525
4526 (void) sd_event_set_watchdog(event, true);
4527
4528 if (bus) {
4529 r = sd_bus_attach_event(bus, event, 0);
4530 if (r < 0)
4531 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4532 }
4533
4534 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
4535 if (r < 0)
4536 return r;
4537
4538 /* Let the child know that we are ready and wait that the child is completely ready now. */
4539 if (!barrier_place_and_sync(&barrier)) /* #5 */
4540 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4541
4542 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4543 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4544 etc_passwd_lock = safe_close(etc_passwd_lock);
4545
4546 (void) sd_notifyf(false,
4547 "STATUS=Container running.\n"
4548 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4549 if (!arg_notify_ready)
4550 (void) sd_notify(false, "READY=1\n");
4551
4552 if (arg_kill_signal > 0) {
4553 /* Try to kill the init system on SIGINT or SIGTERM */
4554 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4555 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4556 } else {
4557 /* Immediately exit */
4558 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4559 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4560 }
4561
4562 /* Exit when the child exits */
4563 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4564
4565 if (arg_expose_ports) {
4566 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4567 if (r < 0)
4568 return r;
4569
4570 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4571 }
4572
4573 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4574
4575 if (arg_console_mode != CONSOLE_PIPE) {
4576 _cleanup_close_ int fd = -1;
4577 PTYForwardFlags flags = 0;
4578
4579 /* Retrieve the master pty allocated by inner child */
4580 fd = receive_one_fd(master_pty_socket_pair[0], 0);
4581 if (fd < 0)
4582 return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
4583
4584 switch (arg_console_mode) {
4585
4586 case CONSOLE_READ_ONLY:
4587 flags |= PTY_FORWARD_READ_ONLY;
4588
4589 _fallthrough_;
4590
4591 case CONSOLE_INTERACTIVE:
4592 flags |= PTY_FORWARD_IGNORE_VHANGUP;
4593
4594 r = pty_forward_new(event, fd, flags, &forward);
4595 if (r < 0)
4596 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4597
4598 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4599 (void) pty_forward_set_width_height(forward,
4600 arg_console_width,
4601 arg_console_height);
4602 break;
4603
4604 default:
4605 assert(arg_console_mode == CONSOLE_PASSIVE);
4606 }
4607
4608 *master = TAKE_FD(fd);
4609 }
4610
4611 r = sd_event_loop(event);
4612 if (r < 0)
4613 return log_error_errno(r, "Failed to run event loop: %m");
4614
4615 if (forward) {
4616 char last_char = 0;
4617
4618 (void) pty_forward_get_last_char(forward, &last_char);
4619 forward = pty_forward_free(forward);
4620
4621 if (!arg_quiet && last_char != '\n')
4622 putc('\n', stdout);
4623 }
4624
4625 /* Kill if it is not dead yet anyway */
4626 if (!arg_register && !arg_keep_unit && bus)
4627 terminate_scope(bus, arg_machine);
4628
4629 /* Normally redundant, but better safe than sorry */
4630 (void) kill(*pid, SIGKILL);
4631
4632 r = wait_for_container(*pid, &container_status);
4633 *pid = 0;
4634
4635 /* Tell machined that we are gone. */
4636 if (bus)
4637 (void) unregister_machine(bus, arg_machine);
4638
4639 if (r < 0)
4640 /* We failed to wait for the container, or the container exited abnormally. */
4641 return r;
4642 if (r > 0 || container_status == CONTAINER_TERMINATED) {
4643 /* r > 0 → The container exited with a non-zero status.
4644 * As a special case, we need to replace 133 with a different value,
4645 * because 133 is special-cased in the service file to reboot the container.
4646 * otherwise → The container exited with zero status and a reboot was not requested.
4647 */
4648 if (r == EXIT_FORCE_RESTART)
4649 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4650 *ret = r;
4651 return 0; /* finito */
4652 }
4653
4654 /* CONTAINER_REBOOTED, loop again */
4655
4656 if (arg_keep_unit) {
4657 /* Special handling if we are running as a service: instead of simply
4658 * restarting the machine we want to restart the entire service, so let's
4659 * inform systemd about this with the special exit code 133. The service
4660 * file uses RestartForceExitStatus=133 so that this results in a full
4661 * nspawn restart. This is necessary since we might have cgroup parameters
4662 * set we want to have flushed out. */
4663 *ret = EXIT_FORCE_RESTART;
4664 return 0; /* finito */
4665 }
4666
4667 expose_port_flush(arg_expose_ports, exposed);
4668
4669 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4670 *veth_created = false;
4671 return 1; /* loop again */
4672 }
4673
4674 static int initialize_rlimits(void) {
4675 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4676 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4677 * container execution environments. */
4678
4679 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4680 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4681 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4682 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4683 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4684 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4685 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4686 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4687 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4688 [RLIMIT_NICE] = { 0, 0 },
4689 [RLIMIT_NOFILE] = { 1024, 4096 },
4690 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4691 [RLIMIT_RTPRIO] = { 0, 0 },
4692 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4693 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4694
4695 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4696 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4697 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4698 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4699 * that PID 1 changes a number of other resource limits during early initialization which is why we
4700 * don't read the other limits from PID 1 but prefer the static table above. */
4701 };
4702
4703 int rl;
4704
4705 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4706 /* Let's only fill in what the user hasn't explicitly configured anyway */
4707 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4708 const struct rlimit *v;
4709 struct rlimit buffer;
4710
4711 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4712 /* For these two let's read the limits off PID 1. See above for an explanation. */
4713
4714 if (prlimit(1, rl, NULL, &buffer) < 0)
4715 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4716
4717 v = &buffer;
4718 } else
4719 v = kernel_defaults + rl;
4720
4721 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4722 if (!arg_rlimit[rl])
4723 return log_oom();
4724 }
4725
4726 if (DEBUG_LOGGING) {
4727 _cleanup_free_ char *k = NULL;
4728
4729 (void) rlimit_format(arg_rlimit[rl], &k);
4730 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4731 }
4732 }
4733
4734 return 0;
4735 }
4736
/* Top-level driver of the program: processes the command line and settings, prepares the container's
 * root (either a directory tree or a disk image), runs the container via run_container() and finally
 * undoes whatever was set up here.
 *
 * Every failure jumps to the common "finish" label, so the cleanup code there must cope with any
 * subset of the setup steps having been completed.
 *
 * Returns r if it is negative at the end (an errno-style error), otherwise ret. ret starts out as
 * EXIT_SUCCESS and is handed to run_container() by pointer, which may overwrite it. */
4737 static int run(int argc, char *argv[]) {
/* The remove_xyz and veth_created flags record what the cleanup code under "finish" has to undo. */
4738 bool secondary = false, remove_directory = false, remove_image = false,
4739 veth_created = false, remove_tmprootdir = false;
4740 _cleanup_close_ int master = -1;
4741 _cleanup_fdset_free_ FDSet *fds = NULL;
4742 int r, n_fd_passed, ret = EXIT_SUCCESS;
4743 char veth_name[IFNAMSIZ] = "";
4744 union in_addr_union exposed = {};
4745 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
/* Template for mkdtemp(); only turned into a real directory in the disk image code path below. */
4746 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4747 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4748 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4749 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4750 pid_t pid = 0;
4751
4752 log_parse_environment();
4753 log_open();
4754
/* Note the "<=": a zero return from parse_argv() also ends the program here, through the regular
 * cleanup path (presumably used for options such as --help that need no container; confirm in
 * parse_argv()). */
4755 r = parse_argv(argc, argv);
4756 if (r <= 0)
4757 goto finish;
4758
4759 r = must_be_root();
4760 if (r < 0)
4761 goto finish;
4762
4763 r = initialize_rlimits();
4764 if (r < 0)
4765 goto finish;
4766
4767 r = load_oci_bundle();
4768 if (r < 0)
4769 goto finish;
4770
4771 r = determine_names();
4772 if (r < 0)
4773 goto finish;
4774
4775 r = load_settings();
4776 if (r < 0)
4777 goto finish;
4778
/* Only the error status of cg_unified() is looked at here; a non-negative result is not used. */
4779 r = cg_unified();
4780 if (r < 0) {
4781 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4782 goto finish;
4783 }
4784
4785 r = verify_arguments();
4786 if (r < 0)
4787 goto finish;
4788
4789 /* Reapply environment settings. */
4790 (void) detect_unified_cgroup_hierarchy_from_environment();
4791
4792 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4793 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4794 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4795 (void) ignore_signals(SIGPIPE, -1);
4796
/* If file descriptors were passed to us, collect them in an FDSet; it is handed to run_container()
 * further down. */
4797 n_fd_passed = sd_listen_fds(false);
4798 if (n_fd_passed > 0) {
4799 r = fdset_new_listen_fds(&fds, false);
4800 if (r < 0) {
4801 log_error_errno(r, "Failed to collect file descriptors: %m");
4802 goto finish;
4803 }
4804 }
4805
4806 /* The "default" umask. This is appropriate for most file and directory
4807 * operations performed by nspawn, and is the umask that will be used for
4808 * the child. Functions like copy_devnodes() change the umask temporarily. */
4809 umask(0022);
4810
/* Two mutually exclusive ways to provide the root: a directory tree (this branch) or a disk image
 * (the else branch further down). */
4811 if (arg_directory) {
4812 assert(!arg_image);
4813
4814 /* Safety precaution: let's not allow running images from the live host OS image, as long as
4815 * /var from the host will propagate into container dynamically (because bad things happen if
4816 * two systems write to the same /var). Let's allow it for the special cases where /var is
4817 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
4818 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
4819 log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
4820 r = -EINVAL;
4821 goto finish;
4822 }
4823
4824 if (arg_ephemeral) {
4825 _cleanup_free_ char *np = NULL;
4826
4827 r = chase_symlinks_and_update(&arg_directory, 0);
4828 if (r < 0)
4829 goto finish;
4830
4831 /* If the specified path is a mount point we generate the new snapshot immediately
4832 * inside it under a random name. However if the specified is not a mount point we
4833 * create the new snapshot in the parent directory, just next to it. */
4834 r = path_is_mount_point(arg_directory, NULL, 0);
4835 if (r < 0) {
4836 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4837 goto finish;
4838 }
4839 if (r > 0)
4840 r = tempfn_random_child(arg_directory, "machine.", &np);
4841 else
4842 r = tempfn_random(arg_directory, "machine.", &np);
4843 if (r < 0) {
4844 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4845 goto finish;
4846 }
4847
4848 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
4849 * only owned by us and no one else. */
4850 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4851 if (r < 0) {
4852 log_error_errno(r, "Failed to lock %s: %m", np);
4853 goto finish;
4854 }
4855
/* NOTE(review): the extra braces presumably limit how long BLOCK_SIGNALS() keeps SIGINT blocked to
 * just the snapshot call; confirm against the macro's definition. */
4856 {
4857 BLOCK_SIGNALS(SIGINT);
4858 r = btrfs_subvol_snapshot(arg_directory, np,
4859 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4860 BTRFS_SNAPSHOT_FALLBACK_COPY |
4861 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4862 BTRFS_SNAPSHOT_RECURSIVE |
4863 BTRFS_SNAPSHOT_QUOTA |
4864 BTRFS_SNAPSHOT_SIGINT);
4865 }
4866 if (r == -EINTR) {
4867 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
4868 goto finish;
4869 }
4870 if (r < 0) {
4871 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4872 goto finish;
4873 }
4874
/* From here on arg_directory names the snapshot; flag it so the cleanup path deletes it again. */
4875 free_and_replace(arg_directory, np);
4876 remove_directory = true;
4877 } else {
4878 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4879 if (r < 0)
4880 goto finish;
4881
/* A shared lock is requested for read-only use, an exclusive one otherwise; LOCK_NB makes a busy
 * tree fail right away instead of waiting. */
4882 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4883 if (r == -EBUSY) {
4884 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4885 goto finish;
4886 }
4887 if (r < 0) {
4888 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4889 goto finish;
4890 }
4891
4892 if (arg_template) {
4893 r = chase_symlinks_and_update(&arg_template, 0);
4894 if (r < 0)
4895 goto finish;
4896
4897 {
4898 BLOCK_SIGNALS(SIGINT);
4899 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4900 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4901 BTRFS_SNAPSHOT_FALLBACK_COPY |
4902 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4903 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4904 BTRFS_SNAPSHOT_RECURSIVE |
4905 BTRFS_SNAPSHOT_QUOTA |
4906 BTRFS_SNAPSHOT_SIGINT);
4907 }
/* An already existing target directory is not an error: it is used as is and the template is
 * ignored. */
4908 if (r == -EEXIST)
4909 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4910 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4911 else if (r == -EINTR) {
4912 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
4913 goto finish;
4914 } else if (r < 0) {
4915 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4916 goto finish;
4917 } else
4918 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4919 "Populated %s from template %s.", arg_directory, arg_template);
4920 }
4921 }
4922
/* Sanity-check the tree before starting anything in it. Boot mode requires path_is_os_tree() to
 * succeed; the other modes only require a usr/ directory. In both cases the check is applied
 * below arg_pivot_root_new if that is set. */
4923 if (arg_start_mode == START_BOOT) {
4924 const char *p;
4925
4926 if (arg_pivot_root_new)
4927 p = prefix_roota(arg_directory, arg_pivot_root_new);
4928 else
4929 p = arg_directory;
4930
4931 if (path_is_os_tree(p) <= 0) {
4932 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4933 r = -EINVAL;
4934 goto finish;
4935 }
4936 } else {
4937 const char *p, *q;
4938
4939 if (arg_pivot_root_new)
4940 p = prefix_roota(arg_directory, arg_pivot_root_new);
4941 else
4942 p = arg_directory;
4943
4944 q = strjoina(p, "/usr/");
4945
4946 if (laccess(q, F_OK) < 0) {
4947 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4948 r = -EINVAL;
4949 goto finish;
4950 }
4951 }
4952
4953 } else {
4954 assert(arg_image);
4955 assert(!arg_template);
4956
4957 r = chase_symlinks_and_update(&arg_image, 0);
4958 if (r < 0)
4959 goto finish;
4960
4961 if (arg_ephemeral) {
4962 _cleanup_free_ char *np = NULL;
4963
4964 r = tempfn_random(arg_image, "machine.", &np);
4965 if (r < 0) {
4966 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4967 goto finish;
4968 }
4969
4970 /* Always take an exclusive lock on our own ephemeral copy. */
4971 r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
4972 if (r < 0) {
4973 r = log_error_errno(r, "Failed to create image lock: %m");
4974 goto finish;
4975 }
4976
4977 {
4978 BLOCK_SIGNALS(SIGINT);
4979 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
4980 }
4981 if (r == -EINTR) {
4982 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
4983 goto finish;
4984 }
4985 if (r < 0) {
4986 r = log_error_errno(r, "Failed to copy image file: %m");
4987 goto finish;
4988 }
4989
/* From here on arg_image names the private copy; flag it so it gets unlinked again. */
4990 free_and_replace(arg_image, np);
4991 remove_image = true;
4992 } else {
4993 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4994 if (r == -EBUSY) {
4995 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4996 goto finish;
4997 }
4998 if (r < 0) {
4999 r = log_error_errno(r, "Failed to create image lock: %m");
5000 goto finish;
5001 }
5002
/* Only try to load a root hash for the image if none has been set already. Note that this lookup
 * is only done in this non-ephemeral branch. */
5003 if (!arg_root_hash) {
5004 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
5005 if (r < 0) {
5006 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
5007 goto finish;
5008 }
5009 }
5010 }
5011
/* In image mode no directory was given, so create a temporary one and make it arg_directory; the
 * flag makes the cleanup path rmdir() it again. */
5012 if (!mkdtemp(tmprootdir)) {
5013 r = log_error_errno(errno, "Failed to create temporary directory: %m");
5014 goto finish;
5015 }
5016
5017 remove_tmprootdir = true;
5018
5019 arg_directory = strdup(tmprootdir);
5020 if (!arg_directory) {
5021 r = log_oom();
5022 goto finish;
5023 }
5024
5025 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
5026 if (r < 0) {
5027 log_error_errno(r, "Failed to set up loopback block device: %m");
5028 goto finish;
5029 }
5030
5031 r = dissect_image_and_warn(
5032 loop->fd,
5033 arg_image,
5034 arg_root_hash, arg_root_hash_size,
5035 DISSECT_IMAGE_REQUIRE_ROOT,
5036 &dissected_image);
5037 if (r == -ENOPKG) {
5038 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
5039 log_notice("Note that the disk image needs to\n"
5040 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5041 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5042 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
5043 " d) or contain a file system without a partition table\n"
5044 "in order to be bootable with systemd-nspawn.");
5045 goto finish;
5046 }
5047 if (r < 0)
5048 goto finish;
5049
5050 if (!arg_root_hash && dissected_image->can_verity)
5051 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
5052
5053 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
5054 if (r < 0)
5055 goto finish;
5056
5057 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5058 if (remove_image && unlink(arg_image) >= 0)
5059 remove_image = false;
5060 }
5061
5062 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5063 if (r < 0)
5064 goto finish;
5065
/* No console mode chosen so far (negative value): go interactive only if both stdin and stdout are
 * TTYs, otherwise fall back to the read-only console. */
5066 if (arg_console_mode < 0)
5067 arg_console_mode =
5068 isatty(STDIN_FILENO) > 0 &&
5069 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5070
5071 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5072 arg_quiet = true;
5073
5074 if (!arg_quiet)
5075 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5076 arg_machine, arg_image ?: arg_directory);
5077
5078 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5079
5080 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5081 r = log_error_errno(errno, "Failed to become subreaper: %m");
5082 goto finish;
5083 }
5084
/* Keep invoking run_container() for as long as it returns a positive value; zero or an error ends
 * the loop (a positive value presumably means the container asked to be restarted; confirm in
 * run_container()). */
5085 for (;;) {
5086 r = run_container(dissected_image,
5087 secondary,
5088 fds,
5089 veth_name, &veth_created,
5090 &exposed, &master,
5091 &pid, &ret);
5092 if (r <= 0)
5093 break;
5094 }
5095
/* Common exit path, reached both after the loop above and from every "goto finish". Here r < 0
 * means failure; otherwise ret holds the exit status to report. */
5096 finish:
5097 (void) sd_notify(false,
5098 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5099 "STOPPING=1\nSTATUS=Terminating...");
5100
/* pid is only non-zero if run_container() stored a process ID through the pointer it was given. */
5101 if (pid > 0)
5102 (void) kill(pid, SIGKILL);
5103
5104 /* Try to flush whatever is still queued in the pty */
5105 if (master >= 0) {
5106 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
5107 master = safe_close(master);
5108 }
5109
5110 if (pid > 0)
5111 (void) wait_for_terminate(pid, NULL);
5112
5113 pager_close();
5114
/* Undo the ephemeral/temporary objects created above. Failures are only logged, never returned. */
5115 if (remove_directory && arg_directory) {
5116 int k;
5117
5118 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5119 if (k < 0)
5120 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5121 }
5122
5123 if (remove_image && arg_image) {
5124 if (unlink(arg_image) < 0)
5125 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5126 }
5127
5128 if (remove_tmprootdir) {
5129 if (rmdir(tmprootdir) < 0)
5130 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5131 }
5132
/* Remove this machine's directory below /run/systemd/nspawn/propagate/, ignoring any failure. */
5133 if (arg_machine) {
5134 const char *p;
5135
5136 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5137 (void) rm_rf(p, REMOVE_ROOT);
5138 }
5139
5140 expose_port_flush(arg_expose_ports, &exposed);
5141
5142 if (veth_created)
5143 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5144 (void) remove_bridge(arg_network_zone);
5145
5146 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5147 expose_port_free_all(arg_expose_ports);
5148 rlimit_free_all(arg_rlimit);
5149 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5150
/* A negative r (failure) takes precedence over the exit status collected in ret. */
5151 if (r < 0)
5152 return r;
5153
5154 return ret;
5155 }
5156
/* Hooks run() up as the program's entry point. NOTE(review): judging by the macro's name, positive
 * return values of run() are presumably treated as failure exit codes too, in addition to negative
 * errno-style ones; confirm against the definition in main-func.h. */
5157 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);