]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
676775d98c1185c98e1a56a4bc596266942c2f85
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_BLKID
4 #include <blkid.h>
5 #endif
6 #include <errno.h>
7 #include <getopt.h>
8 #include <grp.h>
9 #include <linux/fs.h>
10 #include <linux/loop.h>
11 #include <pwd.h>
12 #include <sched.h>
13 #if HAVE_SELINUX
14 #include <selinux/selinux.h>
15 #endif
16 #include <signal.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/personality.h>
22 #include <sys/prctl.h>
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 #include <unistd.h>
26
27 #include "sd-bus.h"
28 #include "sd-daemon.h"
29 #include "sd-id128.h"
30
31 #include "alloc-util.h"
32 #include "barrier.h"
33 #include "base-filesystem.h"
34 #include "blkid-util.h"
35 #include "btrfs-util.h"
36 #include "bus-error.h"
37 #include "bus-util.h"
38 #include "cap-list.h"
39 #include "capability-util.h"
40 #include "cgroup-util.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "dev-setup.h"
44 #include "dissect-image.h"
45 #include "env-util.h"
46 #include "fd-util.h"
47 #include "fdset.h"
48 #include "fileio.h"
49 #include "format-util.h"
50 #include "fs-util.h"
51 #include "gpt.h"
52 #include "hexdecoct.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "log.h"
56 #include "loop-util.h"
57 #include "loopback-setup.h"
58 #include "machine-image.h"
59 #include "macro.h"
60 #include "main-func.h"
61 #include "missing.h"
62 #include "mkdir.h"
63 #include "mount-util.h"
64 #include "mountpoint-util.h"
65 #include "namespace-util.h"
66 #include "netlink-util.h"
67 #include "nspawn-cgroup.h"
68 #include "nspawn-def.h"
69 #include "nspawn-expose-ports.h"
70 #include "nspawn-mount.h"
71 #include "nspawn-network.h"
72 #include "nspawn-oci.h"
73 #include "nspawn-patch-uid.h"
74 #include "nspawn-register.h"
75 #include "nspawn-seccomp.h"
76 #include "nspawn-settings.h"
77 #include "nspawn-setuid.h"
78 #include "nspawn-stub-pid1.h"
79 #include "nulstr-util.h"
80 #include "os-util.h"
81 #include "pager.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "pretty-print.h"
85 #include "process-util.h"
86 #include "ptyfwd.h"
87 #include "random-util.h"
88 #include "raw-clone.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #if HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "socket-util.h"
97 #include "stat-util.h"
98 #include "stdio-util.h"
99 #include "string-table.h"
100 #include "string-util.h"
101 #include "strv.h"
102 #include "sysctl-util.h"
103 #include "terminal-util.h"
104 #include "tmpfile-util.h"
105 #include "umask-util.h"
106 #include "user-util.h"
107 #include "util.h"
108
/* Path of the resolv.conf file shipped below systemd's library directory. Which prefix applies is decided at
 * build time by HAVE_SPLIT_USR. */
#if HAVE_SPLIT_USR
#  define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#else
#  define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#endif

/* nspawn listens on a socket located at NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container.
 * The init process of the container can send messages to nspawn through this socket, following the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): the users of this value are outside this part of the file; the name suggests an exit status
 * that requests a restart of the container — confirm against the call sites. */
#define EXIT_FORCE_RESTART 133
121
/* Final state of a container run.
 * NOTE(review): the code producing and consuming these values is outside this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
126
127 static char *arg_directory = NULL;
128 static char *arg_template = NULL;
129 static char *arg_chdir = NULL;
130 static char *arg_pivot_root_new = NULL;
131 static char *arg_pivot_root_old = NULL;
132 static char *arg_user = NULL;
133 static uid_t arg_uid = UID_INVALID;
134 static gid_t arg_gid = GID_INVALID;
135 static gid_t* arg_supplementary_gids = NULL;
136 static size_t arg_n_supplementary_gids = 0;
137 static sd_id128_t arg_uuid = {};
138 static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 static char *arg_hostname = NULL; /* The name the payload sees by default */
140 static const char *arg_selinux_context = NULL;
141 static const char *arg_selinux_apifs_context = NULL;
142 static char *arg_slice = NULL;
143 static bool arg_private_network = false;
144 static bool arg_read_only = false;
145 static StartMode arg_start_mode = START_PID1;
146 static bool arg_ephemeral = false;
147 static LinkJournal arg_link_journal = LINK_AUTO;
148 static bool arg_link_journal_try = false;
149 static uint64_t arg_caps_retain =
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
161 (1ULL << CAP_MKNOD) |
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
165 (1ULL << CAP_SETFCAP) |
166 (1ULL << CAP_SETGID) |
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
170 (1ULL << CAP_SYS_BOOT) |
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
174 (1ULL << CAP_SYS_RESOURCE) |
175 (1ULL << CAP_SYS_TTY_CONFIG);
176 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
177 static CustomMount *arg_custom_mounts = NULL;
178 static size_t arg_n_custom_mounts = 0;
179 static char **arg_setenv = NULL;
180 static bool arg_quiet = false;
181 static bool arg_register = true;
182 static bool arg_keep_unit = false;
183 static char **arg_network_interfaces = NULL;
184 static char **arg_network_macvlan = NULL;
185 static char **arg_network_ipvlan = NULL;
186 static bool arg_network_veth = false;
187 static char **arg_network_veth_extra = NULL;
188 static char *arg_network_bridge = NULL;
189 static char *arg_network_zone = NULL;
190 static char *arg_network_namespace_path = NULL;
191 static PagerFlags arg_pager_flags = 0;
192 static unsigned long arg_personality = PERSONALITY_INVALID;
193 static char *arg_image = NULL;
194 static char *arg_oci_bundle = NULL;
195 static VolatileMode arg_volatile_mode = VOLATILE_NO;
196 static ExposePort *arg_expose_ports = NULL;
197 static char **arg_property = NULL;
198 static sd_bus_message *arg_property_message = NULL;
199 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
200 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
201 static bool arg_userns_chown = false;
202 static int arg_kill_signal = 0;
203 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
204 static SettingsMask arg_settings_mask = 0;
205 static int arg_settings_trusted = -1;
206 static char **arg_parameters = NULL;
207 static const char *arg_container_service_name = "systemd-nspawn";
208 static bool arg_notify_ready = false;
209 static bool arg_use_cgns = true;
210 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
211 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
212 static void *arg_root_hash = NULL;
213 static size_t arg_root_hash_size = 0;
214 static char **arg_syscall_whitelist = NULL;
215 static char **arg_syscall_blacklist = NULL;
216 #if HAVE_SECCOMP
217 static scmp_filter_ctx arg_seccomp = NULL;
218 #endif
219 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
220 static bool arg_no_new_privileges = false;
221 static int arg_oom_score_adjust = 0;
222 static bool arg_oom_score_adjust_set = false;
223 static cpu_set_t *arg_cpuset = NULL;
224 static unsigned arg_cpuset_ncpus = 0;
225 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
226 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
227 static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
228 static DeviceNode* arg_extra_nodes = NULL;
229 static size_t arg_n_extra_nodes = 0;
230 static char **arg_sysctl = NULL;
231 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
232
/* Pair each dynamically allocated configuration variable with the cleanup function that releases it. The
 * entries are listed in the same order in which the variables are declared. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpuset, CPU_FREEp);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
264
265 static int help(void) {
266 _cleanup_free_ char *link = NULL;
267 int r;
268
269 (void) pager_open(arg_pager_flags);
270
271 r = terminal_urlify_man("systemd-nspawn", "1", &link);
272 if (r < 0)
273 return log_oom();
274
275 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
276 "Spawn a command or OS in a light-weight container.\n\n"
277 " -h --help Show this help\n"
278 " --version Print version string\n"
279 " -q --quiet Do not show status information\n"
280 " --no-pager Do not pipe output into a pager\n"
281 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
282 "%3$sImage:%4$s\n"
283 " -D --directory=PATH Root directory for the container\n"
284 " --template=PATH Initialize root directory from template directory,\n"
285 " if missing\n"
286 " -x --ephemeral Run container with snapshot of root directory, and\n"
287 " remove it after exit\n"
288 " -i --image=PATH Root file system disk image (or device node) for\n"
289 " the container\n"
290 " --oci-bundle=PATH OCI bundle directory\n"
291 " --read-only Mount the root directory read-only\n"
292 " --volatile[=MODE] Run the system in volatile mode\n"
293 " --root-hash=HASH Specify verity root hash for root disk image\n"
294 " --pivot-root=PATH[:PATH]\n"
295 " Pivot root to given directory in the container\n\n"
296 "%3$sExecution:%4$s\n"
297 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
298 " -b --boot Boot up full system (i.e. invoke init)\n"
299 " --chdir=PATH Set working directory in the container\n"
300 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
301 " -u --user=USER Run the command under specified user or UID\n"
302 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
303 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
304 "%3$sSystem Identity:%4$s\n"
305 " -M --machine=NAME Set the machine name for the container\n"
306 " --hostname=NAME Override the hostname for the container\n"
307 " --uuid=UUID Set a specific machine UUID for the container\n\n"
308 "%3$sProperties:%4$s\n"
309 " -S --slice=SLICE Place the container in the specified slice\n"
310 " --property=NAME=VALUE Set scope unit property\n"
311 " --register=BOOLEAN Register container as machine\n"
312 " --keep-unit Do not register a scope for the machine, reuse\n"
313 " the service unit nspawn is running in\n\n"
314 "%3$sUser Namespacing:%4$s\n"
315 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
316 " --private-users[=UIDBASE[:NUIDS]]\n"
317 " Similar, but with user configured UID/GID range\n"
318 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
319 "%3$sNetworking:%4$s\n"
320 " --private-network Disable network in container\n"
321 " --network-interface=INTERFACE\n"
322 " Assign an existing network interface to the\n"
323 " container\n"
324 " --network-macvlan=INTERFACE\n"
325 " Create a macvlan network interface based on an\n"
326 " existing network interface to the container\n"
327 " --network-ipvlan=INTERFACE\n"
328 " Create a ipvlan network interface based on an\n"
329 " existing network interface to the container\n"
330 " -n --network-veth Add a virtual Ethernet connection between host\n"
331 " and container\n"
332 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
333 " Add an additional virtual Ethernet link between\n"
334 " host and container\n"
335 " --network-bridge=INTERFACE\n"
336 " Add a virtual Ethernet connection to the container\n"
337 " and attach it to an existing bridge on the host\n"
338 " --network-zone=NAME Similar, but attach the new interface to an\n"
339 " an automatically managed bridge interface\n"
340 " --network-namespace-path=PATH\n"
341 " Set network namespace to the one represented by\n"
342 " the specified kernel namespace file node\n"
343 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
344 " Expose a container IP port on the host\n\n"
345 "%3$sSecurity:%4$s\n"
346 " --capability=CAP In addition to the default, retain specified\n"
347 " capability\n"
348 " --drop-capability=CAP Drop the specified capability from the default set\n"
349 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
350 " --system-call-filter=LIST|~LIST\n"
351 " Permit/prohibit specific system calls\n"
352 " -Z --selinux-context=SECLABEL\n"
353 " Set the SELinux security context to be used by\n"
354 " processes in the container\n"
355 " -L --selinux-apifs-context=SECLABEL\n"
356 " Set the SELinux security context to be used by\n"
357 " API/tmpfs file systems in the container\n\n"
358 "%3$sResources:%4$s\n"
359 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
360 " --oom-score-adjust=VALUE\n"
361 " Adjust the OOM score value for the payload\n"
362 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
363 " --personality=ARCH Pick personality for this container\n\n"
364 "%3$sIntegration:%4$s\n"
365 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
366 " --timezone=MODE Select mode of /etc/localtime initialization\n"
367 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
368 " host, try-guest, try-host\n"
369 " -j Equivalent to --link-journal=try-guest\n\n"
370 "%3$sMounts:%4$s\n"
371 " --bind=PATH[:PATH[:OPTIONS]]\n"
372 " Bind mount a file or directory from the host into\n"
373 " the container\n"
374 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
375 " Similar, but creates a read-only bind mount\n"
376 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
377 " it\n"
378 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
379 " --overlay=PATH[:PATH...]:PATH\n"
380 " Create an overlay mount from the host to \n"
381 " the container\n"
382 " --overlay-ro=PATH[:PATH...]:PATH\n"
383 " Similar, but creates a read-only overlay mount\n\n"
384 "%3$sInput/Output:%4$s\n"
385 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
386 " set up for the container.\n"
387 " -P --pipe Equivalent to --console=pipe\n"
388 "\nSee the %2$s for details.\n"
389 , program_invocation_short_name
390 , link
391 , ansi_underline(), ansi_normal());
392
393 return 0;
394 }
395
396 static int custom_mount_check_all(void) {
397 size_t i;
398
399 for (i = 0; i < arg_n_custom_mounts; i++) {
400 CustomMount *m = &arg_custom_mounts[i];
401
402 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
403 if (arg_userns_chown)
404 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
405 "--private-users-chown may not be combined with custom root mounts.");
406 else if (arg_uid_shift == UID_INVALID)
407 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
408 "--private-users with automatic UID shift may not be combined with custom root mounts.");
409 }
410 }
411
412 return 0;
413 }
414
415 static int detect_unified_cgroup_hierarchy_from_environment(void) {
416 const char *e;
417 int r;
418
419 /* Allow the user to control whether the unified hierarchy is used */
420 e = getenv("UNIFIED_CGROUP_HIERARCHY");
421 if (e) {
422 r = parse_boolean(e);
423 if (r < 0)
424 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
425 if (r > 0)
426 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
427 else
428 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
429 }
430
431 return 0;
432 }
433
434 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
435 int r;
436
437 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
438 * image actually supports. */
439 r = cg_all_unified();
440 if (r < 0)
441 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
442 if (r > 0) {
443 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
444 * routine only detects 231, so we'll have a false negative here for 230. */
445 r = systemd_installation_has_version(directory, 230);
446 if (r < 0)
447 return log_error_errno(r, "Failed to determine systemd version in container: %m");
448 if (r > 0)
449 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
450 else
451 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
452 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
453 /* Mixed cgroup hierarchy support was added in 233 */
454 r = systemd_installation_has_version(directory, 233);
455 if (r < 0)
456 return log_error_errno(r, "Failed to determine systemd version in container: %m");
457 if (r > 0)
458 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
459 else
460 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
461 } else
462 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
463
464 log_debug("Using %s hierarchy for container.",
465 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
466 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
467
468 return 0;
469 }
470
471 static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
472 int r;
473
474 r = getenv_bool(name);
475 if (r == -ENXIO)
476 return;
477 if (r < 0)
478 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
479
480 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
481 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
482 }
483
484 static void parse_mount_settings_env(void) {
485 const char *e;
486 int r;
487
488 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
489 if (r >= 0)
490 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
491 else if (r != -ENXIO)
492 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
493
494 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
495 if (!e)
496 return;
497
498 if (streq(e, "network")) {
499 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
500 return;
501 }
502
503 r = parse_boolean(e);
504 if (r < 0) {
505 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
506 return;
507 }
508
509 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
510 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
511 }
512
513 static void parse_environment(void) {
514 const char *e;
515 int r;
516
517 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
518 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
519 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
520 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
521
522 parse_mount_settings_env();
523
524 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
525 * even if it is supported. If not supported, it has no effect. */
526 if (!cg_ns_supported())
527 arg_use_cgns = false;
528 else {
529 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
530 if (r < 0) {
531 if (r != -ENXIO)
532 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
533
534 arg_use_cgns = true;
535 } else {
536 arg_use_cgns = r > 0;
537 arg_settings_mask |= SETTING_USE_CGNS;
538 }
539 }
540
541 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
542 if (e)
543 arg_container_service_name = e;
544
545 detect_unified_cgroup_hierarchy_from_environment();
546 }
547
548 static int parse_argv(int argc, char *argv[]) {
549 enum {
550 ARG_VERSION = 0x100,
551 ARG_PRIVATE_NETWORK,
552 ARG_UUID,
553 ARG_READ_ONLY,
554 ARG_CAPABILITY,
555 ARG_DROP_CAPABILITY,
556 ARG_LINK_JOURNAL,
557 ARG_BIND,
558 ARG_BIND_RO,
559 ARG_TMPFS,
560 ARG_OVERLAY,
561 ARG_OVERLAY_RO,
562 ARG_INACCESSIBLE,
563 ARG_SHARE_SYSTEM,
564 ARG_REGISTER,
565 ARG_KEEP_UNIT,
566 ARG_NETWORK_INTERFACE,
567 ARG_NETWORK_MACVLAN,
568 ARG_NETWORK_IPVLAN,
569 ARG_NETWORK_BRIDGE,
570 ARG_NETWORK_ZONE,
571 ARG_NETWORK_VETH_EXTRA,
572 ARG_NETWORK_NAMESPACE_PATH,
573 ARG_PERSONALITY,
574 ARG_VOLATILE,
575 ARG_TEMPLATE,
576 ARG_PROPERTY,
577 ARG_PRIVATE_USERS,
578 ARG_KILL_SIGNAL,
579 ARG_SETTINGS,
580 ARG_CHDIR,
581 ARG_PIVOT_ROOT,
582 ARG_PRIVATE_USERS_CHOWN,
583 ARG_NOTIFY_READY,
584 ARG_ROOT_HASH,
585 ARG_SYSTEM_CALL_FILTER,
586 ARG_RLIMIT,
587 ARG_HOSTNAME,
588 ARG_NO_NEW_PRIVILEGES,
589 ARG_OOM_SCORE_ADJUST,
590 ARG_CPU_AFFINITY,
591 ARG_RESOLV_CONF,
592 ARG_TIMEZONE,
593 ARG_CONSOLE,
594 ARG_PIPE,
595 ARG_OCI_BUNDLE,
596 ARG_NO_PAGER,
597 };
598
599 static const struct option options[] = {
600 { "help", no_argument, NULL, 'h' },
601 { "version", no_argument, NULL, ARG_VERSION },
602 { "directory", required_argument, NULL, 'D' },
603 { "template", required_argument, NULL, ARG_TEMPLATE },
604 { "ephemeral", no_argument, NULL, 'x' },
605 { "user", required_argument, NULL, 'u' },
606 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
607 { "as-pid2", no_argument, NULL, 'a' },
608 { "boot", no_argument, NULL, 'b' },
609 { "uuid", required_argument, NULL, ARG_UUID },
610 { "read-only", no_argument, NULL, ARG_READ_ONLY },
611 { "capability", required_argument, NULL, ARG_CAPABILITY },
612 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
613 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
614 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
615 { "bind", required_argument, NULL, ARG_BIND },
616 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
617 { "tmpfs", required_argument, NULL, ARG_TMPFS },
618 { "overlay", required_argument, NULL, ARG_OVERLAY },
619 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
620 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
621 { "machine", required_argument, NULL, 'M' },
622 { "hostname", required_argument, NULL, ARG_HOSTNAME },
623 { "slice", required_argument, NULL, 'S' },
624 { "setenv", required_argument, NULL, 'E' },
625 { "selinux-context", required_argument, NULL, 'Z' },
626 { "selinux-apifs-context", required_argument, NULL, 'L' },
627 { "quiet", no_argument, NULL, 'q' },
628 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
629 { "register", required_argument, NULL, ARG_REGISTER },
630 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
631 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
632 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
633 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
634 { "network-veth", no_argument, NULL, 'n' },
635 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
636 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
637 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
638 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
639 { "personality", required_argument, NULL, ARG_PERSONALITY },
640 { "image", required_argument, NULL, 'i' },
641 { "volatile", optional_argument, NULL, ARG_VOLATILE },
642 { "port", required_argument, NULL, 'p' },
643 { "property", required_argument, NULL, ARG_PROPERTY },
644 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
645 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
646 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
647 { "settings", required_argument, NULL, ARG_SETTINGS },
648 { "chdir", required_argument, NULL, ARG_CHDIR },
649 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
650 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
651 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
652 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
653 { "rlimit", required_argument, NULL, ARG_RLIMIT },
654 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
655 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
656 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
657 { "timezone", required_argument, NULL, ARG_TIMEZONE },
658 { "console", required_argument, NULL, ARG_CONSOLE },
659 { "pipe", no_argument, NULL, ARG_PIPE },
660 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
661 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
662 {}
663 };
664
665 int c, r;
666 const char *p;
667 uint64_t plus = 0, minus = 0;
668 bool mask_all_settings = false, mask_no_settings = false;
669
670 assert(argc >= 0);
671 assert(argv);
672
673 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
674 switch (c) {
675
676 case 'h':
677 return help();
678
679 case ARG_VERSION:
680 return version();
681
682 case 'D':
683 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
684 if (r < 0)
685 return r;
686
687 arg_settings_mask |= SETTING_DIRECTORY;
688 break;
689
690 case ARG_TEMPLATE:
691 r = parse_path_argument_and_warn(optarg, false, &arg_template);
692 if (r < 0)
693 return r;
694
695 arg_settings_mask |= SETTING_DIRECTORY;
696 break;
697
698 case 'i':
699 r = parse_path_argument_and_warn(optarg, false, &arg_image);
700 if (r < 0)
701 return r;
702
703 arg_settings_mask |= SETTING_DIRECTORY;
704 break;
705
706 case ARG_OCI_BUNDLE:
707 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
708 if (r < 0)
709 return r;
710
711 break;
712
713 case 'x':
714 arg_ephemeral = true;
715 arg_settings_mask |= SETTING_EPHEMERAL;
716 break;
717
718 case 'u':
719 r = free_and_strdup(&arg_user, optarg);
720 if (r < 0)
721 return log_oom();
722
723 arg_settings_mask |= SETTING_USER;
724 break;
725
726 case ARG_NETWORK_ZONE: {
727 char *j;
728
729 j = strappend("vz-", optarg);
730 if (!j)
731 return log_oom();
732
733 if (!ifname_valid(j)) {
734 log_error("Network zone name not valid: %s", j);
735 free(j);
736 return -EINVAL;
737 }
738
739 free_and_replace(arg_network_zone, j);
740
741 arg_network_veth = true;
742 arg_private_network = true;
743 arg_settings_mask |= SETTING_NETWORK;
744 break;
745 }
746
747 case ARG_NETWORK_BRIDGE:
748
749 if (!ifname_valid(optarg))
750 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
751 "Bridge interface name not valid: %s", optarg);
752
753 r = free_and_strdup(&arg_network_bridge, optarg);
754 if (r < 0)
755 return log_oom();
756
757 _fallthrough_;
758 case 'n':
759 arg_network_veth = true;
760 arg_private_network = true;
761 arg_settings_mask |= SETTING_NETWORK;
762 break;
763
764 case ARG_NETWORK_VETH_EXTRA:
765 r = veth_extra_parse(&arg_network_veth_extra, optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
768
769 arg_private_network = true;
770 arg_settings_mask |= SETTING_NETWORK;
771 break;
772
773 case ARG_NETWORK_INTERFACE:
774 if (!ifname_valid(optarg))
775 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
776 "Network interface name not valid: %s", optarg);
777
778 if (strv_extend(&arg_network_interfaces, optarg) < 0)
779 return log_oom();
780
781 arg_private_network = true;
782 arg_settings_mask |= SETTING_NETWORK;
783 break;
784
785 case ARG_NETWORK_MACVLAN:
786
787 if (!ifname_valid(optarg))
788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
789 "MACVLAN network interface name not valid: %s", optarg);
790
791 if (strv_extend(&arg_network_macvlan, optarg) < 0)
792 return log_oom();
793
794 arg_private_network = true;
795 arg_settings_mask |= SETTING_NETWORK;
796 break;
797
798 case ARG_NETWORK_IPVLAN:
799
800 if (!ifname_valid(optarg))
801 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
802 "IPVLAN network interface name not valid: %s", optarg);
803
804 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
805 return log_oom();
806
807 _fallthrough_;
808 case ARG_PRIVATE_NETWORK:
809 arg_private_network = true;
810 arg_settings_mask |= SETTING_NETWORK;
811 break;
812
813 case ARG_NETWORK_NAMESPACE_PATH:
814 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
815 if (r < 0)
816 return r;
817
818 arg_settings_mask |= SETTING_NETWORK;
819 break;
820
821 case 'b':
822 if (arg_start_mode == START_PID2)
823 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
824 "--boot and --as-pid2 may not be combined.");
825
826 arg_start_mode = START_BOOT;
827 arg_settings_mask |= SETTING_START_MODE;
828 break;
829
830 case 'a':
831 if (arg_start_mode == START_BOOT)
832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
833 "--boot and --as-pid2 may not be combined.");
834
835 arg_start_mode = START_PID2;
836 arg_settings_mask |= SETTING_START_MODE;
837 break;
838
839 case ARG_UUID:
840 r = sd_id128_from_string(optarg, &arg_uuid);
841 if (r < 0)
842 return log_error_errno(r, "Invalid UUID: %s", optarg);
843
844 if (sd_id128_is_null(arg_uuid))
845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
846 "Machine UUID may not be all zeroes.");
847
848 arg_settings_mask |= SETTING_MACHINE_ID;
849 break;
850
851 case 'S':
852 r = free_and_strdup(&arg_slice, optarg);
853 if (r < 0)
854 return log_oom();
855
856 arg_settings_mask |= SETTING_SLICE;
857 break;
858
859 case 'M':
860 if (isempty(optarg))
861 arg_machine = mfree(arg_machine);
862 else {
863 if (!machine_name_is_valid(optarg))
864 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
865 "Invalid machine name: %s", optarg);
866
867 r = free_and_strdup(&arg_machine, optarg);
868 if (r < 0)
869 return log_oom();
870 }
871 break;
872
873 case ARG_HOSTNAME:
874 if (isempty(optarg))
875 arg_hostname = mfree(arg_hostname);
876 else {
877 if (!hostname_is_valid(optarg, false))
878 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
879 "Invalid hostname: %s", optarg);
880
881 r = free_and_strdup(&arg_hostname, optarg);
882 if (r < 0)
883 return log_oom();
884 }
885
886 arg_settings_mask |= SETTING_HOSTNAME;
887 break;
888
889 case 'Z':
890 arg_selinux_context = optarg;
891 break;
892
893 case 'L':
894 arg_selinux_apifs_context = optarg;
895 break;
896
897 case ARG_READ_ONLY:
898 arg_read_only = true;
899 arg_settings_mask |= SETTING_READ_ONLY;
900 break;
901
902 case ARG_CAPABILITY:
903 case ARG_DROP_CAPABILITY: {
904 p = optarg;
905 for (;;) {
906 _cleanup_free_ char *t = NULL;
907
908 r = extract_first_word(&p, &t, ",", 0);
909 if (r < 0)
910 return log_error_errno(r, "Failed to parse capability %s.", t);
911 if (r == 0)
912 break;
913
914 if (streq(t, "all")) {
915 if (c == ARG_CAPABILITY)
916 plus = (uint64_t) -1;
917 else
918 minus = (uint64_t) -1;
919 } else {
920 r = capability_from_name(t);
921 if (r < 0)
922 return log_error_errno(r, "Failed to parse capability %s.", t);
923
924 if (c == ARG_CAPABILITY)
925 plus |= 1ULL << r;
926 else
927 minus |= 1ULL << r;
928 }
929 }
930
931 arg_settings_mask |= SETTING_CAPABILITY;
932 break;
933 }
934
935 case ARG_NO_NEW_PRIVILEGES:
936 r = parse_boolean(optarg);
937 if (r < 0)
938 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
939
940 arg_no_new_privileges = r;
941 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
942 break;
943
944 case 'j':
945 arg_link_journal = LINK_GUEST;
946 arg_link_journal_try = true;
947 arg_settings_mask |= SETTING_LINK_JOURNAL;
948 break;
949
950 case ARG_LINK_JOURNAL:
951 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
952 if (r < 0) {
953 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
954 return -EINVAL;
955 }
956
957 arg_settings_mask |= SETTING_LINK_JOURNAL;
958 break;
959
960 case ARG_BIND:
961 case ARG_BIND_RO:
962 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
963 if (r < 0)
964 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
965
966 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
967 break;
968
969 case ARG_TMPFS:
970 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
971 if (r < 0)
972 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
973
974 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
975 break;
976
977 case ARG_OVERLAY:
978 case ARG_OVERLAY_RO:
979 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
980 if (r == -EADDRNOTAVAIL)
981 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
982 if (r < 0)
983 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
984
985 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
986 break;
987
988 case ARG_INACCESSIBLE:
989 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
990 if (r < 0)
991 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
992
993 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
994 break;
995
996 case 'E': {
997 char **n;
998
999 if (!env_assignment_is_valid(optarg))
1000 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1001 "Environment variable assignment '%s' is not valid.", optarg);
1002
1003 n = strv_env_set(arg_setenv, optarg);
1004 if (!n)
1005 return log_oom();
1006
1007 strv_free_and_replace(arg_setenv, n);
1008 arg_settings_mask |= SETTING_ENVIRONMENT;
1009 break;
1010 }
1011
1012 case 'q':
1013 arg_quiet = true;
1014 break;
1015
1016 case ARG_SHARE_SYSTEM:
1017 /* We don't officially support this anymore, except for compat reasons. People should use the
1018 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1019 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1020 arg_clone_ns_flags = 0;
1021 break;
1022
1023 case ARG_REGISTER:
1024 r = parse_boolean(optarg);
1025 if (r < 0) {
1026 log_error("Failed to parse --register= argument: %s", optarg);
1027 return r;
1028 }
1029
1030 arg_register = r;
1031 break;
1032
1033 case ARG_KEEP_UNIT:
1034 arg_keep_unit = true;
1035 break;
1036
1037 case ARG_PERSONALITY:
1038
1039 arg_personality = personality_from_string(optarg);
1040 if (arg_personality == PERSONALITY_INVALID)
1041 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1042 "Unknown or unsupported personality '%s'.", optarg);
1043
1044 arg_settings_mask |= SETTING_PERSONALITY;
1045 break;
1046
1047 case ARG_VOLATILE:
1048
1049 if (!optarg)
1050 arg_volatile_mode = VOLATILE_YES;
1051 else if (streq(optarg, "help")) {
1052 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1053 return 0;
1054 } else {
1055 VolatileMode m;
1056
1057 m = volatile_mode_from_string(optarg);
1058 if (m < 0)
1059 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1060 "Failed to parse --volatile= argument: %s", optarg);
1061 else
1062 arg_volatile_mode = m;
1063 }
1064
1065 arg_settings_mask |= SETTING_VOLATILE_MODE;
1066 break;
1067
1068 case 'p':
1069 r = expose_port_parse(&arg_expose_ports, optarg);
1070 if (r == -EEXIST)
1071 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1072 if (r < 0)
1073 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1074
1075 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1076 break;
1077
1078 case ARG_PROPERTY:
1079 if (strv_extend(&arg_property, optarg) < 0)
1080 return log_oom();
1081
1082 break;
1083
1084 case ARG_PRIVATE_USERS: {
1085 int boolean = -1;
1086
1087 if (!optarg)
1088 boolean = true;
1089 else if (!in_charset(optarg, DIGITS))
1090 /* do *not* parse numbers as booleans */
1091 boolean = parse_boolean(optarg);
1092
1093 if (boolean == false) {
1094 /* no: User namespacing off */
1095 arg_userns_mode = USER_NAMESPACE_NO;
1096 arg_uid_shift = UID_INVALID;
1097 arg_uid_range = UINT32_C(0x10000);
1098 } else if (boolean == true) {
1099 /* yes: User namespacing on, UID range is read from root dir */
1100 arg_userns_mode = USER_NAMESPACE_FIXED;
1101 arg_uid_shift = UID_INVALID;
1102 arg_uid_range = UINT32_C(0x10000);
1103 } else if (streq(optarg, "pick")) {
1104 /* pick: User namespacing on, UID range is picked randomly */
1105 arg_userns_mode = USER_NAMESPACE_PICK;
1106 arg_uid_shift = UID_INVALID;
1107 arg_uid_range = UINT32_C(0x10000);
1108 } else {
1109 _cleanup_free_ char *buffer = NULL;
1110 const char *range, *shift;
1111
1112 /* anything else: User namespacing on, UID range is explicitly configured */
1113
1114 range = strchr(optarg, ':');
1115 if (range) {
1116 buffer = strndup(optarg, range - optarg);
1117 if (!buffer)
1118 return log_oom();
1119 shift = buffer;
1120
1121 range++;
1122 r = safe_atou32(range, &arg_uid_range);
1123 if (r < 0)
1124 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1125 } else
1126 shift = optarg;
1127
1128 r = parse_uid(shift, &arg_uid_shift);
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1131
1132 arg_userns_mode = USER_NAMESPACE_FIXED;
1133 }
1134
1135 if (arg_uid_range <= 0)
1136 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1137 "UID range cannot be 0.");
1138
1139 arg_settings_mask |= SETTING_USERNS;
1140 break;
1141 }
1142
1143 case 'U':
1144 if (userns_supported()) {
1145 arg_userns_mode = USER_NAMESPACE_PICK;
1146 arg_uid_shift = UID_INVALID;
1147 arg_uid_range = UINT32_C(0x10000);
1148
1149 arg_settings_mask |= SETTING_USERNS;
1150 }
1151
1152 break;
1153
1154 case ARG_PRIVATE_USERS_CHOWN:
1155 arg_userns_chown = true;
1156
1157 arg_settings_mask |= SETTING_USERNS;
1158 break;
1159
1160 case ARG_KILL_SIGNAL:
1161 if (streq(optarg, "help")) {
1162 DUMP_STRING_TABLE(signal, int, _NSIG);
1163 return 0;
1164 }
1165
1166 arg_kill_signal = signal_from_string(optarg);
1167 if (arg_kill_signal < 0)
1168 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1169 "Cannot parse signal: %s", optarg);
1170
1171 arg_settings_mask |= SETTING_KILL_SIGNAL;
1172 break;
1173
1174 case ARG_SETTINGS:
1175
1176 /* no → do not read files
1177 * yes → read files, do not override cmdline, trust only subset
1178 * override → read files, override cmdline, trust only subset
1179 * trusted → read files, do not override cmdline, trust all
1180 */
1181
1182 r = parse_boolean(optarg);
1183 if (r < 0) {
1184 if (streq(optarg, "trusted")) {
1185 mask_all_settings = false;
1186 mask_no_settings = false;
1187 arg_settings_trusted = true;
1188
1189 } else if (streq(optarg, "override")) {
1190 mask_all_settings = false;
1191 mask_no_settings = true;
1192 arg_settings_trusted = -1;
1193 } else
1194 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1195 } else if (r > 0) {
1196 /* yes */
1197 mask_all_settings = false;
1198 mask_no_settings = false;
1199 arg_settings_trusted = -1;
1200 } else {
1201 /* no */
1202 mask_all_settings = true;
1203 mask_no_settings = false;
1204 arg_settings_trusted = false;
1205 }
1206
1207 break;
1208
1209 case ARG_CHDIR:
1210 if (!path_is_absolute(optarg))
1211 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1212 "Working directory %s is not an absolute path.", optarg);
1213
1214 r = free_and_strdup(&arg_chdir, optarg);
1215 if (r < 0)
1216 return log_oom();
1217
1218 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1219 break;
1220
1221 case ARG_PIVOT_ROOT:
1222 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1225
1226 arg_settings_mask |= SETTING_PIVOT_ROOT;
1227 break;
1228
1229 case ARG_NOTIFY_READY:
1230 r = parse_boolean(optarg);
1231 if (r < 0)
1232 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1233 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1234 arg_notify_ready = r;
1235 arg_settings_mask |= SETTING_NOTIFY_READY;
1236 break;
1237
1238 case ARG_ROOT_HASH: {
1239 void *k;
1240 size_t l;
1241
1242 r = unhexmem(optarg, strlen(optarg), &k, &l);
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1245 if (l < sizeof(sd_id128_t)) {
1246 log_error("Root hash must be at least 128bit long: %s", optarg);
1247 free(k);
1248 return -EINVAL;
1249 }
1250
1251 free(arg_root_hash);
1252 arg_root_hash = k;
1253 arg_root_hash_size = l;
1254 break;
1255 }
1256
1257 case ARG_SYSTEM_CALL_FILTER: {
1258 bool negative;
1259 const char *items;
1260
1261 negative = optarg[0] == '~';
1262 items = negative ? optarg + 1 : optarg;
1263
1264 for (;;) {
1265 _cleanup_free_ char *word = NULL;
1266
1267 r = extract_first_word(&items, &word, NULL, 0);
1268 if (r == 0)
1269 break;
1270 if (r == -ENOMEM)
1271 return log_oom();
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse system call filter: %m");
1274
1275 if (negative)
1276 r = strv_extend(&arg_syscall_blacklist, word);
1277 else
1278 r = strv_extend(&arg_syscall_whitelist, word);
1279 if (r < 0)
1280 return log_oom();
1281 }
1282
1283 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1284 break;
1285 }
1286
1287 case ARG_RLIMIT: {
1288 const char *eq;
1289 char *name;
1290 int rl;
1291
1292 if (streq(optarg, "help")) {
1293 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1294 return 0;
1295 }
1296
1297 eq = strchr(optarg, '=');
1298 if (!eq)
1299 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1300 "--rlimit= expects an '=' assignment.");
1301
1302 name = strndup(optarg, eq - optarg);
1303 if (!name)
1304 return log_oom();
1305
1306 rl = rlimit_from_string_harder(name);
1307 if (rl < 0)
1308 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1309 "Unknown resource limit: %s", name);
1310
1311 if (!arg_rlimit[rl]) {
1312 arg_rlimit[rl] = new0(struct rlimit, 1);
1313 if (!arg_rlimit[rl])
1314 return log_oom();
1315 }
1316
1317 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1318 if (r < 0)
1319 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1320
1321 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1322 break;
1323 }
1324
1325 case ARG_OOM_SCORE_ADJUST:
1326 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1327 if (r < 0)
1328 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1329
1330 arg_oom_score_adjust_set = true;
1331 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1332 break;
1333
1334 case ARG_CPU_AFFINITY: {
1335 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1336
1337 r = parse_cpu_set(optarg, &cpuset);
1338 if (r < 0)
1339 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1340
1341 if (arg_cpuset)
1342 CPU_FREE(arg_cpuset);
1343
1344 arg_cpuset = TAKE_PTR(cpuset);
1345 arg_cpuset_ncpus = r;
1346 arg_settings_mask |= SETTING_CPU_AFFINITY;
1347 break;
1348 }
1349
1350 case ARG_RESOLV_CONF:
1351 if (streq(optarg, "help")) {
1352 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1353 return 0;
1354 }
1355
1356 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1357 if (arg_resolv_conf < 0)
1358 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1359 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1360
1361 arg_settings_mask |= SETTING_RESOLV_CONF;
1362 break;
1363
1364 case ARG_TIMEZONE:
1365 if (streq(optarg, "help")) {
1366 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1367 return 0;
1368 }
1369
1370 arg_timezone = timezone_mode_from_string(optarg);
1371 if (arg_timezone < 0)
1372 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1373 "Failed to parse /etc/localtime mode: %s", optarg);
1374
1375 arg_settings_mask |= SETTING_TIMEZONE;
1376 break;
1377
1378 case ARG_CONSOLE:
1379 if (streq(optarg, "interactive"))
1380 arg_console_mode = CONSOLE_INTERACTIVE;
1381 else if (streq(optarg, "read-only"))
1382 arg_console_mode = CONSOLE_READ_ONLY;
1383 else if (streq(optarg, "passive"))
1384 arg_console_mode = CONSOLE_PASSIVE;
1385 else if (streq(optarg, "pipe"))
1386 arg_console_mode = CONSOLE_PIPE;
1387 else if (streq(optarg, "help"))
1388 puts("interactive\n"
1389 "read-only\n"
1390 "passive\n"
1391 "pipe");
1392 else {
1393 log_error("Unknown console mode: %s", optarg);
1394 return -EINVAL;
1395 }
1396
1397 arg_settings_mask |= SETTING_CONSOLE_MODE;
1398 break;
1399
1400 case 'P':
1401 case ARG_PIPE:
1402 arg_console_mode = CONSOLE_PIPE;
1403 arg_settings_mask |= SETTING_CONSOLE_MODE;
1404 break;
1405
1406 case ARG_NO_PAGER:
1407 arg_pager_flags |= PAGER_DISABLE;
1408 break;
1409
1410 case '?':
1411 return -EINVAL;
1412
1413 default:
1414 assert_not_reached("Unhandled option");
1415 }
1416
1417 if (argc > optind) {
1418 strv_free(arg_parameters);
1419 arg_parameters = strv_copy(argv + optind);
1420 if (!arg_parameters)
1421 return log_oom();
1422
1423 arg_settings_mask |= SETTING_START_MODE;
1424 }
1425
1426 if (arg_ephemeral && arg_template && !arg_directory)
1427 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1428 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1429 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1430 * --directory=". */
1431 arg_directory = TAKE_PTR(arg_template);
1432
1433 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1434
1435 /* Make sure to parse environment before we reset the settings mask below */
1436 parse_environment();
1437
1438 /* Load all settings from .nspawn files */
1439 if (mask_no_settings)
1440 arg_settings_mask = 0;
1441
1442 /* Don't load any settings from .nspawn files */
1443 if (mask_all_settings)
1444 arg_settings_mask = _SETTINGS_MASK_ALL;
1445
1446 return 1;
1447 }
1448
1449 static int verify_arguments(void) {
1450 int r;
1451
1452 if (arg_userns_mode != USER_NAMESPACE_NO)
1453 arg_mount_settings |= MOUNT_USE_USERNS;
1454
1455 if (arg_private_network)
1456 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1457
1458 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1459 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1460 arg_register = false;
1461 if (arg_start_mode != START_PID1)
1462 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1463 }
1464
1465 if (arg_userns_mode == USER_NAMESPACE_PICK)
1466 arg_userns_chown = true;
1467
1468 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1469 arg_kill_signal = SIGRTMIN+3;
1470
1471 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1472 arg_read_only = true;
1473
1474 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1475 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1476 * The latter is not technically a user session, but we don't need to labour the point. */
1477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1478
1479 if (arg_directory && arg_image)
1480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1481
1482 if (arg_template && arg_image)
1483 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1484
1485 if (arg_template && !(arg_directory || arg_machine))
1486 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1487
1488 if (arg_ephemeral && arg_template)
1489 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1490
1491 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1492 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1493
1494 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1495 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1496
1497 if (arg_userns_chown && arg_read_only)
1498 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1499 "--read-only and --private-users-chown may not be combined.");
1500
1501 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1502 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1503 * copy-up (in case of overlay) making the entire excercise pointless. */
1504 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1505 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1506
1507 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1508 * to avoid conflicts between different network options. */
1509 if (arg_network_namespace_path &&
1510 (arg_network_interfaces || arg_network_macvlan ||
1511 arg_network_ipvlan || arg_network_veth_extra ||
1512 arg_network_bridge || arg_network_zone ||
1513 arg_network_veth || arg_private_network))
1514 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1515
1516 if (arg_network_bridge && arg_network_zone)
1517 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1518 "--network-bridge= and --network-zone= may not be combined.");
1519
1520 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1521 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1522
1523 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1524 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1525
1526 if (arg_expose_ports && !arg_private_network)
1527 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1528
1529 #if ! HAVE_LIBIPTC
1530 if (arg_expose_ports)
1531 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1532 #endif
1533
1534 r = custom_mount_check_all();
1535 if (r < 0)
1536 return r;
1537
1538 return 0;
1539 }
1540
1541 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1542 assert(p);
1543
1544 if (arg_userns_mode == USER_NAMESPACE_NO)
1545 return 0;
1546
1547 if (uid == UID_INVALID && gid == GID_INVALID)
1548 return 0;
1549
1550 if (uid != UID_INVALID) {
1551 uid += arg_uid_shift;
1552
1553 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1554 return -EOVERFLOW;
1555 }
1556
1557 if (gid != GID_INVALID) {
1558 gid += (gid_t) arg_uid_shift;
1559
1560 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1561 return -EOVERFLOW;
1562 }
1563
1564 if (lchown(p, uid, gid) < 0)
1565 return -errno;
1566
1567 return 0;
1568 }
1569
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, adjusts its
 * ownership via userns_lchown(). An already existing directory is left untouched and counts as success. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int k;

        full = prefix_roota(root, path);

        k = mkdir_errno_wrapper(full, mode);
        if (k < 0)
                return k == -EEXIST ? 0 : k;

        return userns_lchown(full, uid, gid);
}
1583
/* Matches 'path' against the two zoneinfo directory prefixes (relative and absolute form) and hands back
 * the result of PATH_STARTSWITH_SET(). Callers in this file treat NULL as "not a zoneinfo path" and use a
 * non-NULL result as the timezone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path,
                                   "../usr/share/zoneinfo/",
                                   "/usr/share/zoneinfo/");
        return zone;
}
1590
1591 static bool etc_writable(void) {
1592 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1593 }
1594
1595 static int setup_timezone(const char *dest) {
1596 _cleanup_free_ char *p = NULL, *etc = NULL;
1597 const char *where, *check;
1598 TimezoneMode m;
1599 int r;
1600
1601 assert(dest);
1602
1603 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1604 r = readlink_malloc("/etc/localtime", &p);
1605 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1606 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1607 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1608 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1609 else if (r < 0) {
1610 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1611 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1612 * file.
1613 *
1614 * Example:
1615 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1616 */
1617 return 0;
1618 } else if (arg_timezone == TIMEZONE_AUTO)
1619 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1620 else
1621 m = arg_timezone;
1622 } else
1623 m = arg_timezone;
1624
1625 if (m == TIMEZONE_OFF)
1626 return 0;
1627
1628 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1629 if (r < 0) {
1630 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1631 return 0;
1632 }
1633
1634 where = strjoina(etc, "/localtime");
1635
1636 switch (m) {
1637
1638 case TIMEZONE_DELETE:
1639 if (unlink(where) < 0)
1640 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1641
1642 return 0;
1643
1644 case TIMEZONE_SYMLINK: {
1645 _cleanup_free_ char *q = NULL;
1646 const char *z, *what;
1647
1648 z = timezone_from_path(p);
1649 if (!z) {
1650 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1651 return 0;
1652 }
1653
1654 r = readlink_malloc(where, &q);
1655 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1656 return 0; /* Already pointing to the right place? Then do nothing .. */
1657
1658 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1659 r = chase_symlinks(check, dest, 0, NULL);
1660 if (r < 0)
1661 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1662 else {
1663 if (unlink(where) < 0 && errno != ENOENT) {
1664 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1665 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1666 return 0;
1667 }
1668
1669 what = strjoina("../usr/share/zoneinfo/", z);
1670 if (symlink(what, where) < 0) {
1671 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1672 errno, "Failed to correct timezone of container, ignoring: %m");
1673 return 0;
1674 }
1675
1676 break;
1677 }
1678
1679 _fallthrough_;
1680 }
1681
1682 case TIMEZONE_BIND: {
1683 _cleanup_free_ char *resolved = NULL;
1684 int found;
1685
1686 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1687 if (found < 0) {
1688 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1689 return 0;
1690 }
1691
1692 if (found == 0) /* missing? */
1693 (void) touch(resolved);
1694
1695 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1696 if (r >= 0)
1697 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1698
1699 _fallthrough_;
1700 }
1701
1702 case TIMEZONE_COPY:
1703 /* If mounting failed, try to copy */
1704 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1705 if (r < 0) {
1706 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1707 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1708 return 0;
1709 }
1710
1711 break;
1712
1713 default:
1714 assert_not_reached("unexpected mode");
1715 }
1716
1717 /* Fix permissions of the symlink or file copy we just created */
1718 r = userns_lchown(where, 0, 0);
1719 if (r < 0)
1720 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1721
1722 return 0;
1723 }
1724
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a
 * negative error value (logged at debug level) if the check itself failed for another reason. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1737
1738 static int resolved_listening(void) {
1739 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1740 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1741 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1742 int r;
1743
1744 /* Check if resolved is listening */
1745
1746 r = sd_bus_open_system(&bus);
1747 if (r < 0)
1748 return log_debug_errno(r, "Failed to open system bus: %m");
1749
1750 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1751 if (r < 0)
1752 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1753 if (r == 0)
1754 return 0;
1755
1756 r = sd_bus_get_property_string(bus,
1757 "org.freedesktop.resolve1",
1758 "/org/freedesktop/resolve1",
1759 "org.freedesktop.resolve1.Manager",
1760 "DNSStubListener",
1761 &error,
1762 &dns_stub_listener_mode);
1763 if (r < 0)
1764 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1765
1766 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1767 }
1768
1769 static int setup_resolv_conf(const char *dest) {
1770 _cleanup_free_ char *etc = NULL;
1771 const char *where, *what;
1772 ResolvConfMode m;
1773 int r;
1774
1775 assert(dest);
1776
1777 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1778 if (arg_private_network)
1779 m = RESOLV_CONF_OFF;
1780 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1781 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
1782 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1783 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
1784 else
1785 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
1786 } else
1787 m = arg_resolv_conf;
1788
1789 if (m == RESOLV_CONF_OFF)
1790 return 0;
1791
1792 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1793 if (r < 0) {
1794 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1795 return 0;
1796 }
1797
1798 where = strjoina(etc, "/resolv.conf");
1799
1800 if (m == RESOLV_CONF_DELETE) {
1801 if (unlink(where) < 0)
1802 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1803
1804 return 0;
1805 }
1806
1807 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1808 what = STATIC_RESOLV_CONF;
1809 else
1810 what = "/etc/resolv.conf";
1811
1812 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1813 _cleanup_free_ char *resolved = NULL;
1814 int found;
1815
1816 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1817 if (found < 0) {
1818 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1819 return 0;
1820 }
1821
1822 if (found == 0) /* missing? */
1823 (void) touch(resolved);
1824
1825 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1826 if (r >= 0)
1827 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1828 }
1829
1830 /* If that didn't work, let's copy the file */
1831 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
1832 if (r < 0) {
1833 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1834 * resolved or something similar runs inside and the symlink points there.
1835 *
1836 * If the disk image is read-only, there's also no point in complaining.
1837 */
1838 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1839 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1840 return 0;
1841 }
1842
1843 r = userns_lchown(where, 0, 0);
1844 if (r < 0)
1845 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1846
1847 return 0;
1848 }
1849
1850 static int setup_boot_id(void) {
1851 _cleanup_(unlink_and_freep) char *from = NULL;
1852 _cleanup_free_ char *path = NULL;
1853 sd_id128_t rnd = SD_ID128_NULL;
1854 const char *to;
1855 int r;
1856
1857 /* Generate a new randomized boot ID, so that each boot-up of
1858 * the container gets a new one */
1859
1860 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1861 if (r < 0)
1862 return log_error_errno(r, "Failed to generate random boot ID path: %m");
1863
1864 r = sd_id128_randomize(&rnd);
1865 if (r < 0)
1866 return log_error_errno(r, "Failed to generate random boot id: %m");
1867
1868 r = id128_write(path, ID128_UUID, rnd, false);
1869 if (r < 0)
1870 return log_error_errno(r, "Failed to write boot id: %m");
1871
1872 from = TAKE_PTR(path);
1873 to = "/proc/sys/kernel/random/boot_id";
1874
1875 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1876 if (r < 0)
1877 return r;
1878
1879 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1880 }
1881
1882 static int copy_devnodes(const char *dest) {
1883 static const char devnodes[] =
1884 "null\0"
1885 "zero\0"
1886 "full\0"
1887 "random\0"
1888 "urandom\0"
1889 "tty\0"
1890 "net/tun\0";
1891
1892 _cleanup_umask_ mode_t u;
1893 const char *d;
1894 int r = 0;
1895
1896 assert(dest);
1897
1898 u = umask(0000);
1899
1900 /* Create /dev/net, so that we can create /dev/net/tun in it */
1901 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1902 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1903
1904 NULSTR_FOREACH(d, devnodes) {
1905 _cleanup_free_ char *from = NULL, *to = NULL;
1906 struct stat st;
1907
1908 from = strappend("/dev/", d);
1909 if (!from)
1910 return log_oom();
1911
1912 to = prefix_root(dest, from);
1913 if (!to)
1914 return log_oom();
1915
1916 if (stat(from, &st) < 0) {
1917
1918 if (errno != ENOENT)
1919 return log_error_errno(errno, "Failed to stat %s: %m", from);
1920
1921 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1922 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1923 "%s is not a char or block device, cannot copy.", from);
1924 else {
1925 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1926
1927 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1928 /* Explicitly warn the user when /dev is already populated. */
1929 if (errno == EEXIST)
1930 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1931 if (errno != EPERM)
1932 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1933
1934 /* Some systems abusively restrict mknod but allow bind mounts. */
1935 r = touch(to);
1936 if (r < 0)
1937 return log_error_errno(r, "touch (%s) failed: %m", to);
1938 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1939 if (r < 0)
1940 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1941 }
1942
1943 r = userns_lchown(to, 0, 0);
1944 if (r < 0)
1945 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1946
1947 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1948 if (!dn)
1949 return log_oom();
1950
1951 r = userns_mkdir(dest, dn, 0755, 0, 0);
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to create '%s': %m", dn);
1954
1955 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1956 return log_oom();
1957
1958 prefixed = prefix_root(dest, sl);
1959 if (!prefixed)
1960 return log_oom();
1961
1962 t = strjoin("../", d);
1963 if (!t)
1964 return log_oom();
1965
1966 if (symlink(t, prefixed) < 0)
1967 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
1968 }
1969 }
1970
1971 return r;
1972 }
1973
1974 static int make_extra_nodes(const char *dest) {
1975 _cleanup_umask_ mode_t u;
1976 size_t i;
1977 int r;
1978
1979 u = umask(0000);
1980
1981 for (i = 0; i < arg_n_extra_nodes; i++) {
1982 _cleanup_free_ char *path = NULL;
1983 DeviceNode *n = arg_extra_nodes + i;
1984
1985 path = prefix_root(dest, n->path);
1986 if (!path)
1987 return log_oom();
1988
1989 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
1990 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
1991
1992 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
1993 if (r < 0)
1994 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
1995 }
1996
1997 return 0;
1998 }
1999
2000 static int setup_pts(const char *dest) {
2001 _cleanup_free_ char *options = NULL;
2002 const char *p;
2003 int r;
2004
2005 #if HAVE_SELINUX
2006 if (arg_selinux_apifs_context)
2007 (void) asprintf(&options,
2008 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2009 arg_uid_shift + TTY_GID,
2010 arg_selinux_apifs_context);
2011 else
2012 #endif
2013 (void) asprintf(&options,
2014 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2015 arg_uid_shift + TTY_GID);
2016
2017 if (!options)
2018 return log_oom();
2019
2020 /* Mount /dev/pts itself */
2021 p = prefix_roota(dest, "/dev/pts");
2022 r = mkdir_errno_wrapper(p, 0755);
2023 if (r < 0)
2024 return log_error_errno(r, "Failed to create /dev/pts: %m");
2025
2026 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2027 if (r < 0)
2028 return r;
2029 r = userns_lchown(p, 0, 0);
2030 if (r < 0)
2031 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2032
2033 /* Create /dev/ptmx symlink */
2034 p = prefix_roota(dest, "/dev/ptmx");
2035 if (symlink("pts/ptmx", p) < 0)
2036 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2037 r = userns_lchown(p, 0, 0);
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2040
2041 /* And fix /dev/pts/ptmx ownership */
2042 p = prefix_roota(dest, "/dev/pts/ptmx");
2043 r = userns_lchown(p, 0, 0);
2044 if (r < 0)
2045 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2046
2047 return 0;
2048 }
2049
2050 static int setup_dev_console(const char *dest, const char *console) {
2051 _cleanup_umask_ mode_t u;
2052 const char *to;
2053 int r;
2054
2055 assert(dest);
2056
2057 u = umask(0000);
2058
2059 if (!console)
2060 return 0;
2061
2062 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
2063 if (r < 0)
2064 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
2065
2066 /* We need to bind mount the right tty to /dev/console since
2067 * ptys can only exist on pts file systems. To have something
2068 * to bind mount things on we create a empty regular file. */
2069
2070 to = prefix_roota(dest, "/dev/console");
2071 r = touch(to);
2072 if (r < 0)
2073 return log_error_errno(r, "touch() for /dev/console failed: %m");
2074
2075 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
2076 }
2077
2078 static int setup_keyring(void) {
2079 key_serial_t keyring;
2080
2081 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2082 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2083 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2084 * these system calls let's make sure we don't leak anything into the container. */
2085
2086 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2087 if (keyring == -1) {
2088 if (errno == ENOSYS)
2089 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2090 else if (IN_SET(errno, EACCES, EPERM))
2091 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2092 else
2093 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2094 }
2095
2096 return 0;
2097 }
2098
2099 static int setup_kmsg(int kmsg_socket) {
2100 _cleanup_(unlink_and_freep) char *from = NULL;
2101 _cleanup_free_ char *fifo = NULL;
2102 _cleanup_close_ int fd = -1;
2103 _cleanup_umask_ mode_t u;
2104 const char *to;
2105 int r;
2106
2107 assert(kmsg_socket >= 0);
2108
2109 u = umask(0000);
2110
2111 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
2112 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2113 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2114 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2115
2116 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
2117 if (r < 0)
2118 return log_error_errno(r, "Failed to generate kmsg path: %m");
2119
2120 if (mkfifo(fifo, 0600) < 0)
2121 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2122
2123 from = TAKE_PTR(fifo);
2124 to = "/proc/kmsg";
2125
2126 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2127 if (r < 0)
2128 return r;
2129
2130 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2131 if (fd < 0)
2132 return log_error_errno(errno, "Failed to open fifo: %m");
2133
2134 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2135 r = send_one_fd(kmsg_socket, fd, 0);
2136 if (r < 0)
2137 return log_error_errno(r, "Failed to send FIFO fd: %m");
2138
2139 return 0;
2140 }
2141
2142 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2143 union in_addr_union *exposed = userdata;
2144
2145 assert(rtnl);
2146 assert(m);
2147 assert(exposed);
2148
2149 expose_port_execute(rtnl, arg_expose_ports, exposed);
2150 return 0;
2151 }
2152
2153 static int setup_hostname(void) {
2154 int r;
2155
2156 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2157 return 0;
2158
2159 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to set hostname: %m");
2162
2163 return 0;
2164 }
2165
2166 static int setup_journal(const char *directory) {
2167 _cleanup_free_ char *d = NULL;
2168 const char *dirname, *p, *q;
2169 sd_id128_t this_id;
2170 char id[33];
2171 bool try;
2172 int r;
2173
2174 /* Don't link journals in ephemeral mode */
2175 if (arg_ephemeral)
2176 return 0;
2177
2178 if (arg_link_journal == LINK_NO)
2179 return 0;
2180
2181 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2182
2183 r = sd_id128_get_machine(&this_id);
2184 if (r < 0)
2185 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2186
2187 if (sd_id128_equal(arg_uuid, this_id)) {
2188 log_full(try ? LOG_WARNING : LOG_ERR,
2189 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2190 if (try)
2191 return 0;
2192 return -EEXIST;
2193 }
2194
2195 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2196 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2197 if (r < 0) {
2198 bool ignore = r == -EROFS && try;
2199 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2200 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2201 return ignore ? 0 : r;
2202 }
2203 }
2204
2205 (void) sd_id128_to_string(arg_uuid, id);
2206
2207 p = strjoina("/var/log/journal/", id);
2208 q = prefix_roota(directory, p);
2209
2210 if (path_is_mount_point(p, NULL, 0) > 0) {
2211 if (try)
2212 return 0;
2213
2214 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2215 "%s: already a mount point, refusing to use for journal", p);
2216 }
2217
2218 if (path_is_mount_point(q, NULL, 0) > 0) {
2219 if (try)
2220 return 0;
2221
2222 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2223 "%s: already a mount point, refusing to use for journal", q);
2224 }
2225
2226 r = readlink_and_make_absolute(p, &d);
2227 if (r >= 0) {
2228 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2229 path_equal(d, q)) {
2230
2231 r = userns_mkdir(directory, p, 0755, 0, 0);
2232 if (r < 0)
2233 log_warning_errno(r, "Failed to create directory %s: %m", q);
2234 return 0;
2235 }
2236
2237 if (unlink(p) < 0)
2238 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2239 } else if (r == -EINVAL) {
2240
2241 if (arg_link_journal == LINK_GUEST &&
2242 rmdir(p) < 0) {
2243
2244 if (errno == ENOTDIR) {
2245 log_error("%s already exists and is neither a symlink nor a directory", p);
2246 return r;
2247 } else
2248 return log_error_errno(errno, "Failed to remove %s: %m", p);
2249 }
2250 } else if (r != -ENOENT)
2251 return log_error_errno(r, "readlink(%s) failed: %m", p);
2252
2253 if (arg_link_journal == LINK_GUEST) {
2254
2255 if (symlink(q, p) < 0) {
2256 if (try) {
2257 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2258 return 0;
2259 } else
2260 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2261 }
2262
2263 r = userns_mkdir(directory, p, 0755, 0, 0);
2264 if (r < 0)
2265 log_warning_errno(r, "Failed to create directory %s: %m", q);
2266 return 0;
2267 }
2268
2269 if (arg_link_journal == LINK_HOST) {
2270 /* don't create parents here — if the host doesn't have
2271 * permanent journal set up, don't force it here */
2272
2273 r = mkdir_errno_wrapper(p, 0755);
2274 if (r < 0 && r != -EEXIST) {
2275 if (try) {
2276 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2277 return 0;
2278 } else
2279 return log_error_errno(r, "Failed to create %s: %m", p);
2280 }
2281
2282 } else if (access(p, F_OK) < 0)
2283 return 0;
2284
2285 if (dir_is_empty(q) == 0)
2286 log_warning("%s is not empty, proceeding anyway.", q);
2287
2288 r = userns_mkdir(directory, p, 0755, 0, 0);
2289 if (r < 0)
2290 return log_error_errno(r, "Failed to create %s: %m", q);
2291
2292 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2293 if (r < 0)
2294 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2295
2296 return 0;
2297 }
2298
2299 static int drop_capabilities(uid_t uid) {
2300 CapabilityQuintet q;
2301
2302 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2303 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2304 * arg_caps_retain. */
2305
2306 if (capability_quintet_is_set(&arg_full_capabilities)) {
2307 q = arg_full_capabilities;
2308
2309 if (q.bounding == (uint64_t) -1)
2310 q.bounding = uid == 0 ? arg_caps_retain : 0;
2311
2312 if (q.effective == (uint64_t) -1)
2313 q.effective = uid == 0 ? q.bounding : 0;
2314
2315 if (q.inheritable == (uint64_t) -1)
2316 q.inheritable = uid == 0 ? q.bounding : 0;
2317
2318 if (q.permitted == (uint64_t) -1)
2319 q.permitted = uid == 0 ? q.bounding : 0;
2320
2321 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2322 q.ambient = 0;
2323 } else
2324 q = (CapabilityQuintet) {
2325 .bounding = arg_caps_retain,
2326 .effective = uid == 0 ? arg_caps_retain : 0,
2327 .inheritable = uid == 0 ? arg_caps_retain : 0,
2328 .permitted = uid == 0 ? arg_caps_retain : 0,
2329 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2330 };
2331
2332 return capability_quintet_enforce(&q);
2333 }
2334
2335 static int reset_audit_loginuid(void) {
2336 _cleanup_free_ char *p = NULL;
2337 int r;
2338
2339 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2340 return 0;
2341
2342 r = read_one_line_file("/proc/self/loginuid", &p);
2343 if (r == -ENOENT)
2344 return 0;
2345 if (r < 0)
2346 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2347
2348 /* Already reset? */
2349 if (streq(p, "4294967295"))
2350 return 0;
2351
2352 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2353 if (r < 0) {
2354 log_error_errno(r,
2355 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2356 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2357 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2358 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2359 "using systemd-nspawn. Sleeping for 5s... (%m)");
2360
2361 sleep(5);
2362 }
2363
2364 return 0;
2365 }
2366
2367 static int setup_propagate(const char *root) {
2368 const char *p, *q;
2369 int r;
2370
2371 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2372 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2373 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2374 (void) mkdir_p(p, 0600);
2375
2376 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2377 if (r < 0)
2378 return log_error_errno(r, "Failed to create /run/systemd: %m");
2379
2380 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2381 if (r < 0)
2382 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2383
2384 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2387
2388 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
2389 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2390 if (r < 0)
2391 return r;
2392
2393 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2394 if (r < 0)
2395 return r;
2396
2397 /* machined will MS_MOVE into that directory, and that's only
2398 * supported for non-shared mounts. */
2399 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2400 }
2401
2402 static int setup_machine_id(const char *directory) {
2403 const char *etc_machine_id;
2404 sd_id128_t id;
2405 int r;
2406
2407 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2408 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2409 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2410 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2411 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2412 * container behaves nicely). */
2413
2414 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2415
2416 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2417 if (r < 0) {
2418 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2419 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2420
2421 if (sd_id128_is_null(arg_uuid)) {
2422 r = sd_id128_randomize(&arg_uuid);
2423 if (r < 0)
2424 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2425 }
2426 } else {
2427 if (sd_id128_is_null(id))
2428 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2429 "Machine ID in container image is zero, refusing.");
2430
2431 arg_uuid = id;
2432 }
2433
2434 return 0;
2435 }
2436
2437 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2438 int r;
2439
2440 assert(directory);
2441
2442 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2443 return 0;
2444
2445 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2446 if (r == -EOPNOTSUPP)
2447 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2448 if (r == -EBADE)
2449 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2450 if (r < 0)
2451 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2452 if (r == 0)
2453 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2454 else
2455 log_debug("Patched directory tree to match UID/GID range.");
2456
2457 return r;
2458 }
2459
2460 /*
2461 * Return values:
2462 * < 0 : wait_for_terminate() failed to get the state of the
2463 * container, the container was terminated by a signal, or
2464 * failed for an unknown reason. No change is made to the
2465 * container argument.
2466 * > 0 : The program executed in the container terminated with an
2467 * error. The exit code of the program executed in the
2468 * container is returned. The container argument has been set
2469 * to CONTAINER_TERMINATED.
2470 * 0 : The container is being rebooted, has been shut down or exited
2471 * successfully. The container argument has been set to either
2472 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2473 *
2474 * That is, success is indicated by a return value of zero, and an
2475 * error is indicated by a non-zero value.
2476 */
2477 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2478 siginfo_t status;
2479 int r;
2480
2481 r = wait_for_terminate(pid, &status);
2482 if (r < 0)
2483 return log_warning_errno(r, "Failed to wait for container: %m");
2484
2485 switch (status.si_code) {
2486
2487 case CLD_EXITED:
2488 if (status.si_status == 0)
2489 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2490 else
2491 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2492
2493 *container = CONTAINER_TERMINATED;
2494 return status.si_status;
2495
2496 case CLD_KILLED:
2497 if (status.si_status == SIGINT) {
2498 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2499 *container = CONTAINER_TERMINATED;
2500 return 0;
2501
2502 } else if (status.si_status == SIGHUP) {
2503 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2504 *container = CONTAINER_REBOOTED;
2505 return 0;
2506 }
2507
2508 _fallthrough_;
2509 case CLD_DUMPED:
2510 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2511 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2512
2513 default:
2514 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2515 "Container %s failed due to unknown reason.", arg_machine);
2516 }
2517 }
2518
2519 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2520 pid_t pid;
2521
2522 pid = PTR_TO_PID(userdata);
2523 if (pid > 0) {
2524 if (kill(pid, arg_kill_signal) >= 0) {
2525 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2526 sd_event_source_set_userdata(s, NULL);
2527 return 0;
2528 }
2529 }
2530
2531 sd_event_exit(sd_event_source_get_event(s), 0);
2532 return 0;
2533 }
2534
2535 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2536 pid_t pid;
2537
2538 assert(s);
2539 assert(ssi);
2540
2541 pid = PTR_TO_PID(userdata);
2542
2543 for (;;) {
2544 siginfo_t si = {};
2545
2546 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2547 return log_error_errno(errno, "Failed to waitid(): %m");
2548 if (si.si_pid == 0) /* No pending children. */
2549 break;
2550 if (si.si_pid == pid) {
2551 /* The main process we care for has exited. Return from
2552 * signal handler but leave the zombie. */
2553 sd_event_exit(sd_event_source_get_event(s), 0);
2554 break;
2555 }
2556
2557 /* Reap all other children. */
2558 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2559 }
2560
2561 return 0;
2562 }
2563
2564 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2565 pid_t pid;
2566
2567 assert(m);
2568
2569 pid = PTR_TO_PID(userdata);
2570
2571 if (arg_kill_signal > 0) {
2572 log_info("Container termination requested. Attempting to halt container.");
2573 (void) kill(pid, arg_kill_signal);
2574 } else {
2575 log_info("Container termination requested. Exiting.");
2576 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2577 }
2578
2579 return 0;
2580 }
2581
2582 static int determine_names(void) {
2583 int r;
2584
2585 if (arg_template && !arg_directory && arg_machine) {
2586
2587 /* If --template= was specified then we should not
2588 * search for a machine, but instead create a new one
2589 * in /var/lib/machine. */
2590
2591 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2592 if (!arg_directory)
2593 return log_oom();
2594 }
2595
2596 if (!arg_image && !arg_directory) {
2597 if (arg_machine) {
2598 _cleanup_(image_unrefp) Image *i = NULL;
2599
2600 r = image_find(IMAGE_MACHINE, arg_machine, &i);
2601 if (r == -ENOENT)
2602 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2603 if (r < 0)
2604 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2605
2606 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2607 r = free_and_strdup(&arg_image, i->path);
2608 else
2609 r = free_and_strdup(&arg_directory, i->path);
2610 if (r < 0)
2611 return log_oom();
2612
2613 if (!arg_ephemeral)
2614 arg_read_only = arg_read_only || i->read_only;
2615 } else {
2616 r = safe_getcwd(&arg_directory);
2617 if (r < 0)
2618 return log_error_errno(r, "Failed to determine current directory: %m");
2619 }
2620
2621 if (!arg_directory && !arg_image) {
2622 log_error("Failed to determine path, please use -D or -i.");
2623 return -EINVAL;
2624 }
2625 }
2626
2627 if (!arg_machine) {
2628 if (arg_directory && path_equal(arg_directory, "/"))
2629 arg_machine = gethostname_malloc();
2630 else {
2631 if (arg_image) {
2632 char *e;
2633
2634 arg_machine = strdup(basename(arg_image));
2635
2636 /* Truncate suffix if there is one */
2637 e = endswith(arg_machine, ".raw");
2638 if (e)
2639 *e = 0;
2640 } else
2641 arg_machine = strdup(basename(arg_directory));
2642 }
2643 if (!arg_machine)
2644 return log_oom();
2645
2646 hostname_cleanup(arg_machine);
2647 if (!machine_name_is_valid(arg_machine)) {
2648 log_error("Failed to determine machine name automatically, please use -M.");
2649 return -EINVAL;
2650 }
2651
2652 if (arg_ephemeral) {
2653 char *b;
2654
2655 /* Add a random suffix when this is an
2656 * ephemeral machine, so that we can run many
2657 * instances at once without manually having
2658 * to specify -M each time. */
2659
2660 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2661 return log_oom();
2662
2663 free(arg_machine);
2664 arg_machine = b;
2665 }
2666 }
2667
2668 return 0;
2669 }
2670
/* Resolves the path stored in *p with chase_symlinks() and replaces it by the result. A NULL path is
 * left alone and 0 is returned. Otherwise returns the (non-negative) result of chase_symlinks() on
 * success, a negative errno-style value on failure. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing to resolve */
        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free_and_replace(*p, resolved);

        /* Pass on the result as is: it might be an fd, in case CHASE_OPEN is ever used in flags */
        return r;
}
2687
2688 static int determine_uid_shift(const char *directory) {
2689 int r;
2690
2691 if (arg_userns_mode == USER_NAMESPACE_NO) {
2692 arg_uid_shift = 0;
2693 return 0;
2694 }
2695
2696 if (arg_uid_shift == UID_INVALID) {
2697 struct stat st;
2698
2699 r = stat(directory, &st);
2700 if (r < 0)
2701 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2702
2703 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2704
2705 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2706 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2707 "UID and GID base of %s don't match.", directory);
2708
2709 arg_uid_range = UINT32_C(0x10000);
2710 }
2711
2712 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2713 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2714 "UID base too high for UID range.");
2715
2716 return 0;
2717 }
2718
2719 static unsigned long effective_clone_ns_flags(void) {
2720 unsigned long flags = arg_clone_ns_flags;
2721
2722 if (arg_private_network)
2723 flags |= CLONE_NEWNET;
2724 if (arg_use_cgns)
2725 flags |= CLONE_NEWCGROUP;
2726 if (arg_userns_mode != USER_NAMESPACE_NO)
2727 flags |= CLONE_NEWUSER;
2728
2729 return flags;
2730 }
2731
2732 static int patch_sysctl(void) {
2733
2734 /* This table is inspired by runc's sysctl() function */
2735 static const struct {
2736 const char *key;
2737 bool prefix;
2738 unsigned long clone_flags;
2739 } safe_sysctl[] = {
2740 { "kernel.hostname", false, CLONE_NEWUTS },
2741 { "kernel.domainname", false, CLONE_NEWUTS },
2742 { "kernel.msgmax", false, CLONE_NEWIPC },
2743 { "kernel.msgmnb", false, CLONE_NEWIPC },
2744 { "kernel.msgmni", false, CLONE_NEWIPC },
2745 { "kernel.sem", false, CLONE_NEWIPC },
2746 { "kernel.shmall", false, CLONE_NEWIPC },
2747 { "kernel.shmmax", false, CLONE_NEWIPC },
2748 { "kernel.shmmni", false, CLONE_NEWIPC },
2749 { "fs.mqueue.", true, CLONE_NEWIPC },
2750 { "net.", true, CLONE_NEWNET },
2751 };
2752
2753 unsigned long flags;
2754 char **k, **v;
2755 int r;
2756
2757 flags = effective_clone_ns_flags();
2758
2759 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2760 bool good = false;
2761 size_t i;
2762
2763 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2764
2765 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2766 continue;
2767
2768 if (safe_sysctl[i].prefix)
2769 good = startswith(*k, safe_sysctl[i].key);
2770 else
2771 good = streq(*k, safe_sysctl[i].key);
2772
2773 if (good)
2774 break;
2775 }
2776
2777 if (!good) {
2778 log_error("Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2779 return -EPERM;
2780 }
2781
2782 r = sysctl_write(*k, *v);
2783 if (r < 0)
2784 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2785 }
2786
2787 return 0;
2788 }
2789
2790 static int inner_child(
2791 Barrier *barrier,
2792 const char *directory,
2793 bool secondary,
2794 int kmsg_socket,
2795 int rtnl_socket,
2796 FDSet *fds) {
2797
2798 _cleanup_free_ char *home = NULL;
2799 char as_uuid[37];
2800 size_t n_env = 1;
2801 const char *envp[] = {
2802 "PATH=" DEFAULT_PATH_COMPAT,
2803 NULL, /* container */
2804 NULL, /* TERM */
2805 NULL, /* HOME */
2806 NULL, /* USER */
2807 NULL, /* LOGNAME */
2808 NULL, /* container_uuid */
2809 NULL, /* LISTEN_FDS */
2810 NULL, /* LISTEN_PID */
2811 NULL, /* NOTIFY_SOCKET */
2812 NULL
2813 };
2814 const char *exec_target;
2815 _cleanup_strv_free_ char **env_use = NULL;
2816 int r, which_failed;
2817
2818 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2819 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2820 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2821 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2822 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2823 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2824 * namespace.
2825 *
2826 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2827 * unshare(). See below. */
2828
2829 assert(barrier);
2830 assert(directory);
2831 assert(kmsg_socket >= 0);
2832
2833 log_debug("Inner child is initializing.");
2834
2835 if (arg_userns_mode != USER_NAMESPACE_NO) {
2836 /* Tell the parent, that it now can write the UID map. */
2837 (void) barrier_place(barrier); /* #1 */
2838
2839 /* Wait until the parent wrote the UID map */
2840 if (!barrier_place_and_sync(barrier)) /* #2 */
2841 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2842 "Parent died too early");
2843 }
2844
2845 r = reset_uid_gid();
2846 if (r < 0)
2847 return log_error_errno(r, "Couldn't become new root: %m");
2848
2849 r = mount_all(NULL,
2850 arg_mount_settings | MOUNT_IN_USERNS,
2851 arg_uid_shift,
2852 arg_selinux_apifs_context);
2853 if (r < 0)
2854 return r;
2855
2856 if (!arg_network_namespace_path && arg_private_network) {
2857 r = unshare(CLONE_NEWNET);
2858 if (r < 0)
2859 return log_error_errno(errno, "Failed to unshare network namespace: %m");
2860
2861 /* Tell the parent that it can setup network interfaces. */
2862 (void) barrier_place(barrier); /* #3 */
2863 }
2864
2865 r = mount_sysfs(NULL, arg_mount_settings);
2866 if (r < 0)
2867 return r;
2868
2869 /* Wait until we are cgroup-ified, so that we
2870 * can mount the right cgroup path writable */
2871 if (!barrier_place_and_sync(barrier)) /* #4 */
2872 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2873 "Parent died too early");
2874
2875 if (arg_use_cgns) {
2876 r = unshare(CLONE_NEWCGROUP);
2877 if (r < 0)
2878 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2879 r = mount_cgroups(
2880 "",
2881 arg_unified_cgroup_hierarchy,
2882 arg_userns_mode != USER_NAMESPACE_NO,
2883 arg_uid_shift,
2884 arg_uid_range,
2885 arg_selinux_apifs_context,
2886 true);
2887 if (r < 0)
2888 return r;
2889 } else {
2890 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2891 if (r < 0)
2892 return r;
2893 }
2894
2895 r = setup_boot_id();
2896 if (r < 0)
2897 return r;
2898
2899 r = setup_kmsg(kmsg_socket);
2900 if (r < 0)
2901 return r;
2902 kmsg_socket = safe_close(kmsg_socket);
2903
2904 r = mount_custom(
2905 "/",
2906 arg_custom_mounts,
2907 arg_n_custom_mounts,
2908 false,
2909 0,
2910 0,
2911 arg_selinux_apifs_context,
2912 true);
2913 if (r < 0)
2914 return r;
2915
2916 if (setsid() < 0)
2917 return log_error_errno(errno, "setsid() failed: %m");
2918
2919 if (arg_private_network)
2920 loopback_setup();
2921
2922 if (arg_expose_ports) {
2923 r = expose_port_send_rtnl(rtnl_socket);
2924 if (r < 0)
2925 return r;
2926 rtnl_socket = safe_close(rtnl_socket);
2927 }
2928
2929 r = patch_sysctl();
2930 if (r < 0)
2931 return r;
2932
2933 if (arg_oom_score_adjust_set) {
2934 r = set_oom_score_adjust(arg_oom_score_adjust);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to adjust OOM score: %m");
2937 }
2938
2939 if (arg_cpuset)
2940 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2941 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2942
2943 (void) setup_hostname();
2944
2945 if (arg_personality != PERSONALITY_INVALID) {
2946 r = safe_personality(arg_personality);
2947 if (r < 0)
2948 return log_error_errno(r, "personality() failed: %m");
2949 } else if (secondary) {
2950 r = safe_personality(PER_LINUX32);
2951 if (r < 0)
2952 return log_error_errno(r, "personality() failed: %m");
2953 }
2954
2955 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2956 if (r < 0)
2957 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2958
2959 #if HAVE_SECCOMP
2960 if (arg_seccomp) {
2961
2962 if (is_seccomp_available()) {
2963
2964 r = seccomp_load(arg_seccomp);
2965 if (IN_SET(r, -EPERM, -EACCES))
2966 return log_error_errno(r, "Failed to install seccomp filter: %m");
2967 if (r < 0)
2968 log_debug_errno(r, "Failed to install seccomp filter: %m");
2969 }
2970 } else
2971 #endif
2972 {
2973 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
2974 if (r < 0)
2975 return r;
2976 }
2977
2978 #if HAVE_SELINUX
2979 if (arg_selinux_context)
2980 if (setexeccon(arg_selinux_context) < 0)
2981 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2982 #endif
2983
2984 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
2985 * if we need to later on. */
2986 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
2987 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
2988
2989 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
2990 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
2991 else
2992 r = change_uid_gid(arg_user, &home);
2993 if (r < 0)
2994 return r;
2995
2996 r = drop_capabilities(getuid());
2997 if (r < 0)
2998 return log_error_errno(r, "Dropping capabilities failed: %m");
2999
3000 if (arg_no_new_privileges)
3001 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3002 return log_error_errno(errno, "Failed to disable new privileges: %m");
3003
3004 /* LXC sets container=lxc, so follow the scheme here */
3005 envp[n_env++] = strjoina("container=", arg_container_service_name);
3006
3007 envp[n_env] = strv_find_prefix(environ, "TERM=");
3008 if (envp[n_env])
3009 n_env++;
3010
3011 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3012 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3013 return log_oom();
3014
3015 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3016 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3017 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3018 return log_oom();
3019
3020 assert(!sd_id128_is_null(arg_uuid));
3021
3022 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3023 return log_oom();
3024
3025 if (fdset_size(fds) > 0) {
3026 r = fdset_cloexec(fds, false);
3027 if (r < 0)
3028 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3029
3030 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3031 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3032 return log_oom();
3033 }
3034 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3035 return log_oom();
3036
3037 env_use = strv_env_merge(2, envp, arg_setenv);
3038 if (!env_use)
3039 return log_oom();
3040
3041 /* Let the parent know that we are ready and
3042 * wait until the parent is ready with the
3043 * setup, too... */
3044 if (!barrier_place_and_sync(barrier)) /* #5 */
3045 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3046 "Parent died too early");
3047
3048 if (arg_chdir)
3049 if (chdir(arg_chdir) < 0)
3050 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3051
3052 if (arg_start_mode == START_PID2) {
3053 r = stub_pid1(arg_uuid);
3054 if (r < 0)
3055 return r;
3056 }
3057
3058 log_debug("Inner child completed, invoking payload.");
3059
3060 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3061 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3062 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3063 log_close();
3064 log_set_open_when_needed(true);
3065
3066 (void) fdset_close_others(fds);
3067
3068 if (arg_start_mode == START_BOOT) {
3069 char **a;
3070 size_t m;
3071
3072 /* Automatically search for the init system */
3073
3074 m = strv_length(arg_parameters);
3075 a = newa(char*, m + 2);
3076 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3077 a[1 + m] = NULL;
3078
3079 a[0] = (char*) "/usr/lib/systemd/systemd";
3080 execve(a[0], a, env_use);
3081
3082 a[0] = (char*) "/lib/systemd/systemd";
3083 execve(a[0], a, env_use);
3084
3085 a[0] = (char*) "/sbin/init";
3086 execve(a[0], a, env_use);
3087
3088 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3089 } else if (!strv_isempty(arg_parameters)) {
3090 const char *dollar_path;
3091
3092 exec_target = arg_parameters[0];
3093
3094 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3095 * binary. */
3096 dollar_path = strv_env_get(env_use, "PATH");
3097 if (dollar_path) {
3098 if (putenv((char*) dollar_path) != 0)
3099 return log_error_errno(errno, "Failed to update $PATH: %m");
3100 }
3101
3102 execvpe(arg_parameters[0], arg_parameters, env_use);
3103 } else {
3104 if (!arg_chdir)
3105 /* If we cannot change the directory, we'll end up in /, that is expected. */
3106 (void) chdir(home ?: "/root");
3107
3108 execle("/bin/bash", "-bash", NULL, env_use);
3109 execle("/bin/sh", "-sh", NULL, env_use);
3110
3111 exec_target = "/bin/bash, /bin/sh";
3112 }
3113
3114 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3115 }
3116
3117 static int setup_sd_notify_child(void) {
3118 _cleanup_close_ int fd = -1;
3119 union sockaddr_union sa = {
3120 .un.sun_family = AF_UNIX,
3121 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3122 };
3123 int r;
3124
3125 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3126 if (fd < 0)
3127 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3128
3129 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3130 (void) sockaddr_un_unlink(&sa.un);
3131
3132 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3133 if (r < 0)
3134 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3135
3136 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3137 if (r < 0)
3138 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3139
3140 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3141 if (r < 0)
3142 return log_error_errno(r, "SO_PASSCRED failed: %m");
3143
3144 return TAKE_FD(fd);
3145 }
3146
3147 static int outer_child(
3148 Barrier *barrier,
3149 const char *directory,
3150 const char *console,
3151 DissectedImage *dissected_image,
3152 bool secondary,
3153 int pid_socket,
3154 int uuid_socket,
3155 int notify_socket,
3156 int kmsg_socket,
3157 int rtnl_socket,
3158 int uid_shift_socket,
3159 int unified_cgroup_hierarchy_socket,
3160 FDSet *fds,
3161 int netns_fd) {
3162
3163 _cleanup_close_ int fd = -1;
3164 pid_t pid;
3165 ssize_t l;
3166 int r;
3167
3168 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3169 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3170 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3171 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3172
3173 assert(barrier);
3174 assert(directory);
3175 assert(pid_socket >= 0);
3176 assert(uuid_socket >= 0);
3177 assert(notify_socket >= 0);
3178 assert(kmsg_socket >= 0);
3179
3180 log_debug("Outer child is initializing.");
3181
3182 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3183 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3184
3185 if (arg_console_mode != CONSOLE_PIPE) {
3186 int terminal;
3187
3188 assert(console);
3189
3190 terminal = open_terminal(console, O_RDWR);
3191 if (terminal < 0)
3192 return log_error_errno(terminal, "Failed to open console: %m");
3193
3194 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
3195 r = log_dup_console();
3196 if (r < 0)
3197 return log_error_errno(r, "Failed to duplicate stderr: %m");
3198
3199 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
3200 if (r < 0)
3201 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
3202 }
3203
3204 r = reset_audit_loginuid();
3205 if (r < 0)
3206 return r;
3207
3208 /* Mark everything as slave, so that we still
3209 * receive mounts from the real root, but don't
3210 * propagate mounts to the real root. */
3211 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3212 if (r < 0)
3213 return r;
3214
3215 if (dissected_image) {
3216 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3217 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3218 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3219 * makes sure ESP partitions and userns are compatible. */
3220
3221 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3222 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3223 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3224 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3225 if (r < 0)
3226 return r;
3227 }
3228
3229 r = determine_uid_shift(directory);
3230 if (r < 0)
3231 return r;
3232
3233 if (arg_userns_mode != USER_NAMESPACE_NO) {
3234 /* Let the parent know which UID shift we read from the image */
3235 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3236 if (l < 0)
3237 return log_error_errno(errno, "Failed to send UID shift: %m");
3238 if (l != sizeof(arg_uid_shift))
3239 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3240 "Short write while sending UID shift.");
3241
3242 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3243 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3244 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3245 * not it will pick a different one, and send it back to us. */
3246
3247 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3248 if (l < 0)
3249 return log_error_errno(errno, "Failed to recv UID shift: %m");
3250 if (l != sizeof(arg_uid_shift))
3251 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3252 "Short read while receiving UID shift.");
3253 }
3254
3255 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3256 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3257 }
3258
3259 if (!dissected_image) {
3260 /* Turn directory into bind mount */
3261 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3262 if (r < 0)
3263 return r;
3264 }
3265
3266 r = setup_pivot_root(
3267 directory,
3268 arg_pivot_root_new,
3269 arg_pivot_root_old);
3270 if (r < 0)
3271 return r;
3272
3273 r = setup_volatile_mode(
3274 directory,
3275 arg_volatile_mode,
3276 arg_userns_mode != USER_NAMESPACE_NO,
3277 arg_uid_shift,
3278 arg_uid_range,
3279 arg_selinux_context);
3280 if (r < 0)
3281 return r;
3282
3283 if (dissected_image) {
3284 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3285 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3286 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3287 if (r < 0)
3288 return r;
3289 }
3290
3291 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3292 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3293
3294 r = detect_unified_cgroup_hierarchy_from_image(directory);
3295 if (r < 0)
3296 return r;
3297
3298 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3299 if (l < 0)
3300 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3301 if (l != sizeof(arg_unified_cgroup_hierarchy))
3302 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3303 "Short write while sending cgroup mode.");
3304
3305 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3306 }
3307
3308 /* Mark everything as shared so our mounts get propagated down. This is
3309 * required to make new bind mounts available in systemd services
3310 * inside the containter that create a new mount namespace.
3311 * See https://github.com/systemd/systemd/issues/3860
3312 * Further submounts (such as /dev) done after this will inherit the
3313 * shared propagation mode. */
3314 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3315 if (r < 0)
3316 return r;
3317
3318 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3319 if (r < 0)
3320 return r;
3321
3322 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3323 if (r < 0)
3324 return r;
3325
3326 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
3327 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3328 if (r < 0)
3329 return log_error_errno(r, "Failed to make tree read-only: %m");
3330 }
3331
3332 r = mount_all(directory,
3333 arg_mount_settings,
3334 arg_uid_shift,
3335 arg_selinux_apifs_context);
3336 if (r < 0)
3337 return r;
3338
3339 r = copy_devnodes(directory);
3340 if (r < 0)
3341 return r;
3342
3343 r = make_extra_nodes(directory);
3344 if (r < 0)
3345 return r;
3346
3347 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3348 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
3349
3350 r = setup_pts(directory);
3351 if (r < 0)
3352 return r;
3353
3354 r = setup_propagate(directory);
3355 if (r < 0)
3356 return r;
3357
3358 r = setup_dev_console(directory, console);
3359 if (r < 0)
3360 return r;
3361
3362 r = setup_keyring();
3363 if (r < 0)
3364 return r;
3365
3366 r = setup_timezone(directory);
3367 if (r < 0)
3368 return r;
3369
3370 r = setup_resolv_conf(directory);
3371 if (r < 0)
3372 return r;
3373
3374 r = setup_machine_id(directory);
3375 if (r < 0)
3376 return r;
3377
3378 r = setup_journal(directory);
3379 if (r < 0)
3380 return r;
3381
3382 r = mount_custom(
3383 directory,
3384 arg_custom_mounts,
3385 arg_n_custom_mounts,
3386 arg_userns_mode != USER_NAMESPACE_NO,
3387 arg_uid_shift,
3388 arg_uid_range,
3389 arg_selinux_apifs_context,
3390 false);
3391 if (r < 0)
3392 return r;
3393
3394 if (!arg_use_cgns) {
3395 r = mount_cgroups(
3396 directory,
3397 arg_unified_cgroup_hierarchy,
3398 arg_userns_mode != USER_NAMESPACE_NO,
3399 arg_uid_shift,
3400 arg_uid_range,
3401 arg_selinux_apifs_context,
3402 false);
3403 if (r < 0)
3404 return r;
3405 }
3406
3407 r = mount_move_root(directory);
3408 if (r < 0)
3409 return log_error_errno(r, "Failed to move root directory: %m");
3410
3411 fd = setup_sd_notify_child();
3412 if (fd < 0)
3413 return fd;
3414
3415 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3416 arg_clone_ns_flags |
3417 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3418 if (pid < 0)
3419 return log_error_errno(errno, "Failed to fork inner child: %m");
3420 if (pid == 0) {
3421 pid_socket = safe_close(pid_socket);
3422 uuid_socket = safe_close(uuid_socket);
3423 notify_socket = safe_close(notify_socket);
3424 uid_shift_socket = safe_close(uid_shift_socket);
3425
3426 /* The inner child has all namespaces that are
3427 * requested, so that we all are owned by the user if
3428 * user namespaces are turned on. */
3429
3430 if (arg_network_namespace_path) {
3431 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3432 if (r < 0)
3433 return log_error_errno(r, "Failed to join network namespace: %m");
3434 }
3435
3436 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3437 if (r < 0)
3438 _exit(EXIT_FAILURE);
3439
3440 _exit(EXIT_SUCCESS);
3441 }
3442
3443 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3444 if (l < 0)
3445 return log_error_errno(errno, "Failed to send PID: %m");
3446 if (l != sizeof(pid))
3447 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3448 "Short write while sending PID.");
3449
3450 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3451 if (l < 0)
3452 return log_error_errno(errno, "Failed to send machine ID: %m");
3453 if (l != sizeof(arg_uuid))
3454 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3455 "Short write while sending machine ID.");
3456
3457 l = send_one_fd(notify_socket, fd, 0);
3458 if (l < 0)
3459 return log_error_errno(errno, "Failed to send notify fd: %m");
3460
3461 pid_socket = safe_close(pid_socket);
3462 uuid_socket = safe_close(uuid_socket);
3463 notify_socket = safe_close(notify_socket);
3464 kmsg_socket = safe_close(kmsg_socket);
3465 rtnl_socket = safe_close(rtnl_socket);
3466 netns_fd = safe_close(netns_fd);
3467
3468 return 0;
3469 }
3470
3471 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3472 bool tried_hashed = false;
3473 unsigned n_tries = 100;
3474 uid_t candidate;
3475 int r;
3476
3477 assert(shift);
3478 assert(ret_lock_file);
3479 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3480 assert(arg_uid_range == 0x10000U);
3481
3482 candidate = *shift;
3483
3484 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3485
3486 for (;;) {
3487 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3488 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3489
3490 if (--n_tries <= 0)
3491 return -EBUSY;
3492
3493 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3494 goto next;
3495 if ((candidate & UINT32_C(0xFFFF)) != 0)
3496 goto next;
3497
3498 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3499 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3500 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3501 goto next;
3502 if (r < 0)
3503 return r;
3504
3505 /* Make some superficial checks whether the range is currently known in the user database */
3506 if (getpwuid(candidate))
3507 goto next;
3508 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3509 goto next;
3510 if (getgrgid(candidate))
3511 goto next;
3512 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3513 goto next;
3514
3515 *ret_lock_file = lf;
3516 lf = (struct LockFile) LOCK_FILE_INIT;
3517 *shift = candidate;
3518 return 0;
3519
3520 next:
3521 if (arg_machine && !tried_hashed) {
3522 /* Try to hash the base from the container name */
3523
3524 static const uint8_t hash_key[] = {
3525 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3526 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3527 };
3528
3529 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3530
3531 tried_hashed = true;
3532 } else
3533 random_bytes(&candidate, sizeof(candidate));
3534
3535 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3536 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3537 }
3538 }
3539
3540 static int setup_uid_map(pid_t pid) {
3541 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3542 int r;
3543
3544 assert(pid > 1);
3545
3546 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3547 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3548 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3549 if (r < 0)
3550 return log_error_errno(r, "Failed to write UID map: %m");
3551
3552 /* We always assign the same UID and GID ranges */
3553 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3554 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3555 if (r < 0)
3556 return log_error_errno(r, "Failed to write GID map: %m");
3557
3558 return 0;
3559 }
3560
3561 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3562 char buf[NOTIFY_BUFFER_MAX+1];
3563 char *p = NULL;
3564 struct iovec iovec = {
3565 .iov_base = buf,
3566 .iov_len = sizeof(buf)-1,
3567 };
3568 union {
3569 struct cmsghdr cmsghdr;
3570 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3571 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3572 } control = {};
3573 struct msghdr msghdr = {
3574 .msg_iov = &iovec,
3575 .msg_iovlen = 1,
3576 .msg_control = &control,
3577 .msg_controllen = sizeof(control),
3578 };
3579 struct cmsghdr *cmsg;
3580 struct ucred *ucred = NULL;
3581 ssize_t n;
3582 pid_t inner_child_pid;
3583 _cleanup_strv_free_ char **tags = NULL;
3584
3585 assert(userdata);
3586
3587 inner_child_pid = PTR_TO_PID(userdata);
3588
3589 if (revents != EPOLLIN) {
3590 log_warning("Got unexpected poll event for notify fd.");
3591 return 0;
3592 }
3593
3594 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3595 if (n < 0) {
3596 if (IN_SET(errno, EAGAIN, EINTR))
3597 return 0;
3598
3599 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3600 }
3601 cmsg_close_all(&msghdr);
3602
3603 CMSG_FOREACH(cmsg, &msghdr) {
3604 if (cmsg->cmsg_level == SOL_SOCKET &&
3605 cmsg->cmsg_type == SCM_CREDENTIALS &&
3606 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3607
3608 ucred = (struct ucred*) CMSG_DATA(cmsg);
3609 }
3610 }
3611
3612 if (!ucred || ucred->pid != inner_child_pid) {
3613 log_debug("Received notify message without valid credentials. Ignoring.");
3614 return 0;
3615 }
3616
3617 if ((size_t) n >= sizeof(buf)) {
3618 log_warning("Received notify message exceeded maximum size. Ignoring.");
3619 return 0;
3620 }
3621
3622 buf[n] = 0;
3623 tags = strv_split(buf, "\n\r");
3624 if (!tags)
3625 return log_oom();
3626
3627 if (strv_find(tags, "READY=1"))
3628 (void) sd_notifyf(false, "READY=1\n");
3629
3630 p = strv_find_startswith(tags, "STATUS=");
3631 if (p)
3632 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
3633
3634 return 0;
3635 }
3636
3637 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3638 int r;
3639
3640 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3641 if (r < 0)
3642 return log_error_errno(r, "Failed to allocate notify event source: %m");
3643
3644 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3645
3646 return 0;
3647 }
3648
3649 static int merge_settings(Settings *settings, const char *path) {
3650 int rl;
3651
3652 assert(settings);
3653 assert(path);
3654
3655 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3656 * that this steals the fields of the Settings* structure, and hence modifies it. */
3657
3658 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3659 settings->start_mode >= 0) {
3660 arg_start_mode = settings->start_mode;
3661 strv_free_and_replace(arg_parameters, settings->parameters);
3662 }
3663
3664 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3665 arg_ephemeral = settings->ephemeral;
3666
3667 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3668 settings->root) {
3669
3670 if (!arg_settings_trusted)
3671 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3672 else
3673 free_and_replace(arg_directory, settings->root);
3674 }
3675
3676 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3677 settings->pivot_root_new) {
3678 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3679 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3680 }
3681
3682 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3683 settings->working_directory)
3684 free_and_replace(arg_chdir, settings->working_directory);
3685
3686 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3687 settings->environment)
3688 strv_free_and_replace(arg_setenv, settings->environment);
3689
3690 if ((arg_settings_mask & SETTING_USER) == 0) {
3691
3692 if (settings->user)
3693 free_and_replace(arg_user, settings->user);
3694
3695 if (uid_is_valid(settings->uid))
3696 arg_uid = settings->uid;
3697 if (gid_is_valid(settings->gid))
3698 arg_gid = settings->gid;
3699 if (settings->n_supplementary_gids > 0) {
3700 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3701 arg_n_supplementary_gids = settings->n_supplementary_gids;
3702 }
3703 }
3704
3705 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3706 uint64_t plus, minus;
3707
3708 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3709 * Settings structure */
3710
3711 plus = settings->capability;
3712 minus = settings->drop_capability;
3713
3714 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3715 if (settings_private_network(settings))
3716 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3717 else
3718 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3719 }
3720
3721 if (!arg_settings_trusted && plus != 0) {
3722 if (settings->capability != 0)
3723 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3724 } else
3725 arg_caps_retain |= plus;
3726
3727 arg_caps_retain &= ~minus;
3728
3729 /* Copy the full capabilities over too */
3730 if (capability_quintet_is_set(&settings->full_capabilities)) {
3731 if (!arg_settings_trusted)
3732 log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
3733 else
3734 arg_full_capabilities = settings->full_capabilities;
3735 }
3736 }
3737
3738 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3739 settings->kill_signal > 0)
3740 arg_kill_signal = settings->kill_signal;
3741
3742 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3743 settings->personality != PERSONALITY_INVALID)
3744 arg_personality = settings->personality;
3745
3746 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3747 !sd_id128_is_null(settings->machine_id)) {
3748
3749 if (!arg_settings_trusted)
3750 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3751 else
3752 arg_uuid = settings->machine_id;
3753 }
3754
3755 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3756 settings->read_only >= 0)
3757 arg_read_only = settings->read_only;
3758
3759 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3760 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3761 arg_volatile_mode = settings->volatile_mode;
3762
3763 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3764 settings->n_custom_mounts > 0) {
3765
3766 if (!arg_settings_trusted)
3767 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3768 else {
3769 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3770 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3771 arg_n_custom_mounts = settings->n_custom_mounts;
3772 settings->n_custom_mounts = 0;
3773 }
3774 }
3775
3776 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3777 (settings->private_network >= 0 ||
3778 settings->network_veth >= 0 ||
3779 settings->network_bridge ||
3780 settings->network_zone ||
3781 settings->network_interfaces ||
3782 settings->network_macvlan ||
3783 settings->network_ipvlan ||
3784 settings->network_veth_extra ||
3785 settings->network_namespace_path)) {
3786
3787 if (!arg_settings_trusted)
3788 log_warning("Ignoring network settings, file %s is not trusted.", path);
3789 else {
3790 arg_network_veth = settings_network_veth(settings);
3791 arg_private_network = settings_private_network(settings);
3792
3793 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3794 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3795 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3796 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3797
3798 free_and_replace(arg_network_bridge, settings->network_bridge);
3799 free_and_replace(arg_network_zone, settings->network_zone);
3800
3801 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
3802 }
3803 }
3804
3805 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3806 settings->expose_ports) {
3807
3808 if (!arg_settings_trusted)
3809 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3810 else {
3811 expose_port_free_all(arg_expose_ports);
3812 arg_expose_ports = TAKE_PTR(settings->expose_ports);
3813 }
3814 }
3815
3816 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3817 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3818
3819 if (!arg_settings_trusted)
3820 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3821 else {
3822 arg_userns_mode = settings->userns_mode;
3823 arg_uid_shift = settings->uid_shift;
3824 arg_uid_range = settings->uid_range;
3825 arg_userns_chown = settings->userns_chown;
3826 }
3827 }
3828
3829 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3830 arg_notify_ready = settings->notify_ready;
3831
3832 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3833
3834 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
3835 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3836 else {
3837 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3838 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3839 }
3840
3841 #if HAVE_SECCOMP
3842 if (!arg_settings_trusted && settings->seccomp)
3843 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3844 else {
3845 seccomp_release(arg_seccomp);
3846 arg_seccomp = TAKE_PTR(settings->seccomp);
3847 }
3848 #endif
3849 }
3850
3851 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3852 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3853 continue;
3854
3855 if (!settings->rlimit[rl])
3856 continue;
3857
3858 if (!arg_settings_trusted) {
3859 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3860 continue;
3861 }
3862
3863 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3864 }
3865
3866 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3867 settings->hostname)
3868 free_and_replace(arg_hostname, settings->hostname);
3869
3870 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3871 settings->no_new_privileges >= 0)
3872 arg_no_new_privileges = settings->no_new_privileges;
3873
3874 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3875 settings->oom_score_adjust_set) {
3876
3877 if (!arg_settings_trusted)
3878 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3879 else {
3880 arg_oom_score_adjust = settings->oom_score_adjust;
3881 arg_oom_score_adjust_set = true;
3882 }
3883 }
3884
3885 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3886 settings->cpuset) {
3887
3888 if (!arg_settings_trusted)
3889 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3890 else {
3891 if (arg_cpuset)
3892 CPU_FREE(arg_cpuset);
3893 arg_cpuset = TAKE_PTR(settings->cpuset);
3894 arg_cpuset_ncpus = settings->cpuset_ncpus;
3895 }
3896 }
3897
3898 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3899 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3900 arg_resolv_conf = settings->resolv_conf;
3901
3902 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3903 settings->link_journal != _LINK_JOURNAL_INVALID) {
3904
3905 if (!arg_settings_trusted)
3906 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3907 else {
3908 arg_link_journal = settings->link_journal;
3909 arg_link_journal_try = settings->link_journal_try;
3910 }
3911 }
3912
3913 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3914 settings->timezone != _TIMEZONE_MODE_INVALID)
3915 arg_timezone = settings->timezone;
3916
3917 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3918 settings->slice) {
3919
3920 if (!arg_settings_trusted)
3921 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3922 else
3923 free_and_replace(arg_slice, settings->slice);
3924 }
3925
3926 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
3927 settings->use_cgns >= 0) {
3928
3929 if (!arg_settings_trusted)
3930 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
3931 else
3932 arg_use_cgns = settings->use_cgns;
3933 }
3934
3935 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
3936 settings->clone_ns_flags != (unsigned long) -1) {
3937
3938 if (!arg_settings_trusted)
3939 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
3940 else
3941 arg_clone_ns_flags = settings->clone_ns_flags;
3942 }
3943
3944 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
3945 settings->console_mode >= 0) {
3946
3947 if (!arg_settings_trusted)
3948 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
3949 else
3950 arg_console_mode = settings->console_mode;
3951 }
3952
3953 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
3954 * don't consult arg_settings_mask for them. */
3955
3956 sd_bus_message_unref(arg_property_message);
3957 arg_property_message = TAKE_PTR(settings->properties);
3958
3959 arg_console_width = settings->console_width;
3960 arg_console_height = settings->console_height;
3961
3962 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3963 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
3964 arg_n_extra_nodes = settings->n_extra_nodes;
3965
3966 return 0;
3967 }
3968
3969 static int load_settings(void) {
3970 _cleanup_(settings_freep) Settings *settings = NULL;
3971 _cleanup_fclose_ FILE *f = NULL;
3972 _cleanup_free_ char *p = NULL;
3973 const char *fn, *i;
3974 int r;
3975
3976 if (arg_oci_bundle)
3977 return 0;
3978
3979 /* If all settings are masked, there's no point in looking for
3980 * the settings file */
3981 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3982 return 0;
3983
3984 fn = strjoina(arg_machine, ".nspawn");
3985
3986 /* We first look in the admin's directories in /etc and /run */
3987 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3988 _cleanup_free_ char *j = NULL;
3989
3990 j = strjoin(i, "/", fn);
3991 if (!j)
3992 return log_oom();
3993
3994 f = fopen(j, "re");
3995 if (f) {
3996 p = TAKE_PTR(j);
3997
3998 /* By default, we trust configuration from /etc and /run */
3999 if (arg_settings_trusted < 0)
4000 arg_settings_trusted = true;
4001
4002 break;
4003 }
4004
4005 if (errno != ENOENT)
4006 return log_error_errno(errno, "Failed to open %s: %m", j);
4007 }
4008
4009 if (!f) {
4010 /* After that, let's look for a file next to the
4011 * actual image we shall boot. */
4012
4013 if (arg_image) {
4014 p = file_in_same_dir(arg_image, fn);
4015 if (!p)
4016 return log_oom();
4017 } else if (arg_directory) {
4018 p = file_in_same_dir(arg_directory, fn);
4019 if (!p)
4020 return log_oom();
4021 }
4022
4023 if (p) {
4024 f = fopen(p, "re");
4025 if (!f && errno != ENOENT)
4026 return log_error_errno(errno, "Failed to open %s: %m", p);
4027
4028 /* By default, we do not trust configuration from /var/lib/machines */
4029 if (arg_settings_trusted < 0)
4030 arg_settings_trusted = false;
4031 }
4032 }
4033
4034 if (!f)
4035 return 0;
4036
4037 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4038
4039 r = settings_load(f, p, &settings);
4040 if (r < 0)
4041 return r;
4042
4043 return merge_settings(settings, p);
4044 }
4045
4046 static int load_oci_bundle(void) {
4047 _cleanup_(settings_freep) Settings *settings = NULL;
4048 int r;
4049
4050 if (!arg_oci_bundle)
4051 return 0;
4052
4053 /* By default let's trust OCI bundles */
4054 if (arg_settings_trusted < 0)
4055 arg_settings_trusted = true;
4056
4057 r = oci_load(NULL, arg_oci_bundle, &settings);
4058 if (r < 0)
4059 return r;
4060
4061 return merge_settings(settings, arg_oci_bundle);
4062 }
4063
4064 static int run_container(int master,
4065 const char* console,
4066 DissectedImage *dissected_image,
4067 bool secondary,
4068 FDSet *fds,
4069 char veth_name[IFNAMSIZ], bool *veth_created,
4070 union in_addr_union *exposed,
4071 pid_t *pid, int *ret) {
4072
4073 static const struct sigaction sa = {
4074 .sa_handler = nop_signal_handler,
4075 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4076 };
4077
4078 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4079 _cleanup_close_ int etc_passwd_lock = -1;
4080 _cleanup_close_pair_ int
4081 kmsg_socket_pair[2] = { -1, -1 },
4082 rtnl_socket_pair[2] = { -1, -1 },
4083 pid_socket_pair[2] = { -1, -1 },
4084 uuid_socket_pair[2] = { -1, -1 },
4085 notify_socket_pair[2] = { -1, -1 },
4086 uid_shift_socket_pair[2] = { -1, -1 },
4087 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4088
4089 _cleanup_close_ int notify_socket= -1;
4090 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4091 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4092 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4093 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4094 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4095 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4096 ContainerStatus container_status = 0;
4097 int ifi = 0, r;
4098 ssize_t l;
4099 sigset_t mask_chld;
4100 _cleanup_close_ int netns_fd = -1;
4101
4102 assert_se(sigemptyset(&mask_chld) == 0);
4103 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4104
4105 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4106 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4107 * check with getpwuid() if the specific user already exists. Note that /etc might be
4108 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4109 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4110 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4111 * really ours. */
4112
4113 etc_passwd_lock = take_etc_passwd_lock(NULL);
4114 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4115 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4116 }
4117
4118 r = barrier_create(&barrier);
4119 if (r < 0)
4120 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4121
4122 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4123 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4124
4125 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4126 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4127
4128 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4129 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4130
4131 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4132 return log_error_errno(errno, "Failed to create id socket pair: %m");
4133
4134 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4135 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4136
4137 if (arg_userns_mode != USER_NAMESPACE_NO)
4138 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4139 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4140
4141 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4142 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4143 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4144
4145 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4146 * parent's blocking calls and give it a chance to call wait() and terminate. */
4147 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4148 if (r < 0)
4149 return log_error_errno(errno, "Failed to change the signal mask: %m");
4150
4151 r = sigaction(SIGCHLD, &sa, NULL);
4152 if (r < 0)
4153 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4154
4155 if (arg_network_namespace_path) {
4156 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4157 if (netns_fd < 0)
4158 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4159
4160 r = fd_is_network_ns(netns_fd);
4161 if (r == -EUCLEAN)
4162 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4163 else if (r < 0)
4164 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4165 else if (r == 0) {
4166 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4167 return -EINVAL;
4168 }
4169 }
4170
4171 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4172 if (*pid < 0)
4173 return log_error_errno(errno, "clone() failed%s: %m",
4174 errno == EINVAL ?
4175 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4176
4177 if (*pid == 0) {
4178 /* The outer child only has a file system namespace. */
4179 barrier_set_role(&barrier, BARRIER_CHILD);
4180
4181 master = safe_close(master);
4182
4183 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4184 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4185 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4186 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4187 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4188 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4189 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4190
4191 (void) reset_all_signal_handlers();
4192 (void) reset_signal_mask();
4193
4194 r = outer_child(&barrier,
4195 arg_directory,
4196 console,
4197 dissected_image,
4198 secondary,
4199 pid_socket_pair[1],
4200 uuid_socket_pair[1],
4201 notify_socket_pair[1],
4202 kmsg_socket_pair[1],
4203 rtnl_socket_pair[1],
4204 uid_shift_socket_pair[1],
4205 unified_cgroup_hierarchy_socket_pair[1],
4206 fds,
4207 netns_fd);
4208 if (r < 0)
4209 _exit(EXIT_FAILURE);
4210
4211 _exit(EXIT_SUCCESS);
4212 }
4213
4214 barrier_set_role(&barrier, BARRIER_PARENT);
4215
4216 fdset_close(fds);
4217
4218 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4219 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4220 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4221 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4222 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4223 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4224 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4225
4226 if (arg_userns_mode != USER_NAMESPACE_NO) {
4227 /* The child just let us know the UID shift it might have read from the image. */
4228 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4229 if (l < 0)
4230 return log_error_errno(errno, "Failed to read UID shift: %m");
4231 if (l != sizeof arg_uid_shift) {
4232 log_error("Short read while reading UID shift.");
4233 return -EIO;
4234 }
4235
4236 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4237 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4238 * image, but if that's already in use, pick a new one, and report back to the child,
4239 * which one we now picked. */
4240
4241 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4242 if (r < 0)
4243 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4244
4245 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4246 if (l < 0)
4247 return log_error_errno(errno, "Failed to send UID shift: %m");
4248 if (l != sizeof arg_uid_shift) {
4249 log_error("Short write while writing UID shift.");
4250 return -EIO;
4251 }
4252 }
4253 }
4254
4255 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4256 /* The child let us know the support cgroup mode it might have read from the image. */
4257 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4258 if (l < 0)
4259 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4260 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
4261 log_error("Short read while reading cgroup mode (%zu bytes).%s",
4262 l, l == 0 ? " The child is most likely dead." : "");
4263 return -EIO;
4264 }
4265 }
4266
4267 /* Wait for the outer child. */
4268 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4269 if (r < 0)
4270 return r;
4271 if (r != EXIT_SUCCESS)
4272 return -EIO;
4273
4274 /* And now retrieve the PID of the inner child. */
4275 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4276 if (l < 0)
4277 return log_error_errno(errno, "Failed to read inner child PID: %m");
4278 if (l != sizeof *pid) {
4279 log_error("Short read while reading inner child PID.");
4280 return -EIO;
4281 }
4282
4283 /* We also retrieve container UUID in case it was generated by outer child */
4284 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4285 if (l < 0)
4286 return log_error_errno(errno, "Failed to read container machine ID: %m");
4287 if (l != sizeof(arg_uuid)) {
4288 log_error("Short read while reading container machined ID.");
4289 return -EIO;
4290 }
4291
4292 /* We also retrieve the socket used for notifications generated by outer child */
4293 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4294 if (notify_socket < 0)
4295 return log_error_errno(notify_socket,
4296 "Failed to receive notification socket from the outer child: %m");
4297
4298 log_debug("Init process invoked as PID "PID_FMT, *pid);
4299
4300 if (arg_userns_mode != USER_NAMESPACE_NO) {
4301 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4302 log_error("Child died too early.");
4303 return -ESRCH;
4304 }
4305
4306 r = setup_uid_map(*pid);
4307 if (r < 0)
4308 return r;
4309
4310 (void) barrier_place(&barrier); /* #2 */
4311 }
4312
4313 if (arg_private_network) {
4314 if (!arg_network_namespace_path) {
4315 /* Wait until the child has unshared its network namespace. */
4316 if (!barrier_place_and_sync(&barrier)) { /* #3 */
4317 log_error("Child died too early");
4318 return -ESRCH;
4319 }
4320 }
4321
4322 r = move_network_interfaces(*pid, arg_network_interfaces);
4323 if (r < 0)
4324 return r;
4325
4326 if (arg_network_veth) {
4327 r = setup_veth(arg_machine, *pid, veth_name,
4328 arg_network_bridge || arg_network_zone);
4329 if (r < 0)
4330 return r;
4331 else if (r > 0)
4332 ifi = r;
4333
4334 if (arg_network_bridge) {
4335 /* Add the interface to a bridge */
4336 r = setup_bridge(veth_name, arg_network_bridge, false);
4337 if (r < 0)
4338 return r;
4339 if (r > 0)
4340 ifi = r;
4341 } else if (arg_network_zone) {
4342 /* Add the interface to a bridge, possibly creating it */
4343 r = setup_bridge(veth_name, arg_network_zone, true);
4344 if (r < 0)
4345 return r;
4346 if (r > 0)
4347 ifi = r;
4348 }
4349 }
4350
4351 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4352 if (r < 0)
4353 return r;
4354
4355 /* We created the primary and extra veth links now; let's remember this, so that we know to
4356 remove them later on. Note that we don't bother with removing veth links that were created
4357 here when their setup failed half-way, because in that case the kernel should be able to
4358 remove them on its own, since they cannot be referenced by anything yet. */
4359 *veth_created = true;
4360
4361 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4362 if (r < 0)
4363 return r;
4364
4365 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4366 if (r < 0)
4367 return r;
4368 }
4369
4370 if (arg_register || !arg_keep_unit) {
4371 r = sd_bus_default_system(&bus);
4372 if (r < 0)
4373 return log_error_errno(r, "Failed to open system bus: %m");
4374
4375 r = sd_bus_set_close_on_exit(bus, false);
4376 if (r < 0)
4377 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4378 }
4379
4380 if (!arg_keep_unit) {
4381 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4382 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4383 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4384
4385 r = sd_bus_match_signal_async(
4386 bus,
4387 NULL,
4388 "org.freedesktop.systemd1",
4389 NULL,
4390 "org.freedesktop.systemd1.Scope",
4391 "RequestStop",
4392 on_request_stop, NULL, PID_TO_PTR(*pid));
4393 if (r < 0)
4394 return log_error_errno(r, "Failed to request RequestStop match: %m");
4395 }
4396
4397 if (arg_register) {
4398 r = register_machine(
4399 bus,
4400 arg_machine,
4401 *pid,
4402 arg_directory,
4403 arg_uuid,
4404 ifi,
4405 arg_slice,
4406 arg_custom_mounts, arg_n_custom_mounts,
4407 arg_kill_signal,
4408 arg_property,
4409 arg_property_message,
4410 arg_keep_unit,
4411 arg_container_service_name);
4412 if (r < 0)
4413 return r;
4414
4415 } else if (!arg_keep_unit) {
4416 r = allocate_scope(
4417 bus,
4418 arg_machine,
4419 *pid,
4420 arg_slice,
4421 arg_custom_mounts, arg_n_custom_mounts,
4422 arg_kill_signal,
4423 arg_property,
4424 arg_property_message);
4425 if (r < 0)
4426 return r;
4427
4428 } else if (arg_slice || arg_property)
4429 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
4430
4431 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
4432 if (r < 0)
4433 return r;
4434
4435 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4436 if (r < 0)
4437 return r;
4438
4439 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4440 if (r < 0)
4441 return r;
4442
4443 /* Notify the child that the parent is ready with all
4444 * its setup (including cgroup-ification), and that
4445 * the child can now hand over control to the code to
4446 * run inside the container. */
4447 (void) barrier_place(&barrier); /* #4 */
4448
4449 /* Block SIGCHLD here, before notifying child.
4450 * process_pty() will handle it with the other signals. */
4451 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4452
4453 /* Reset signal to default */
4454 r = default_signals(SIGCHLD, -1);
4455 if (r < 0)
4456 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4457
4458 r = sd_event_new(&event);
4459 if (r < 0)
4460 return log_error_errno(r, "Failed to get default event source: %m");
4461
4462 (void) sd_event_set_watchdog(event, true);
4463
4464 if (bus) {
4465 r = sd_bus_attach_event(bus, event, 0);
4466 if (r < 0)
4467 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4468 }
4469
4470 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
4471 if (r < 0)
4472 return r;
4473
4474 /* Let the child know that we are ready and wait that the child is completely ready now. */
4475 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4476 log_error("Child died too early.");
4477 return -ESRCH;
4478 }
4479
4480 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4481 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4482 etc_passwd_lock = safe_close(etc_passwd_lock);
4483
4484 (void) sd_notifyf(false,
4485 "STATUS=Container running.\n"
4486 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4487 if (!arg_notify_ready)
4488 (void) sd_notify(false, "READY=1\n");
4489
4490 if (arg_kill_signal > 0) {
4491 /* Try to kill the init system on SIGINT or SIGTERM */
4492 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4493 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4494 } else {
4495 /* Immediately exit */
4496 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4497 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4498 }
4499
4500 /* Exit when the child exits */
4501 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4502
4503 if (arg_expose_ports) {
4504 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4505 if (r < 0)
4506 return r;
4507
4508 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4509 }
4510
4511 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4512
4513 if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
4514 assert(master >= 0);
4515
4516 r = pty_forward_new(event, master,
4517 PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
4518 &forward);
4519 if (r < 0)
4520 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4521
4522 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4523 (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
4524 }
4525
4526 r = sd_event_loop(event);
4527 if (r < 0)
4528 return log_error_errno(r, "Failed to run event loop: %m");
4529
4530 if (forward) {
4531 char last_char = 0;
4532
4533 (void) pty_forward_get_last_char(forward, &last_char);
4534 forward = pty_forward_free(forward);
4535
4536 if (!arg_quiet && last_char != '\n')
4537 putc('\n', stdout);
4538 }
4539
4540 /* Kill if it is not dead yet anyway */
4541 if (bus) {
4542 if (arg_register)
4543 terminate_machine(bus, arg_machine);
4544 else if (!arg_keep_unit)
4545 terminate_scope(bus, arg_machine);
4546 }
4547
4548 /* Normally redundant, but better safe than sorry */
4549 (void) kill(*pid, SIGKILL);
4550
4551 r = wait_for_container(*pid, &container_status);
4552 *pid = 0;
4553
4554 if (r < 0)
4555 /* We failed to wait for the container, or the container exited abnormally. */
4556 return r;
4557 if (r > 0 || container_status == CONTAINER_TERMINATED) {
4558 /* r > 0 → The container exited with a non-zero status.
4559 * As a special case, we need to replace 133 with a different value,
4560 * because 133 is special-cased in the service file to reboot the container.
4561 * otherwise → The container exited with zero status and a reboot was not requested.
4562 */
4563 if (r == EXIT_FORCE_RESTART)
4564 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4565 *ret = r;
4566 return 0; /* finito */
4567 }
4568
4569 /* CONTAINER_REBOOTED, loop again */
4570
4571 if (arg_keep_unit) {
4572 /* Special handling if we are running as a service: instead of simply
4573 * restarting the machine we want to restart the entire service, so let's
4574 * inform systemd about this with the special exit code 133. The service
4575 * file uses RestartForceExitStatus=133 so that this results in a full
4576 * nspawn restart. This is necessary since we might have cgroup parameters
4577 * set we want to have flushed out. */
4578 *ret = EXIT_FORCE_RESTART;
4579 return 0; /* finito */
4580 }
4581
4582 expose_port_flush(arg_expose_ports, exposed);
4583
4584 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4585 *veth_created = false;
4586 return 1; /* loop again */
4587 }
4588
4589 static int initialize_rlimits(void) {
4590 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4591 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4592 * container execution environments. */
4593
4594 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4595 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4596 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4597 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4598 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4599 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4600 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4601 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4602 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4603 [RLIMIT_NICE] = { 0, 0 },
4604 [RLIMIT_NOFILE] = { 1024, 4096 },
4605 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4606 [RLIMIT_RTPRIO] = { 0, 0 },
4607 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4608 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4609
4610 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4611 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4612 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4613 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4614 * that PID 1 changes a number of other resource limits during early initialization which is why we
4615 * don't read the other limits from PID 1 but prefer the static table above. */
4616 };
4617
4618 int rl;
4619
4620 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4621 /* Let's only fill in what the user hasn't explicitly configured anyway */
4622 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4623 const struct rlimit *v;
4624 struct rlimit buffer;
4625
4626 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4627 /* For these two let's read the limits off PID 1. See above for an explanation. */
4628
4629 if (prlimit(1, rl, NULL, &buffer) < 0)
4630 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4631
4632 v = &buffer;
4633 } else
4634 v = kernel_defaults + rl;
4635
4636 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4637 if (!arg_rlimit[rl])
4638 return log_oom();
4639 }
4640
4641 if (DEBUG_LOGGING) {
4642 _cleanup_free_ char *k = NULL;
4643
4644 (void) rlimit_format(arg_rlimit[rl], &k);
4645 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4646 }
4647 }
4648
4649 return 0;
4650 }
4651
4652 static int run(int argc, char *argv[]) {
4653 _cleanup_free_ char *console = NULL;
4654 _cleanup_close_ int master = -1;
4655 _cleanup_fdset_free_ FDSet *fds = NULL;
4656 int r, n_fd_passed, ret = EXIT_SUCCESS;
4657 char veth_name[IFNAMSIZ] = "";
4658 bool secondary = false, remove_directory = false, remove_image = false;
4659 pid_t pid = 0;
4660 union in_addr_union exposed = {};
4661 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4662 bool veth_created = false, remove_tmprootdir = false;
4663 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4664 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4665 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4666 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4667
4668 log_parse_environment();
4669 log_open();
4670
4671 r = parse_argv(argc, argv);
4672 if (r <= 0)
4673 goto finish;
4674
4675 r = must_be_root();
4676 if (r < 0)
4677 goto finish;
4678
4679 r = initialize_rlimits();
4680 if (r < 0)
4681 goto finish;
4682
4683 r = load_oci_bundle();
4684 if (r < 0)
4685 goto finish;
4686
4687 r = determine_names();
4688 if (r < 0)
4689 goto finish;
4690
4691 r = load_settings();
4692 if (r < 0)
4693 goto finish;
4694
4695 r = cg_unified_flush();
4696 if (r < 0) {
4697 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4698 goto finish;
4699 }
4700
4701 r = verify_arguments();
4702 if (r < 0)
4703 goto finish;
4704
4705 r = detect_unified_cgroup_hierarchy_from_environment();
4706 if (r < 0)
4707 goto finish;
4708
4709 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4710 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4711 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4712 (void) ignore_signals(SIGPIPE, -1);
4713
4714 n_fd_passed = sd_listen_fds(false);
4715 if (n_fd_passed > 0) {
4716 r = fdset_new_listen_fds(&fds, false);
4717 if (r < 0) {
4718 log_error_errno(r, "Failed to collect file descriptors: %m");
4719 goto finish;
4720 }
4721 }
4722
4723 /* The "default" umask. This is appropriate for most file and directory
4724 * operations performed by nspawn, and is the umask that will be used for
4725 * the child. Functions like copy_devnodes() change the umask temporarily. */
4726 umask(0022);
4727
4728 if (arg_directory) {
4729 assert(!arg_image);
4730
4731 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4732 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4733 r = -EINVAL;
4734 goto finish;
4735 }
4736
4737 if (arg_ephemeral) {
4738 _cleanup_free_ char *np = NULL;
4739
4740 r = chase_symlinks_and_update(&arg_directory, 0);
4741 if (r < 0)
4742 goto finish;
4743
4744 /* If the specified path is a mount point we
4745 * generate the new snapshot immediately
4746 * inside it under a random name. However if
4747 * the specified is not a mount point we
4748 * create the new snapshot in the parent
4749 * directory, just next to it. */
4750 r = path_is_mount_point(arg_directory, NULL, 0);
4751 if (r < 0) {
4752 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4753 goto finish;
4754 }
4755 if (r > 0)
4756 r = tempfn_random_child(arg_directory, "machine.", &np);
4757 else
4758 r = tempfn_random(arg_directory, "machine.", &np);
4759 if (r < 0) {
4760 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4761 goto finish;
4762 }
4763
4764 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4765 if (r < 0) {
4766 log_error_errno(r, "Failed to lock %s: %m", np);
4767 goto finish;
4768 }
4769
4770 r = btrfs_subvol_snapshot(arg_directory, np,
4771 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4772 BTRFS_SNAPSHOT_FALLBACK_COPY |
4773 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4774 BTRFS_SNAPSHOT_RECURSIVE |
4775 BTRFS_SNAPSHOT_QUOTA);
4776 if (r < 0) {
4777 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4778 goto finish;
4779 }
4780
4781 free_and_replace(arg_directory, np);
4782
4783 remove_directory = true;
4784
4785 } else {
4786 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4787 if (r < 0)
4788 goto finish;
4789
4790 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4791 if (r == -EBUSY) {
4792 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4793 goto finish;
4794 }
4795 if (r < 0) {
4796 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4797 goto finish;
4798 }
4799
4800 if (arg_template) {
4801 r = chase_symlinks_and_update(&arg_template, 0);
4802 if (r < 0)
4803 goto finish;
4804
4805 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4806 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4807 BTRFS_SNAPSHOT_FALLBACK_COPY |
4808 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4809 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4810 BTRFS_SNAPSHOT_RECURSIVE |
4811 BTRFS_SNAPSHOT_QUOTA);
4812 if (r == -EEXIST)
4813 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4814 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4815 else if (r < 0) {
4816 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4817 goto finish;
4818 } else
4819 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4820 "Populated %s from template %s.", arg_directory, arg_template);
4821 }
4822 }
4823
4824 if (arg_start_mode == START_BOOT) {
4825 const char *p;
4826
4827 if (arg_pivot_root_new)
4828 p = prefix_roota(arg_directory, arg_pivot_root_new);
4829 else
4830 p = arg_directory;
4831
4832 if (path_is_os_tree(p) <= 0) {
4833 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4834 r = -EINVAL;
4835 goto finish;
4836 }
4837 } else {
4838 const char *p, *q;
4839
4840 if (arg_pivot_root_new)
4841 p = prefix_roota(arg_directory, arg_pivot_root_new);
4842 else
4843 p = arg_directory;
4844
4845 q = strjoina(p, "/usr/");
4846
4847 if (laccess(q, F_OK) < 0) {
4848 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4849 r = -EINVAL;
4850 goto finish;
4851 }
4852 }
4853
4854 } else {
4855 assert(arg_image);
4856 assert(!arg_template);
4857
4858 r = chase_symlinks_and_update(&arg_image, 0);
4859 if (r < 0)
4860 goto finish;
4861
4862 if (arg_ephemeral) {
4863 _cleanup_free_ char *np = NULL;
4864
4865 r = tempfn_random(arg_image, "machine.", &np);
4866 if (r < 0) {
4867 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4868 goto finish;
4869 }
4870
4871 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4872 if (r < 0) {
4873 r = log_error_errno(r, "Failed to create image lock: %m");
4874 goto finish;
4875 }
4876
4877 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
4878 if (r < 0) {
4879 r = log_error_errno(r, "Failed to copy image file: %m");
4880 goto finish;
4881 }
4882
4883 free_and_replace(arg_image, np);
4884
4885 remove_image = true;
4886 } else {
4887 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4888 if (r == -EBUSY) {
4889 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4890 goto finish;
4891 }
4892 if (r < 0) {
4893 r = log_error_errno(r, "Failed to create image lock: %m");
4894 goto finish;
4895 }
4896
4897 if (!arg_root_hash) {
4898 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4899 if (r < 0) {
4900 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4901 goto finish;
4902 }
4903 }
4904 }
4905
4906 if (!mkdtemp(tmprootdir)) {
4907 r = log_error_errno(errno, "Failed to create temporary directory: %m");
4908 goto finish;
4909 }
4910
4911 remove_tmprootdir = true;
4912
4913 arg_directory = strdup(tmprootdir);
4914 if (!arg_directory) {
4915 r = log_oom();
4916 goto finish;
4917 }
4918
4919 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4920 if (r < 0) {
4921 log_error_errno(r, "Failed to set up loopback block device: %m");
4922 goto finish;
4923 }
4924
4925 r = dissect_image_and_warn(
4926 loop->fd,
4927 arg_image,
4928 arg_root_hash, arg_root_hash_size,
4929 DISSECT_IMAGE_REQUIRE_ROOT,
4930 &dissected_image);
4931 if (r == -ENOPKG) {
4932 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
4933 log_notice("Note that the disk image needs to\n"
4934 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4935 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4936 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4937 " d) or contain a file system without a partition table\n"
4938 "in order to be bootable with systemd-nspawn.");
4939 goto finish;
4940 }
4941 if (r < 0)
4942 goto finish;
4943
4944 if (!arg_root_hash && dissected_image->can_verity)
4945 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4946
4947 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
4948 if (r < 0)
4949 goto finish;
4950
4951 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4952 if (remove_image && unlink(arg_image) >= 0)
4953 remove_image = false;
4954 }
4955
4956 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
4957 if (r < 0)
4958 goto finish;
4959
4960 if (arg_console_mode < 0)
4961 arg_console_mode =
4962 isatty(STDIN_FILENO) > 0 &&
4963 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
4964
4965 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
4966 arg_quiet = true;
4967
4968 if (arg_console_mode != CONSOLE_PIPE) {
4969 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
4970 if (master < 0) {
4971 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4972 goto finish;
4973 }
4974
4975 r = ptsname_malloc(master, &console);
4976 if (r < 0) {
4977 r = log_error_errno(r, "Failed to determine tty name: %m");
4978 goto finish;
4979 }
4980
4981 if (arg_selinux_apifs_context) {
4982 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4983 if (r < 0)
4984 goto finish;
4985 }
4986
4987 if (unlockpt(master) < 0) {
4988 r = log_error_errno(errno, "Failed to unlock tty: %m");
4989 goto finish;
4990 }
4991 }
4992
4993 if (!arg_quiet)
4994 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4995 arg_machine, arg_image ?: arg_directory);
4996
4997 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4998
4999 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5000 r = log_error_errno(errno, "Failed to become subreaper: %m");
5001 goto finish;
5002 }
5003
5004 for (;;) {
5005 r = run_container(master,
5006 console,
5007 dissected_image,
5008 secondary,
5009 fds,
5010 veth_name, &veth_created,
5011 &exposed,
5012 &pid, &ret);
5013 if (r <= 0)
5014 break;
5015 }
5016
5017 finish:
5018 (void) sd_notify(false,
5019 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5020 "STOPPING=1\nSTATUS=Terminating...");
5021
5022 if (pid > 0)
5023 (void) kill(pid, SIGKILL);
5024
5025 /* Try to flush whatever is still queued in the pty */
5026 if (master >= 0) {
5027 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
5028 master = safe_close(master);
5029 }
5030
5031 if (pid > 0)
5032 (void) wait_for_terminate(pid, NULL);
5033
5034 pager_close();
5035
5036 if (remove_directory && arg_directory) {
5037 int k;
5038
5039 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5040 if (k < 0)
5041 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5042 }
5043
5044 if (remove_image && arg_image) {
5045 if (unlink(arg_image) < 0)
5046 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5047 }
5048
5049 if (remove_tmprootdir) {
5050 if (rmdir(tmprootdir) < 0)
5051 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5052 }
5053
5054 if (arg_machine) {
5055 const char *p;
5056
5057 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5058 (void) rm_rf(p, REMOVE_ROOT);
5059 }
5060
5061 expose_port_flush(arg_expose_ports, &exposed);
5062
5063 if (veth_created)
5064 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5065 (void) remove_bridge(arg_network_zone);
5066
5067 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5068 expose_port_free_all(arg_expose_ports);
5069 rlimit_free_all(arg_rlimit);
5070 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5071
5072 if (r < 0)
5073 return r;
5074
5075 return ret;
5076 }
5077
/* Passes run() to the macro below. run() finishes by returning r when r < 0 and ret otherwise.
 * NOTE(review): presumably this macro (likely from main-func.h) generates main() and treats a positive
 * return value as the process exit status — confirm against the macro's definition. */
5078 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);