]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tree-wide: use SYNTHETIC_ERRNO() where appropriate
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_BLKID
4 #include <blkid.h>
5 #endif
6 #include <errno.h>
7 #include <getopt.h>
8 #include <grp.h>
9 #include <linux/fs.h>
10 #include <linux/loop.h>
11 #include <pwd.h>
12 #include <sched.h>
13 #if HAVE_SELINUX
14 #include <selinux/selinux.h>
15 #endif
16 #include <signal.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/personality.h>
22 #include <sys/prctl.h>
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 #include <unistd.h>
26
27 #include "sd-bus.h"
28 #include "sd-daemon.h"
29 #include "sd-id128.h"
30
31 #include "alloc-util.h"
32 #include "barrier.h"
33 #include "base-filesystem.h"
34 #include "blkid-util.h"
35 #include "btrfs-util.h"
36 #include "bus-error.h"
37 #include "bus-util.h"
38 #include "cap-list.h"
39 #include "capability-util.h"
40 #include "cgroup-util.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "dev-setup.h"
44 #include "dissect-image.h"
45 #include "env-util.h"
46 #include "fd-util.h"
47 #include "fdset.h"
48 #include "fileio.h"
49 #include "format-util.h"
50 #include "fs-util.h"
51 #include "gpt.h"
52 #include "hexdecoct.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "log.h"
56 #include "loop-util.h"
57 #include "loopback-setup.h"
58 #include "machine-image.h"
59 #include "macro.h"
60 #include "main-func.h"
61 #include "missing.h"
62 #include "mkdir.h"
63 #include "mount-util.h"
64 #include "mountpoint-util.h"
65 #include "namespace-util.h"
66 #include "netlink-util.h"
67 #include "nspawn-cgroup.h"
68 #include "nspawn-def.h"
69 #include "nspawn-expose-ports.h"
70 #include "nspawn-mount.h"
71 #include "nspawn-network.h"
72 #include "nspawn-oci.h"
73 #include "nspawn-patch-uid.h"
74 #include "nspawn-register.h"
75 #include "nspawn-seccomp.h"
76 #include "nspawn-settings.h"
77 #include "nspawn-setuid.h"
78 #include "nspawn-stub-pid1.h"
79 #include "nulstr-util.h"
80 #include "os-util.h"
81 #include "pager.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "pretty-print.h"
85 #include "process-util.h"
86 #include "ptyfwd.h"
87 #include "random-util.h"
88 #include "raw-clone.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #if HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "socket-util.h"
97 #include "stat-util.h"
98 #include "stdio-util.h"
99 #include "string-table.h"
100 #include "string-util.h"
101 #include "strv.h"
102 #include "sysctl-util.h"
103 #include "terminal-util.h"
104 #include "tmpfile-util.h"
105 #include "umask-util.h"
106 #include "user-util.h"
107 #include "util.h"
108
/* Location of systemd's static resolv.conf; the prefix depends on whether this is a split-/usr build
 * (HAVE_SPLIT_USR). */
#if HAVE_SPLIT_USR
#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#else
#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#endif

/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to
 * the container's root. The init process (PID 1) inside the container can send messages to nspawn over
 * this socket, following the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): the users of this exit code are outside this chunk; presumably a payload exiting with
 * this status asks nspawn to restart the container — confirm against the call sites. */
#define EXIT_FORCE_RESTART 133
121
/* How a container run ended.
 * NOTE(review): the code that produces and consumes these values is outside this chunk; the names
 * suggest "payload exited for good" vs. "payload asked for a reboot" — confirm at the call sites. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
126
127 static char *arg_directory = NULL;
128 static char *arg_template = NULL;
129 static char *arg_chdir = NULL;
130 static char *arg_pivot_root_new = NULL;
131 static char *arg_pivot_root_old = NULL;
132 static char *arg_user = NULL;
133 static uid_t arg_uid = UID_INVALID;
134 static gid_t arg_gid = GID_INVALID;
135 static gid_t* arg_supplementary_gids = NULL;
136 static size_t arg_n_supplementary_gids = 0;
137 static sd_id128_t arg_uuid = {};
138 static char *arg_machine = NULL; /* The name used by the host to refer to this */
139 static char *arg_hostname = NULL; /* The name the payload sees by default */
140 static const char *arg_selinux_context = NULL;
141 static const char *arg_selinux_apifs_context = NULL;
142 static char *arg_slice = NULL;
143 static bool arg_private_network = false;
144 static bool arg_read_only = false;
145 static StartMode arg_start_mode = START_PID1;
146 static bool arg_ephemeral = false;
147 static LinkJournal arg_link_journal = LINK_AUTO;
148 static bool arg_link_journal_try = false;
149 static uint64_t arg_caps_retain =
150 (1ULL << CAP_AUDIT_CONTROL) |
151 (1ULL << CAP_AUDIT_WRITE) |
152 (1ULL << CAP_CHOWN) |
153 (1ULL << CAP_DAC_OVERRIDE) |
154 (1ULL << CAP_DAC_READ_SEARCH) |
155 (1ULL << CAP_FOWNER) |
156 (1ULL << CAP_FSETID) |
157 (1ULL << CAP_IPC_OWNER) |
158 (1ULL << CAP_KILL) |
159 (1ULL << CAP_LEASE) |
160 (1ULL << CAP_LINUX_IMMUTABLE) |
161 (1ULL << CAP_MKNOD) |
162 (1ULL << CAP_NET_BIND_SERVICE) |
163 (1ULL << CAP_NET_BROADCAST) |
164 (1ULL << CAP_NET_RAW) |
165 (1ULL << CAP_SETFCAP) |
166 (1ULL << CAP_SETGID) |
167 (1ULL << CAP_SETPCAP) |
168 (1ULL << CAP_SETUID) |
169 (1ULL << CAP_SYS_ADMIN) |
170 (1ULL << CAP_SYS_BOOT) |
171 (1ULL << CAP_SYS_CHROOT) |
172 (1ULL << CAP_SYS_NICE) |
173 (1ULL << CAP_SYS_PTRACE) |
174 (1ULL << CAP_SYS_RESOURCE) |
175 (1ULL << CAP_SYS_TTY_CONFIG);
176 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
177 static CustomMount *arg_custom_mounts = NULL;
178 static size_t arg_n_custom_mounts = 0;
179 static char **arg_setenv = NULL;
180 static bool arg_quiet = false;
181 static bool arg_register = true;
182 static bool arg_keep_unit = false;
183 static char **arg_network_interfaces = NULL;
184 static char **arg_network_macvlan = NULL;
185 static char **arg_network_ipvlan = NULL;
186 static bool arg_network_veth = false;
187 static char **arg_network_veth_extra = NULL;
188 static char *arg_network_bridge = NULL;
189 static char *arg_network_zone = NULL;
190 static char *arg_network_namespace_path = NULL;
191 static PagerFlags arg_pager_flags = 0;
192 static unsigned long arg_personality = PERSONALITY_INVALID;
193 static char *arg_image = NULL;
194 static char *arg_oci_bundle = NULL;
195 static VolatileMode arg_volatile_mode = VOLATILE_NO;
196 static ExposePort *arg_expose_ports = NULL;
197 static char **arg_property = NULL;
198 static sd_bus_message *arg_property_message = NULL;
199 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
200 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
201 static bool arg_userns_chown = false;
202 static int arg_kill_signal = 0;
203 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
204 static SettingsMask arg_settings_mask = 0;
205 static int arg_settings_trusted = -1;
206 static char **arg_parameters = NULL;
207 static const char *arg_container_service_name = "systemd-nspawn";
208 static bool arg_notify_ready = false;
209 static bool arg_use_cgns = true;
210 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
211 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
212 static void *arg_root_hash = NULL;
213 static size_t arg_root_hash_size = 0;
214 static char **arg_syscall_whitelist = NULL;
215 static char **arg_syscall_blacklist = NULL;
216 #if HAVE_SECCOMP
217 static scmp_filter_ctx arg_seccomp = NULL;
218 #endif
219 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
220 static bool arg_no_new_privileges = false;
221 static int arg_oom_score_adjust = 0;
222 static bool arg_oom_score_adjust_set = false;
223 static cpu_set_t *arg_cpuset = NULL;
224 static unsigned arg_cpuset_ncpus = 0;
225 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
226 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
227 static unsigned arg_console_width = (unsigned) -1, arg_console_height = (unsigned) -1;
228 static DeviceNode* arg_extra_nodes = NULL;
229 static size_t arg_n_extra_nodes = 0;
230 static char **arg_sysctl = NULL;
231 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
232
/* Cleanup registrations for the arg_* globals: each entry pairs one variable with the helper that
 * releases it. NOTE(review): STATIC_DESTRUCTOR_REGISTER is defined outside this file; presumably the
 * registered helpers are invoked when the program exits via the main-func.h machinery — confirm.
 * NOTE(review): arg_custom_mounts, arg_expose_ports, arg_extra_nodes and arg_rlimit have no entry
 * here; check that they are released elsewhere. */
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_root_hash, freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_whitelist, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_syscall_blacklist, strv_freep);
/* arg_seccomp only exists in builds with seccomp support */
#if HAVE_SECCOMP
STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpuset, CPU_FREEp);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
264
265 static int help(void) {
266 _cleanup_free_ char *link = NULL;
267 int r;
268
269 (void) pager_open(arg_pager_flags);
270
271 r = terminal_urlify_man("systemd-nspawn", "1", &link);
272 if (r < 0)
273 return log_oom();
274
275 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
276 "Spawn a command or OS in a light-weight container.\n\n"
277 " -h --help Show this help\n"
278 " --version Print version string\n"
279 " -q --quiet Do not show status information\n"
280 " --no-pager Do not pipe output into a pager\n"
281 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
282 "%3$sImage:%4$s\n"
283 " -D --directory=PATH Root directory for the container\n"
284 " --template=PATH Initialize root directory from template directory,\n"
285 " if missing\n"
286 " -x --ephemeral Run container with snapshot of root directory, and\n"
287 " remove it after exit\n"
288 " -i --image=PATH Root file system disk image (or device node) for\n"
289 " the container\n"
290 " --oci-bundle=PATH OCI bundle directory\n"
291 " --read-only Mount the root directory read-only\n"
292 " --volatile[=MODE] Run the system in volatile mode\n"
293 " --root-hash=HASH Specify verity root hash for root disk image\n"
294 " --pivot-root=PATH[:PATH]\n"
295 " Pivot root to given directory in the container\n\n"
296 "%3$sExecution:%4$s\n"
297 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
298 " -b --boot Boot up full system (i.e. invoke init)\n"
299 " --chdir=PATH Set working directory in the container\n"
300 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
301 " -u --user=USER Run the command under specified user or UID\n"
302 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
303 " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
304 "%3$sSystem Identity:%4$s\n"
305 " -M --machine=NAME Set the machine name for the container\n"
306 " --hostname=NAME Override the hostname for the container\n"
307 " --uuid=UUID Set a specific machine UUID for the container\n\n"
308 "%3$sProperties:%4$s\n"
309 " -S --slice=SLICE Place the container in the specified slice\n"
310 " --property=NAME=VALUE Set scope unit property\n"
311 " --register=BOOLEAN Register container as machine\n"
312 " --keep-unit Do not register a scope for the machine, reuse\n"
313 " the service unit nspawn is running in\n\n"
314 "%3$sUser Namespacing:%4$s\n"
315 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
316 " --private-users[=UIDBASE[:NUIDS]]\n"
317 " Similar, but with user configured UID/GID range\n"
318 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n\n"
319 "%3$sNetworking:%4$s\n"
320 " --private-network Disable network in container\n"
321 " --network-interface=INTERFACE\n"
322 " Assign an existing network interface to the\n"
323 " container\n"
324 " --network-macvlan=INTERFACE\n"
325 " Create a macvlan network interface based on an\n"
326 " existing network interface to the container\n"
327 " --network-ipvlan=INTERFACE\n"
328 " Create a ipvlan network interface based on an\n"
329 " existing network interface to the container\n"
330 " -n --network-veth Add a virtual Ethernet connection between host\n"
331 " and container\n"
332 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
333 " Add an additional virtual Ethernet link between\n"
334 " host and container\n"
335 " --network-bridge=INTERFACE\n"
336 " Add a virtual Ethernet connection to the container\n"
337 " and attach it to an existing bridge on the host\n"
338 " --network-zone=NAME Similar, but attach the new interface to an\n"
339 " an automatically managed bridge interface\n"
340 " --network-namespace-path=PATH\n"
341 " Set network namespace to the one represented by\n"
342 " the specified kernel namespace file node\n"
343 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
344 " Expose a container IP port on the host\n\n"
345 "%3$sSecurity:%4$s\n"
346 " --capability=CAP In addition to the default, retain specified\n"
347 " capability\n"
348 " --drop-capability=CAP Drop the specified capability from the default set\n"
349 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
350 " --system-call-filter=LIST|~LIST\n"
351 " Permit/prohibit specific system calls\n"
352 " -Z --selinux-context=SECLABEL\n"
353 " Set the SELinux security context to be used by\n"
354 " processes in the container\n"
355 " -L --selinux-apifs-context=SECLABEL\n"
356 " Set the SELinux security context to be used by\n"
357 " API/tmpfs file systems in the container\n\n"
358 "%3$sResources:%4$s\n"
359 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
360 " --oom-score-adjust=VALUE\n"
361 " Adjust the OOM score value for the payload\n"
362 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
363 " --personality=ARCH Pick personality for this container\n\n"
364 "%3$sIntegration:%4$s\n"
365 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
366 " --timezone=MODE Select mode of /etc/localtime initialization\n"
367 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
368 " host, try-guest, try-host\n"
369 " -j Equivalent to --link-journal=try-guest\n\n"
370 "%3$sMounts:%4$s\n"
371 " --bind=PATH[:PATH[:OPTIONS]]\n"
372 " Bind mount a file or directory from the host into\n"
373 " the container\n"
374 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
375 " Similar, but creates a read-only bind mount\n"
376 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
377 " it\n"
378 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
379 " --overlay=PATH[:PATH...]:PATH\n"
380 " Create an overlay mount from the host to \n"
381 " the container\n"
382 " --overlay-ro=PATH[:PATH...]:PATH\n"
383 " Similar, but creates a read-only overlay mount\n\n"
384 "%3$sInput/Output:%4$s\n"
385 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
386 " set up for the container.\n"
387 " -P --pipe Equivalent to --console=pipe\n"
388 "\nSee the %2$s for details.\n"
389 , program_invocation_short_name
390 , link
391 , ansi_underline(), ansi_normal());
392
393 return 0;
394 }
395
396 static int custom_mount_check_all(void) {
397 size_t i;
398
399 for (i = 0; i < arg_n_custom_mounts; i++) {
400 CustomMount *m = &arg_custom_mounts[i];
401
402 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
403 if (arg_userns_chown)
404 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
405 "--private-users-chown may not be combined with custom root mounts.");
406 else if (arg_uid_shift == UID_INVALID)
407 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
408 "--private-users with automatic UID shift may not be combined with custom root mounts.");
409 }
410 }
411
412 return 0;
413 }
414
415 static int detect_unified_cgroup_hierarchy_from_environment(void) {
416 const char *e;
417 int r;
418
419 /* Allow the user to control whether the unified hierarchy is used */
420 e = getenv("UNIFIED_CGROUP_HIERARCHY");
421 if (e) {
422 r = parse_boolean(e);
423 if (r < 0)
424 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
425 if (r > 0)
426 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
427 else
428 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
429 }
430
431 return 0;
432 }
433
434 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
435 int r;
436
437 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
438 * image actually supports. */
439 r = cg_all_unified();
440 if (r < 0)
441 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
442 if (r > 0) {
443 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
444 * routine only detects 231, so we'll have a false negative here for 230. */
445 r = systemd_installation_has_version(directory, 230);
446 if (r < 0)
447 return log_error_errno(r, "Failed to determine systemd version in container: %m");
448 if (r > 0)
449 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
450 else
451 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
452 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
453 /* Mixed cgroup hierarchy support was added in 233 */
454 r = systemd_installation_has_version(directory, 233);
455 if (r < 0)
456 return log_error_errno(r, "Failed to determine systemd version in container: %m");
457 if (r > 0)
458 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
459 else
460 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
461 } else
462 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
463
464 log_debug("Using %s hierarchy for container.",
465 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
466 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
467
468 return 0;
469 }
470
471 static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
472 int r;
473
474 r = getenv_bool(name);
475 if (r == -ENXIO)
476 return;
477 if (r < 0)
478 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
479
480 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
481 arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
482 }
483
484 static void parse_mount_settings_env(void) {
485 const char *e;
486 int r;
487
488 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
489 if (r >= 0)
490 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
491 else if (r != -ENXIO)
492 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
493
494 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
495 if (!e)
496 return;
497
498 if (streq(e, "network")) {
499 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
500 return;
501 }
502
503 r = parse_boolean(e);
504 if (r < 0) {
505 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
506 return;
507 }
508
509 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
510 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
511 }
512
513 static void parse_environment(void) {
514 const char *e;
515 int r;
516
517 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
518 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
519 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
520 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
521
522 parse_mount_settings_env();
523
524 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
525 * even if it is supported. If not supported, it has no effect. */
526 if (!cg_ns_supported())
527 arg_use_cgns = false;
528 else {
529 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
530 if (r < 0) {
531 if (r != -ENXIO)
532 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS, ignoring: %m");
533
534 arg_use_cgns = true;
535 } else {
536 arg_use_cgns = r > 0;
537 arg_settings_mask |= SETTING_USE_CGNS;
538 }
539 }
540
541 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
542 if (e)
543 arg_container_service_name = e;
544
545 detect_unified_cgroup_hierarchy_from_environment();
546 }
547
548 static int parse_argv(int argc, char *argv[]) {
549 enum {
550 ARG_VERSION = 0x100,
551 ARG_PRIVATE_NETWORK,
552 ARG_UUID,
553 ARG_READ_ONLY,
554 ARG_CAPABILITY,
555 ARG_DROP_CAPABILITY,
556 ARG_LINK_JOURNAL,
557 ARG_BIND,
558 ARG_BIND_RO,
559 ARG_TMPFS,
560 ARG_OVERLAY,
561 ARG_OVERLAY_RO,
562 ARG_INACCESSIBLE,
563 ARG_SHARE_SYSTEM,
564 ARG_REGISTER,
565 ARG_KEEP_UNIT,
566 ARG_NETWORK_INTERFACE,
567 ARG_NETWORK_MACVLAN,
568 ARG_NETWORK_IPVLAN,
569 ARG_NETWORK_BRIDGE,
570 ARG_NETWORK_ZONE,
571 ARG_NETWORK_VETH_EXTRA,
572 ARG_NETWORK_NAMESPACE_PATH,
573 ARG_PERSONALITY,
574 ARG_VOLATILE,
575 ARG_TEMPLATE,
576 ARG_PROPERTY,
577 ARG_PRIVATE_USERS,
578 ARG_KILL_SIGNAL,
579 ARG_SETTINGS,
580 ARG_CHDIR,
581 ARG_PIVOT_ROOT,
582 ARG_PRIVATE_USERS_CHOWN,
583 ARG_NOTIFY_READY,
584 ARG_ROOT_HASH,
585 ARG_SYSTEM_CALL_FILTER,
586 ARG_RLIMIT,
587 ARG_HOSTNAME,
588 ARG_NO_NEW_PRIVILEGES,
589 ARG_OOM_SCORE_ADJUST,
590 ARG_CPU_AFFINITY,
591 ARG_RESOLV_CONF,
592 ARG_TIMEZONE,
593 ARG_CONSOLE,
594 ARG_PIPE,
595 ARG_OCI_BUNDLE,
596 ARG_NO_PAGER,
597 };
598
599 static const struct option options[] = {
600 { "help", no_argument, NULL, 'h' },
601 { "version", no_argument, NULL, ARG_VERSION },
602 { "directory", required_argument, NULL, 'D' },
603 { "template", required_argument, NULL, ARG_TEMPLATE },
604 { "ephemeral", no_argument, NULL, 'x' },
605 { "user", required_argument, NULL, 'u' },
606 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
607 { "as-pid2", no_argument, NULL, 'a' },
608 { "boot", no_argument, NULL, 'b' },
609 { "uuid", required_argument, NULL, ARG_UUID },
610 { "read-only", no_argument, NULL, ARG_READ_ONLY },
611 { "capability", required_argument, NULL, ARG_CAPABILITY },
612 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
613 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
614 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
615 { "bind", required_argument, NULL, ARG_BIND },
616 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
617 { "tmpfs", required_argument, NULL, ARG_TMPFS },
618 { "overlay", required_argument, NULL, ARG_OVERLAY },
619 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
620 { "inaccessible", required_argument, NULL, ARG_INACCESSIBLE },
621 { "machine", required_argument, NULL, 'M' },
622 { "hostname", required_argument, NULL, ARG_HOSTNAME },
623 { "slice", required_argument, NULL, 'S' },
624 { "setenv", required_argument, NULL, 'E' },
625 { "selinux-context", required_argument, NULL, 'Z' },
626 { "selinux-apifs-context", required_argument, NULL, 'L' },
627 { "quiet", no_argument, NULL, 'q' },
628 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
629 { "register", required_argument, NULL, ARG_REGISTER },
630 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
631 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
632 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
633 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
634 { "network-veth", no_argument, NULL, 'n' },
635 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
636 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
637 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
638 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
639 { "personality", required_argument, NULL, ARG_PERSONALITY },
640 { "image", required_argument, NULL, 'i' },
641 { "volatile", optional_argument, NULL, ARG_VOLATILE },
642 { "port", required_argument, NULL, 'p' },
643 { "property", required_argument, NULL, ARG_PROPERTY },
644 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
645 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
646 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
647 { "settings", required_argument, NULL, ARG_SETTINGS },
648 { "chdir", required_argument, NULL, ARG_CHDIR },
649 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
650 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
651 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
652 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
653 { "rlimit", required_argument, NULL, ARG_RLIMIT },
654 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
655 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
656 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
657 { "timezone", required_argument, NULL, ARG_TIMEZONE },
658 { "console", required_argument, NULL, ARG_CONSOLE },
659 { "pipe", no_argument, NULL, ARG_PIPE },
660 { "oci-bundle", required_argument, NULL, ARG_OCI_BUNDLE },
661 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
662 {}
663 };
664
665 int c, r;
666 const char *p;
667 uint64_t plus = 0, minus = 0;
668 bool mask_all_settings = false, mask_no_settings = false;
669
670 assert(argc >= 0);
671 assert(argv);
672
673 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
674 switch (c) {
675
676 case 'h':
677 return help();
678
679 case ARG_VERSION:
680 return version();
681
682 case 'D':
683 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
684 if (r < 0)
685 return r;
686
687 arg_settings_mask |= SETTING_DIRECTORY;
688 break;
689
690 case ARG_TEMPLATE:
691 r = parse_path_argument_and_warn(optarg, false, &arg_template);
692 if (r < 0)
693 return r;
694
695 arg_settings_mask |= SETTING_DIRECTORY;
696 break;
697
698 case 'i':
699 r = parse_path_argument_and_warn(optarg, false, &arg_image);
700 if (r < 0)
701 return r;
702
703 arg_settings_mask |= SETTING_DIRECTORY;
704 break;
705
706 case ARG_OCI_BUNDLE:
707 r = parse_path_argument_and_warn(optarg, false, &arg_oci_bundle);
708 if (r < 0)
709 return r;
710
711 break;
712
713 case 'x':
714 arg_ephemeral = true;
715 arg_settings_mask |= SETTING_EPHEMERAL;
716 break;
717
718 case 'u':
719 r = free_and_strdup(&arg_user, optarg);
720 if (r < 0)
721 return log_oom();
722
723 arg_settings_mask |= SETTING_USER;
724 break;
725
726 case ARG_NETWORK_ZONE: {
727 char *j;
728
729 j = strappend("vz-", optarg);
730 if (!j)
731 return log_oom();
732
733 if (!ifname_valid(j)) {
734 log_error("Network zone name not valid: %s", j);
735 free(j);
736 return -EINVAL;
737 }
738
739 free_and_replace(arg_network_zone, j);
740
741 arg_network_veth = true;
742 arg_private_network = true;
743 arg_settings_mask |= SETTING_NETWORK;
744 break;
745 }
746
747 case ARG_NETWORK_BRIDGE:
748
749 if (!ifname_valid(optarg))
750 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
751 "Bridge interface name not valid: %s", optarg);
752
753 r = free_and_strdup(&arg_network_bridge, optarg);
754 if (r < 0)
755 return log_oom();
756
757 _fallthrough_;
758 case 'n':
759 arg_network_veth = true;
760 arg_private_network = true;
761 arg_settings_mask |= SETTING_NETWORK;
762 break;
763
764 case ARG_NETWORK_VETH_EXTRA:
765 r = veth_extra_parse(&arg_network_veth_extra, optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
768
769 arg_private_network = true;
770 arg_settings_mask |= SETTING_NETWORK;
771 break;
772
773 case ARG_NETWORK_INTERFACE:
774 if (!ifname_valid(optarg))
775 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
776 "Network interface name not valid: %s", optarg);
777
778 if (strv_extend(&arg_network_interfaces, optarg) < 0)
779 return log_oom();
780
781 arg_private_network = true;
782 arg_settings_mask |= SETTING_NETWORK;
783 break;
784
785 case ARG_NETWORK_MACVLAN:
786
787 if (!ifname_valid(optarg))
788 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
789 "MACVLAN network interface name not valid: %s", optarg);
790
791 if (strv_extend(&arg_network_macvlan, optarg) < 0)
792 return log_oom();
793
794 arg_private_network = true;
795 arg_settings_mask |= SETTING_NETWORK;
796 break;
797
798 case ARG_NETWORK_IPVLAN:
799
800 if (!ifname_valid(optarg))
801 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
802 "IPVLAN network interface name not valid: %s", optarg);
803
804 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
805 return log_oom();
806
807 _fallthrough_;
808 case ARG_PRIVATE_NETWORK:
809 arg_private_network = true;
810 arg_settings_mask |= SETTING_NETWORK;
811 break;
812
813 case ARG_NETWORK_NAMESPACE_PATH:
814 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
815 if (r < 0)
816 return r;
817
818 arg_settings_mask |= SETTING_NETWORK;
819 break;
820
821 case 'b':
822 if (arg_start_mode == START_PID2)
823 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
824 "--boot and --as-pid2 may not be combined.");
825
826 arg_start_mode = START_BOOT;
827 arg_settings_mask |= SETTING_START_MODE;
828 break;
829
830 case 'a':
831 if (arg_start_mode == START_BOOT)
832 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
833 "--boot and --as-pid2 may not be combined.");
834
835 arg_start_mode = START_PID2;
836 arg_settings_mask |= SETTING_START_MODE;
837 break;
838
839 case ARG_UUID:
840 r = sd_id128_from_string(optarg, &arg_uuid);
841 if (r < 0)
842 return log_error_errno(r, "Invalid UUID: %s", optarg);
843
844 if (sd_id128_is_null(arg_uuid))
845 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
846 "Machine UUID may not be all zeroes.");
847
848 arg_settings_mask |= SETTING_MACHINE_ID;
849 break;
850
851 case 'S':
852 r = free_and_strdup(&arg_slice, optarg);
853 if (r < 0)
854 return log_oom();
855
856 arg_settings_mask |= SETTING_SLICE;
857 break;
858
859 case 'M':
860 if (isempty(optarg))
861 arg_machine = mfree(arg_machine);
862 else {
863 if (!machine_name_is_valid(optarg))
864 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
865 "Invalid machine name: %s", optarg);
866
867 r = free_and_strdup(&arg_machine, optarg);
868 if (r < 0)
869 return log_oom();
870 }
871 break;
872
873 case ARG_HOSTNAME:
874 if (isempty(optarg))
875 arg_hostname = mfree(arg_hostname);
876 else {
877 if (!hostname_is_valid(optarg, false))
878 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
879 "Invalid hostname: %s", optarg);
880
881 r = free_and_strdup(&arg_hostname, optarg);
882 if (r < 0)
883 return log_oom();
884 }
885
886 arg_settings_mask |= SETTING_HOSTNAME;
887 break;
888
889 case 'Z':
890 arg_selinux_context = optarg;
891 break;
892
893 case 'L':
894 arg_selinux_apifs_context = optarg;
895 break;
896
897 case ARG_READ_ONLY:
898 arg_read_only = true;
899 arg_settings_mask |= SETTING_READ_ONLY;
900 break;
901
902 case ARG_CAPABILITY:
903 case ARG_DROP_CAPABILITY: {
904 p = optarg;
905 for (;;) {
906 _cleanup_free_ char *t = NULL;
907
908 r = extract_first_word(&p, &t, ",", 0);
909 if (r < 0)
910 return log_error_errno(r, "Failed to parse capability %s.", t);
911 if (r == 0)
912 break;
913
914 if (streq(t, "all")) {
915 if (c == ARG_CAPABILITY)
916 plus = (uint64_t) -1;
917 else
918 minus = (uint64_t) -1;
919 } else {
920 r = capability_from_name(t);
921 if (r < 0)
922 return log_error_errno(r, "Failed to parse capability %s.", t);
923
924 if (c == ARG_CAPABILITY)
925 plus |= 1ULL << r;
926 else
927 minus |= 1ULL << r;
928 }
929 }
930
931 arg_settings_mask |= SETTING_CAPABILITY;
932 break;
933 }
934
935 case ARG_NO_NEW_PRIVILEGES:
936 r = parse_boolean(optarg);
937 if (r < 0)
938 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
939
940 arg_no_new_privileges = r;
941 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
942 break;
943
944 case 'j':
945 arg_link_journal = LINK_GUEST;
946 arg_link_journal_try = true;
947 arg_settings_mask |= SETTING_LINK_JOURNAL;
948 break;
949
950 case ARG_LINK_JOURNAL:
951 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
952 if (r < 0)
953 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
954
955 arg_settings_mask |= SETTING_LINK_JOURNAL;
956 break;
957
958 case ARG_BIND:
959 case ARG_BIND_RO:
960 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
961 if (r < 0)
962 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
963
964 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
965 break;
966
967 case ARG_TMPFS:
968 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
969 if (r < 0)
970 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
971
972 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
973 break;
974
975 case ARG_OVERLAY:
976 case ARG_OVERLAY_RO:
977 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
978 if (r == -EADDRNOTAVAIL)
979 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
980 if (r < 0)
981 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
982
983 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
984 break;
985
986 case ARG_INACCESSIBLE:
987 r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
988 if (r < 0)
989 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
990
991 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
992 break;
993
994 case 'E': {
995 char **n;
996
997 if (!env_assignment_is_valid(optarg))
998 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
999 "Environment variable assignment '%s' is not valid.", optarg);
1000
1001 n = strv_env_set(arg_setenv, optarg);
1002 if (!n)
1003 return log_oom();
1004
1005 strv_free_and_replace(arg_setenv, n);
1006 arg_settings_mask |= SETTING_ENVIRONMENT;
1007 break;
1008 }
1009
1010 case 'q':
1011 arg_quiet = true;
1012 break;
1013
1014 case ARG_SHARE_SYSTEM:
1015 /* We don't officially support this anymore, except for compat reasons. People should use the
1016 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1017 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1018 arg_clone_ns_flags = 0;
1019 break;
1020
1021 case ARG_REGISTER:
1022 r = parse_boolean(optarg);
1023 if (r < 0) {
1024 log_error("Failed to parse --register= argument: %s", optarg);
1025 return r;
1026 }
1027
1028 arg_register = r;
1029 break;
1030
1031 case ARG_KEEP_UNIT:
1032 arg_keep_unit = true;
1033 break;
1034
1035 case ARG_PERSONALITY:
1036
1037 arg_personality = personality_from_string(optarg);
1038 if (arg_personality == PERSONALITY_INVALID)
1039 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1040 "Unknown or unsupported personality '%s'.", optarg);
1041
1042 arg_settings_mask |= SETTING_PERSONALITY;
1043 break;
1044
1045 case ARG_VOLATILE:
1046
1047 if (!optarg)
1048 arg_volatile_mode = VOLATILE_YES;
1049 else if (streq(optarg, "help")) {
1050 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1051 return 0;
1052 } else {
1053 VolatileMode m;
1054
1055 m = volatile_mode_from_string(optarg);
1056 if (m < 0)
1057 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1058 "Failed to parse --volatile= argument: %s", optarg);
1059 else
1060 arg_volatile_mode = m;
1061 }
1062
1063 arg_settings_mask |= SETTING_VOLATILE_MODE;
1064 break;
1065
1066 case 'p':
1067 r = expose_port_parse(&arg_expose_ports, optarg);
1068 if (r == -EEXIST)
1069 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1070 if (r < 0)
1071 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1072
1073 arg_settings_mask |= SETTING_EXPOSE_PORTS;
1074 break;
1075
1076 case ARG_PROPERTY:
1077 if (strv_extend(&arg_property, optarg) < 0)
1078 return log_oom();
1079
1080 break;
1081
1082 case ARG_PRIVATE_USERS: {
1083 int boolean = -1;
1084
1085 if (!optarg)
1086 boolean = true;
1087 else if (!in_charset(optarg, DIGITS))
1088 /* do *not* parse numbers as booleans */
1089 boolean = parse_boolean(optarg);
1090
1091 if (boolean == false) {
1092 /* no: User namespacing off */
1093 arg_userns_mode = USER_NAMESPACE_NO;
1094 arg_uid_shift = UID_INVALID;
1095 arg_uid_range = UINT32_C(0x10000);
1096 } else if (boolean == true) {
1097 /* yes: User namespacing on, UID range is read from root dir */
1098 arg_userns_mode = USER_NAMESPACE_FIXED;
1099 arg_uid_shift = UID_INVALID;
1100 arg_uid_range = UINT32_C(0x10000);
1101 } else if (streq(optarg, "pick")) {
1102 /* pick: User namespacing on, UID range is picked randomly */
1103 arg_userns_mode = USER_NAMESPACE_PICK;
1104 arg_uid_shift = UID_INVALID;
1105 arg_uid_range = UINT32_C(0x10000);
1106 } else {
1107 _cleanup_free_ char *buffer = NULL;
1108 const char *range, *shift;
1109
1110 /* anything else: User namespacing on, UID range is explicitly configured */
1111
1112 range = strchr(optarg, ':');
1113 if (range) {
1114 buffer = strndup(optarg, range - optarg);
1115 if (!buffer)
1116 return log_oom();
1117 shift = buffer;
1118
1119 range++;
1120 r = safe_atou32(range, &arg_uid_range);
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1123 } else
1124 shift = optarg;
1125
1126 r = parse_uid(shift, &arg_uid_shift);
1127 if (r < 0)
1128 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1129
1130 arg_userns_mode = USER_NAMESPACE_FIXED;
1131 }
1132
1133 if (arg_uid_range <= 0)
1134 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1135 "UID range cannot be 0.");
1136
1137 arg_settings_mask |= SETTING_USERNS;
1138 break;
1139 }
1140
1141 case 'U':
1142 if (userns_supported()) {
1143 arg_userns_mode = USER_NAMESPACE_PICK;
1144 arg_uid_shift = UID_INVALID;
1145 arg_uid_range = UINT32_C(0x10000);
1146
1147 arg_settings_mask |= SETTING_USERNS;
1148 }
1149
1150 break;
1151
1152 case ARG_PRIVATE_USERS_CHOWN:
1153 arg_userns_chown = true;
1154
1155 arg_settings_mask |= SETTING_USERNS;
1156 break;
1157
1158 case ARG_KILL_SIGNAL:
1159 if (streq(optarg, "help")) {
1160 DUMP_STRING_TABLE(signal, int, _NSIG);
1161 return 0;
1162 }
1163
1164 arg_kill_signal = signal_from_string(optarg);
1165 if (arg_kill_signal < 0)
1166 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1167 "Cannot parse signal: %s", optarg);
1168
1169 arg_settings_mask |= SETTING_KILL_SIGNAL;
1170 break;
1171
1172 case ARG_SETTINGS:
1173
1174 /* no → do not read files
1175 * yes → read files, do not override cmdline, trust only subset
1176 * override → read files, override cmdline, trust only subset
1177 * trusted → read files, do not override cmdline, trust all
1178 */
1179
1180 r = parse_boolean(optarg);
1181 if (r < 0) {
1182 if (streq(optarg, "trusted")) {
1183 mask_all_settings = false;
1184 mask_no_settings = false;
1185 arg_settings_trusted = true;
1186
1187 } else if (streq(optarg, "override")) {
1188 mask_all_settings = false;
1189 mask_no_settings = true;
1190 arg_settings_trusted = -1;
1191 } else
1192 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1193 } else if (r > 0) {
1194 /* yes */
1195 mask_all_settings = false;
1196 mask_no_settings = false;
1197 arg_settings_trusted = -1;
1198 } else {
1199 /* no */
1200 mask_all_settings = true;
1201 mask_no_settings = false;
1202 arg_settings_trusted = false;
1203 }
1204
1205 break;
1206
1207 case ARG_CHDIR:
1208 if (!path_is_absolute(optarg))
1209 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1210 "Working directory %s is not an absolute path.", optarg);
1211
1212 r = free_and_strdup(&arg_chdir, optarg);
1213 if (r < 0)
1214 return log_oom();
1215
1216 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1217 break;
1218
1219 case ARG_PIVOT_ROOT:
1220 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1221 if (r < 0)
1222 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1223
1224 arg_settings_mask |= SETTING_PIVOT_ROOT;
1225 break;
1226
1227 case ARG_NOTIFY_READY:
1228 r = parse_boolean(optarg);
1229 if (r < 0)
1230 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1231 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1232 arg_notify_ready = r;
1233 arg_settings_mask |= SETTING_NOTIFY_READY;
1234 break;
1235
1236 case ARG_ROOT_HASH: {
1237 void *k;
1238 size_t l;
1239
1240 r = unhexmem(optarg, strlen(optarg), &k, &l);
1241 if (r < 0)
1242 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1243 if (l < sizeof(sd_id128_t)) {
1244 free(k);
1245 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1246 }
1247
1248 free(arg_root_hash);
1249 arg_root_hash = k;
1250 arg_root_hash_size = l;
1251 break;
1252 }
1253
1254 case ARG_SYSTEM_CALL_FILTER: {
1255 bool negative;
1256 const char *items;
1257
1258 negative = optarg[0] == '~';
1259 items = negative ? optarg + 1 : optarg;
1260
1261 for (;;) {
1262 _cleanup_free_ char *word = NULL;
1263
1264 r = extract_first_word(&items, &word, NULL, 0);
1265 if (r == 0)
1266 break;
1267 if (r == -ENOMEM)
1268 return log_oom();
1269 if (r < 0)
1270 return log_error_errno(r, "Failed to parse system call filter: %m");
1271
1272 if (negative)
1273 r = strv_extend(&arg_syscall_blacklist, word);
1274 else
1275 r = strv_extend(&arg_syscall_whitelist, word);
1276 if (r < 0)
1277 return log_oom();
1278 }
1279
1280 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1281 break;
1282 }
1283
1284 case ARG_RLIMIT: {
1285 const char *eq;
1286 char *name;
1287 int rl;
1288
1289 if (streq(optarg, "help")) {
1290 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1291 return 0;
1292 }
1293
1294 eq = strchr(optarg, '=');
1295 if (!eq)
1296 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1297 "--rlimit= expects an '=' assignment.");
1298
1299 name = strndup(optarg, eq - optarg);
1300 if (!name)
1301 return log_oom();
1302
1303 rl = rlimit_from_string_harder(name);
1304 if (rl < 0)
1305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1306 "Unknown resource limit: %s", name);
1307
1308 if (!arg_rlimit[rl]) {
1309 arg_rlimit[rl] = new0(struct rlimit, 1);
1310 if (!arg_rlimit[rl])
1311 return log_oom();
1312 }
1313
1314 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1315 if (r < 0)
1316 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1317
1318 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1319 break;
1320 }
1321
1322 case ARG_OOM_SCORE_ADJUST:
1323 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1324 if (r < 0)
1325 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1326
1327 arg_oom_score_adjust_set = true;
1328 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1329 break;
1330
1331 case ARG_CPU_AFFINITY: {
1332 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1333
1334 r = parse_cpu_set(optarg, &cpuset);
1335 if (r < 0)
1336 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1337
1338 if (arg_cpuset)
1339 CPU_FREE(arg_cpuset);
1340
1341 arg_cpuset = TAKE_PTR(cpuset);
1342 arg_cpuset_ncpus = r;
1343 arg_settings_mask |= SETTING_CPU_AFFINITY;
1344 break;
1345 }
1346
1347 case ARG_RESOLV_CONF:
1348 if (streq(optarg, "help")) {
1349 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1350 return 0;
1351 }
1352
1353 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1354 if (arg_resolv_conf < 0)
1355 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1356 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1357
1358 arg_settings_mask |= SETTING_RESOLV_CONF;
1359 break;
1360
1361 case ARG_TIMEZONE:
1362 if (streq(optarg, "help")) {
1363 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1364 return 0;
1365 }
1366
1367 arg_timezone = timezone_mode_from_string(optarg);
1368 if (arg_timezone < 0)
1369 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1370 "Failed to parse /etc/localtime mode: %s", optarg);
1371
1372 arg_settings_mask |= SETTING_TIMEZONE;
1373 break;
1374
1375 case ARG_CONSOLE:
1376 if (streq(optarg, "interactive"))
1377 arg_console_mode = CONSOLE_INTERACTIVE;
1378 else if (streq(optarg, "read-only"))
1379 arg_console_mode = CONSOLE_READ_ONLY;
1380 else if (streq(optarg, "passive"))
1381 arg_console_mode = CONSOLE_PASSIVE;
1382 else if (streq(optarg, "pipe"))
1383 arg_console_mode = CONSOLE_PIPE;
1384 else if (streq(optarg, "help"))
1385 puts("interactive\n"
1386 "read-only\n"
1387 "passive\n"
1388 "pipe");
1389 else
1390 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
1391
1392 arg_settings_mask |= SETTING_CONSOLE_MODE;
1393 break;
1394
1395 case 'P':
1396 case ARG_PIPE:
1397 arg_console_mode = CONSOLE_PIPE;
1398 arg_settings_mask |= SETTING_CONSOLE_MODE;
1399 break;
1400
1401 case ARG_NO_PAGER:
1402 arg_pager_flags |= PAGER_DISABLE;
1403 break;
1404
1405 case '?':
1406 return -EINVAL;
1407
1408 default:
1409 assert_not_reached("Unhandled option");
1410 }
1411
1412 if (argc > optind) {
1413 strv_free(arg_parameters);
1414 arg_parameters = strv_copy(argv + optind);
1415 if (!arg_parameters)
1416 return log_oom();
1417
1418 arg_settings_mask |= SETTING_START_MODE;
1419 }
1420
1421 if (arg_ephemeral && arg_template && !arg_directory)
1422 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1423 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1424 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1425 * --directory=". */
1426 arg_directory = TAKE_PTR(arg_template);
1427
1428 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1429
1430 /* Make sure to parse environment before we reset the settings mask below */
1431 parse_environment();
1432
1433 /* Load all settings from .nspawn files */
1434 if (mask_no_settings)
1435 arg_settings_mask = 0;
1436
1437 /* Don't load any settings from .nspawn files */
1438 if (mask_all_settings)
1439 arg_settings_mask = _SETTINGS_MASK_ALL;
1440
1441 return 1;
1442 }
1443
1444 static int verify_arguments(void) {
1445 int r;
1446
1447 if (arg_userns_mode != USER_NAMESPACE_NO)
1448 arg_mount_settings |= MOUNT_USE_USERNS;
1449
1450 if (arg_private_network)
1451 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1452
1453 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1454 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1455 arg_register = false;
1456 if (arg_start_mode != START_PID1)
1457 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1458 }
1459
1460 if (arg_userns_mode == USER_NAMESPACE_PICK)
1461 arg_userns_chown = true;
1462
1463 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1464 arg_kill_signal = SIGRTMIN+3;
1465
1466 if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1467 arg_read_only = true;
1468
1469 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1470 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1471 * The latter is not technically a user session, but we don't need to labour the point. */
1472 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1473
1474 if (arg_directory && arg_image)
1475 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1476
1477 if (arg_template && arg_image)
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1479
1480 if (arg_template && !(arg_directory || arg_machine))
1481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1482
1483 if (arg_ephemeral && arg_template)
1484 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1485
1486 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1487 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1488
1489 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1490 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1491
1492 if (arg_userns_chown && arg_read_only)
1493 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1494 "--read-only and --private-users-chown may not be combined.");
1495
1496 /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
1497 * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
1498 * copy-up (in case of overlay) making the entire excercise pointless. */
1499 if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
1500 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
1501
1502 /* If --network-namespace-path is given with any other network-related option, we need to error out,
1503 * to avoid conflicts between different network options. */
1504 if (arg_network_namespace_path &&
1505 (arg_network_interfaces || arg_network_macvlan ||
1506 arg_network_ipvlan || arg_network_veth_extra ||
1507 arg_network_bridge || arg_network_zone ||
1508 arg_network_veth || arg_private_network))
1509 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1510
1511 if (arg_network_bridge && arg_network_zone)
1512 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1513 "--network-bridge= and --network-zone= may not be combined.");
1514
1515 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1516 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1517
1518 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1519 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1520
1521 if (arg_expose_ports && !arg_private_network)
1522 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1523
1524 #if ! HAVE_LIBIPTC
1525 if (arg_expose_ports)
1526 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1527 #endif
1528
1529 r = custom_mount_check_all();
1530 if (r < 0)
1531 return r;
1532
1533 return 0;
1534 }
1535
1536 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1537 assert(p);
1538
1539 if (arg_userns_mode == USER_NAMESPACE_NO)
1540 return 0;
1541
1542 if (uid == UID_INVALID && gid == GID_INVALID)
1543 return 0;
1544
1545 if (uid != UID_INVALID) {
1546 uid += arg_uid_shift;
1547
1548 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1549 return -EOVERFLOW;
1550 }
1551
1552 if (gid != GID_INVALID) {
1553 gid += (gid_t) arg_uid_shift;
1554
1555 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1556 return -EOVERFLOW;
1557 }
1558
1559 if (lchown(p, uid, gid) < 0)
1560 return -errno;
1561
1562 return 0;
1563 }
1564
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands it
 * to userns_lchown() with the given UID/GID.
 *
 * A directory that already exists is left untouched and reported as success (0). Any other mkdir
 * failure is returned as is; otherwise the result of userns_lchown() is returned. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Pre-existing directory: nothing to create, and ownership is not adjusted */
        return r == -EEXIST ? 0 : r;
}
1578
/* Matches 'path' against the two zoneinfo prefixes "../usr/share/zoneinfo/" and "/usr/share/zoneinfo/"
 * and returns whatever PATH_STARTSWITH_SET() yields for them. Callers in this file treat NULL as "does
 * not point into the zoneinfo tree" and a non-NULL result as the time zone name (presumably the part of
 * 'path' following the matched prefix; confirm against the macro's definition). */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path,
                                   "../usr/share/zoneinfo/",
                                   "/usr/share/zoneinfo/");

        return zone;
}
1585
1586 static bool etc_writable(void) {
1587 return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1588 }
1589
1590 static int setup_timezone(const char *dest) {
1591 _cleanup_free_ char *p = NULL, *etc = NULL;
1592 const char *where, *check;
1593 TimezoneMode m;
1594 int r;
1595
1596 assert(dest);
1597
1598 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1599 r = readlink_malloc("/etc/localtime", &p);
1600 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1601 m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
1602 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1603 m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
1604 else if (r < 0) {
1605 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1606 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1607 * file.
1608 *
1609 * Example:
1610 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1611 */
1612 return 0;
1613 } else if (arg_timezone == TIMEZONE_AUTO)
1614 m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
1615 else
1616 m = arg_timezone;
1617 } else
1618 m = arg_timezone;
1619
1620 if (m == TIMEZONE_OFF)
1621 return 0;
1622
1623 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1624 if (r < 0) {
1625 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1626 return 0;
1627 }
1628
1629 where = strjoina(etc, "/localtime");
1630
1631 switch (m) {
1632
1633 case TIMEZONE_DELETE:
1634 if (unlink(where) < 0)
1635 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1636
1637 return 0;
1638
1639 case TIMEZONE_SYMLINK: {
1640 _cleanup_free_ char *q = NULL;
1641 const char *z, *what;
1642
1643 z = timezone_from_path(p);
1644 if (!z) {
1645 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1646 return 0;
1647 }
1648
1649 r = readlink_malloc(where, &q);
1650 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1651 return 0; /* Already pointing to the right place? Then do nothing .. */
1652
1653 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1654 r = chase_symlinks(check, dest, 0, NULL);
1655 if (r < 0)
1656 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1657 else {
1658 if (unlink(where) < 0 && errno != ENOENT) {
1659 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1660 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1661 return 0;
1662 }
1663
1664 what = strjoina("../usr/share/zoneinfo/", z);
1665 if (symlink(what, where) < 0) {
1666 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1667 errno, "Failed to correct timezone of container, ignoring: %m");
1668 return 0;
1669 }
1670
1671 break;
1672 }
1673
1674 _fallthrough_;
1675 }
1676
1677 case TIMEZONE_BIND: {
1678 _cleanup_free_ char *resolved = NULL;
1679 int found;
1680
1681 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1682 if (found < 0) {
1683 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1684 return 0;
1685 }
1686
1687 if (found == 0) /* missing? */
1688 (void) touch(resolved);
1689
1690 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1691 if (r >= 0)
1692 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1693
1694 _fallthrough_;
1695 }
1696
1697 case TIMEZONE_COPY:
1698 /* If mounting failed, try to copy */
1699 r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
1700 if (r < 0) {
1701 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1702 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1703 return 0;
1704 }
1705
1706 break;
1707
1708 default:
1709 assert_not_reached("unexpected mode");
1710 }
1711
1712 /* Fix permissions of the symlink or file copy we just created */
1713 r = userns_lchown(where, 0, 0);
1714 if (r < 0)
1715 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
1716
1717 return 0;
1718 }
1719
/* Checks via access(F_OK) whether 'path' exists.
 *
 * Returns 1 if it does, 0 if access() fails with ENOENT, and for any other failure the value of
 * log_debug_errno() after logging the problem at debug level. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1732
1733 static int resolved_listening(void) {
1734 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1735 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1736 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1737 int r;
1738
1739 /* Check if resolved is listening */
1740
1741 r = sd_bus_open_system(&bus);
1742 if (r < 0)
1743 return log_debug_errno(r, "Failed to open system bus: %m");
1744
1745 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1746 if (r < 0)
1747 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1748 if (r == 0)
1749 return 0;
1750
1751 r = sd_bus_get_property_string(bus,
1752 "org.freedesktop.resolve1",
1753 "/org/freedesktop/resolve1",
1754 "org.freedesktop.resolve1.Manager",
1755 "DNSStubListener",
1756 &error,
1757 &dns_stub_listener_mode);
1758 if (r < 0)
1759 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1760
1761 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1762 }
1763
1764 static int setup_resolv_conf(const char *dest) {
1765 _cleanup_free_ char *etc = NULL;
1766 const char *where, *what;
1767 ResolvConfMode m;
1768 int r;
1769
1770 assert(dest);
1771
1772 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1773 if (arg_private_network)
1774 m = RESOLV_CONF_OFF;
1775 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1776 m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
1777 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1778 m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
1779 else
1780 m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
1781 } else
1782 m = arg_resolv_conf;
1783
1784 if (m == RESOLV_CONF_OFF)
1785 return 0;
1786
1787 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1788 if (r < 0) {
1789 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1790 return 0;
1791 }
1792
1793 where = strjoina(etc, "/resolv.conf");
1794
1795 if (m == RESOLV_CONF_DELETE) {
1796 if (unlink(where) < 0)
1797 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1798
1799 return 0;
1800 }
1801
1802 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1803 what = STATIC_RESOLV_CONF;
1804 else
1805 what = "/etc/resolv.conf";
1806
1807 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1808 _cleanup_free_ char *resolved = NULL;
1809 int found;
1810
1811 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1812 if (found < 0) {
1813 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1814 return 0;
1815 }
1816
1817 if (found == 0) /* missing? */
1818 (void) touch(resolved);
1819
1820 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1821 if (r >= 0)
1822 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1823 }
1824
1825 /* If that didn't work, let's copy the file */
1826 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
1827 if (r < 0) {
1828 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1829 * resolved or something similar runs inside and the symlink points there.
1830 *
1831 * If the disk image is read-only, there's also no point in complaining.
1832 */
1833 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1834 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1835 return 0;
1836 }
1837
1838 r = userns_lchown(where, 0, 0);
1839 if (r < 0)
1840 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1841
1842 return 0;
1843 }
1844
1845 static int setup_boot_id(void) {
1846 _cleanup_(unlink_and_freep) char *from = NULL;
1847 _cleanup_free_ char *path = NULL;
1848 sd_id128_t rnd = SD_ID128_NULL;
1849 const char *to;
1850 int r;
1851
1852 /* Generate a new randomized boot ID, so that each boot-up of
1853 * the container gets a new one */
1854
1855 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1856 if (r < 0)
1857 return log_error_errno(r, "Failed to generate random boot ID path: %m");
1858
1859 r = sd_id128_randomize(&rnd);
1860 if (r < 0)
1861 return log_error_errno(r, "Failed to generate random boot id: %m");
1862
1863 r = id128_write(path, ID128_UUID, rnd, false);
1864 if (r < 0)
1865 return log_error_errno(r, "Failed to write boot id: %m");
1866
1867 from = TAKE_PTR(path);
1868 to = "/proc/sys/kernel/random/boot_id";
1869
1870 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1871 if (r < 0)
1872 return r;
1873
1874 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1875 }
1876
1877 static int copy_devnodes(const char *dest) {
1878 static const char devnodes[] =
1879 "null\0"
1880 "zero\0"
1881 "full\0"
1882 "random\0"
1883 "urandom\0"
1884 "tty\0"
1885 "net/tun\0";
1886
1887 _cleanup_umask_ mode_t u;
1888 const char *d;
1889 int r = 0;
1890
1891 assert(dest);
1892
1893 u = umask(0000);
1894
1895 /* Create /dev/net, so that we can create /dev/net/tun in it */
1896 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1897 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1898
1899 NULSTR_FOREACH(d, devnodes) {
1900 _cleanup_free_ char *from = NULL, *to = NULL;
1901 struct stat st;
1902
1903 from = strappend("/dev/", d);
1904 if (!from)
1905 return log_oom();
1906
1907 to = prefix_root(dest, from);
1908 if (!to)
1909 return log_oom();
1910
1911 if (stat(from, &st) < 0) {
1912
1913 if (errno != ENOENT)
1914 return log_error_errno(errno, "Failed to stat %s: %m", from);
1915
1916 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1917 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1918 "%s is not a char or block device, cannot copy.", from);
1919 else {
1920 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1921
1922 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1923 /* Explicitly warn the user when /dev is already populated. */
1924 if (errno == EEXIST)
1925 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1926 if (errno != EPERM)
1927 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1928
1929 /* Some systems abusively restrict mknod but allow bind mounts. */
1930 r = touch(to);
1931 if (r < 0)
1932 return log_error_errno(r, "touch (%s) failed: %m", to);
1933 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1934 if (r < 0)
1935 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1936 }
1937
1938 r = userns_lchown(to, 0, 0);
1939 if (r < 0)
1940 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1941
1942 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1943 if (!dn)
1944 return log_oom();
1945
1946 r = userns_mkdir(dest, dn, 0755, 0, 0);
1947 if (r < 0)
1948 return log_error_errno(r, "Failed to create '%s': %m", dn);
1949
1950 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1951 return log_oom();
1952
1953 prefixed = prefix_root(dest, sl);
1954 if (!prefixed)
1955 return log_oom();
1956
1957 t = strjoin("../", d);
1958 if (!t)
1959 return log_oom();
1960
1961 if (symlink(t, prefixed) < 0)
1962 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
1963 }
1964 }
1965
1966 return r;
1967 }
1968
1969 static int make_extra_nodes(const char *dest) {
1970 _cleanup_umask_ mode_t u;
1971 size_t i;
1972 int r;
1973
1974 u = umask(0000);
1975
1976 for (i = 0; i < arg_n_extra_nodes; i++) {
1977 _cleanup_free_ char *path = NULL;
1978 DeviceNode *n = arg_extra_nodes + i;
1979
1980 path = prefix_root(dest, n->path);
1981 if (!path)
1982 return log_oom();
1983
1984 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
1985 return log_error_errno(errno, "Failed to create device node '%s': %m", path);
1986
1987 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
1988 if (r < 0)
1989 return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
1990 }
1991
1992 return 0;
1993 }
1994
1995 static int setup_pts(const char *dest) {
1996 _cleanup_free_ char *options = NULL;
1997 const char *p;
1998 int r;
1999
2000 #if HAVE_SELINUX
2001 if (arg_selinux_apifs_context)
2002 (void) asprintf(&options,
2003 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2004 arg_uid_shift + TTY_GID,
2005 arg_selinux_apifs_context);
2006 else
2007 #endif
2008 (void) asprintf(&options,
2009 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2010 arg_uid_shift + TTY_GID);
2011
2012 if (!options)
2013 return log_oom();
2014
2015 /* Mount /dev/pts itself */
2016 p = prefix_roota(dest, "/dev/pts");
2017 r = mkdir_errno_wrapper(p, 0755);
2018 if (r < 0)
2019 return log_error_errno(r, "Failed to create /dev/pts: %m");
2020
2021 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2022 if (r < 0)
2023 return r;
2024 r = userns_lchown(p, 0, 0);
2025 if (r < 0)
2026 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2027
2028 /* Create /dev/ptmx symlink */
2029 p = prefix_roota(dest, "/dev/ptmx");
2030 if (symlink("pts/ptmx", p) < 0)
2031 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2032 r = userns_lchown(p, 0, 0);
2033 if (r < 0)
2034 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2035
2036 /* And fix /dev/pts/ptmx ownership */
2037 p = prefix_roota(dest, "/dev/pts/ptmx");
2038 r = userns_lchown(p, 0, 0);
2039 if (r < 0)
2040 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2041
2042 return 0;
2043 }
2044
2045 static int setup_dev_console(const char *dest, const char *console) {
2046 _cleanup_umask_ mode_t u;
2047 const char *to;
2048 int r;
2049
2050 assert(dest);
2051
2052 u = umask(0000);
2053
2054 if (!console)
2055 return 0;
2056
2057 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
2058 if (r < 0)
2059 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
2060
2061 /* We need to bind mount the right tty to /dev/console since
2062 * ptys can only exist on pts file systems. To have something
2063 * to bind mount things on we create a empty regular file. */
2064
2065 to = prefix_roota(dest, "/dev/console");
2066 r = touch(to);
2067 if (r < 0)
2068 return log_error_errno(r, "touch() for /dev/console failed: %m");
2069
2070 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
2071 }
2072
2073 static int setup_keyring(void) {
2074 key_serial_t keyring;
2075
2076 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
2077 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
2078 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
2079 * these system calls let's make sure we don't leak anything into the container. */
2080
2081 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2082 if (keyring == -1) {
2083 if (errno == ENOSYS)
2084 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2085 else if (IN_SET(errno, EACCES, EPERM))
2086 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2087 else
2088 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2089 }
2090
2091 return 0;
2092 }
2093
2094 static int setup_kmsg(int kmsg_socket) {
2095 _cleanup_(unlink_and_freep) char *from = NULL;
2096 _cleanup_free_ char *fifo = NULL;
2097 _cleanup_close_ int fd = -1;
2098 _cleanup_umask_ mode_t u;
2099 const char *to;
2100 int r;
2101
2102 assert(kmsg_socket >= 0);
2103
2104 u = umask(0000);
2105
2106 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
2107 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2108 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2109 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2110
2111 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
2112 if (r < 0)
2113 return log_error_errno(r, "Failed to generate kmsg path: %m");
2114
2115 if (mkfifo(fifo, 0600) < 0)
2116 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2117
2118 from = TAKE_PTR(fifo);
2119 to = "/proc/kmsg";
2120
2121 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2122 if (r < 0)
2123 return r;
2124
2125 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2126 if (fd < 0)
2127 return log_error_errno(errno, "Failed to open fifo: %m");
2128
2129 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2130 r = send_one_fd(kmsg_socket, fd, 0);
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to send FIFO fd: %m");
2133
2134 return 0;
2135 }
2136
2137 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2138 union in_addr_union *exposed = userdata;
2139
2140 assert(rtnl);
2141 assert(m);
2142 assert(exposed);
2143
2144 expose_port_execute(rtnl, arg_expose_ports, exposed);
2145 return 0;
2146 }
2147
2148 static int setup_hostname(void) {
2149 int r;
2150
2151 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2152 return 0;
2153
2154 r = sethostname_idempotent(arg_hostname ?: arg_machine);
2155 if (r < 0)
2156 return log_error_errno(r, "Failed to set hostname: %m");
2157
2158 return 0;
2159 }
2160
2161 static int setup_journal(const char *directory) {
2162 _cleanup_free_ char *d = NULL;
2163 const char *dirname, *p, *q;
2164 sd_id128_t this_id;
2165 char id[33];
2166 bool try;
2167 int r;
2168
2169 /* Don't link journals in ephemeral mode */
2170 if (arg_ephemeral)
2171 return 0;
2172
2173 if (arg_link_journal == LINK_NO)
2174 return 0;
2175
2176 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2177
2178 r = sd_id128_get_machine(&this_id);
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2181
2182 if (sd_id128_equal(arg_uuid, this_id)) {
2183 log_full(try ? LOG_WARNING : LOG_ERR,
2184 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
2185 if (try)
2186 return 0;
2187 return -EEXIST;
2188 }
2189
2190 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2191 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2192 if (r < 0) {
2193 bool ignore = r == -EROFS && try;
2194 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2195 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2196 return ignore ? 0 : r;
2197 }
2198 }
2199
2200 (void) sd_id128_to_string(arg_uuid, id);
2201
2202 p = strjoina("/var/log/journal/", id);
2203 q = prefix_roota(directory, p);
2204
2205 if (path_is_mount_point(p, NULL, 0) > 0) {
2206 if (try)
2207 return 0;
2208
2209 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2210 "%s: already a mount point, refusing to use for journal", p);
2211 }
2212
2213 if (path_is_mount_point(q, NULL, 0) > 0) {
2214 if (try)
2215 return 0;
2216
2217 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2218 "%s: already a mount point, refusing to use for journal", q);
2219 }
2220
2221 r = readlink_and_make_absolute(p, &d);
2222 if (r >= 0) {
2223 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2224 path_equal(d, q)) {
2225
2226 r = userns_mkdir(directory, p, 0755, 0, 0);
2227 if (r < 0)
2228 log_warning_errno(r, "Failed to create directory %s: %m", q);
2229 return 0;
2230 }
2231
2232 if (unlink(p) < 0)
2233 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2234 } else if (r == -EINVAL) {
2235
2236 if (arg_link_journal == LINK_GUEST &&
2237 rmdir(p) < 0) {
2238
2239 if (errno == ENOTDIR) {
2240 log_error("%s already exists and is neither a symlink nor a directory", p);
2241 return r;
2242 } else
2243 return log_error_errno(errno, "Failed to remove %s: %m", p);
2244 }
2245 } else if (r != -ENOENT)
2246 return log_error_errno(r, "readlink(%s) failed: %m", p);
2247
2248 if (arg_link_journal == LINK_GUEST) {
2249
2250 if (symlink(q, p) < 0) {
2251 if (try) {
2252 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2253 return 0;
2254 } else
2255 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2256 }
2257
2258 r = userns_mkdir(directory, p, 0755, 0, 0);
2259 if (r < 0)
2260 log_warning_errno(r, "Failed to create directory %s: %m", q);
2261 return 0;
2262 }
2263
2264 if (arg_link_journal == LINK_HOST) {
2265 /* don't create parents here — if the host doesn't have
2266 * permanent journal set up, don't force it here */
2267
2268 r = mkdir_errno_wrapper(p, 0755);
2269 if (r < 0 && r != -EEXIST) {
2270 if (try) {
2271 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2272 return 0;
2273 } else
2274 return log_error_errno(r, "Failed to create %s: %m", p);
2275 }
2276
2277 } else if (access(p, F_OK) < 0)
2278 return 0;
2279
2280 if (dir_is_empty(q) == 0)
2281 log_warning("%s is not empty, proceeding anyway.", q);
2282
2283 r = userns_mkdir(directory, p, 0755, 0, 0);
2284 if (r < 0)
2285 return log_error_errno(r, "Failed to create %s: %m", q);
2286
2287 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2288 if (r < 0)
2289 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2290
2291 return 0;
2292 }
2293
2294 static int drop_capabilities(uid_t uid) {
2295 CapabilityQuintet q;
2296
2297 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2298 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2299 * arg_caps_retain. */
2300
2301 if (capability_quintet_is_set(&arg_full_capabilities)) {
2302 q = arg_full_capabilities;
2303
2304 if (q.bounding == (uint64_t) -1)
2305 q.bounding = uid == 0 ? arg_caps_retain : 0;
2306
2307 if (q.effective == (uint64_t) -1)
2308 q.effective = uid == 0 ? q.bounding : 0;
2309
2310 if (q.inheritable == (uint64_t) -1)
2311 q.inheritable = uid == 0 ? q.bounding : 0;
2312
2313 if (q.permitted == (uint64_t) -1)
2314 q.permitted = uid == 0 ? q.bounding : 0;
2315
2316 if (q.ambient == (uint64_t) -1 && ambient_capabilities_supported())
2317 q.ambient = 0;
2318 } else
2319 q = (CapabilityQuintet) {
2320 .bounding = arg_caps_retain,
2321 .effective = uid == 0 ? arg_caps_retain : 0,
2322 .inheritable = uid == 0 ? arg_caps_retain : 0,
2323 .permitted = uid == 0 ? arg_caps_retain : 0,
2324 .ambient = ambient_capabilities_supported() ? 0 : (uint64_t) -1,
2325 };
2326
2327 return capability_quintet_enforce(&q);
2328 }
2329
2330 static int reset_audit_loginuid(void) {
2331 _cleanup_free_ char *p = NULL;
2332 int r;
2333
2334 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2335 return 0;
2336
2337 r = read_one_line_file("/proc/self/loginuid", &p);
2338 if (r == -ENOENT)
2339 return 0;
2340 if (r < 0)
2341 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2342
2343 /* Already reset? */
2344 if (streq(p, "4294967295"))
2345 return 0;
2346
2347 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2348 if (r < 0) {
2349 log_error_errno(r,
2350 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2351 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2352 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2353 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2354 "using systemd-nspawn. Sleeping for 5s... (%m)");
2355
2356 sleep(5);
2357 }
2358
2359 return 0;
2360 }
2361
2362 static int setup_propagate(const char *root) {
2363 const char *p, *q;
2364 int r;
2365
2366 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2367 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2368 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2369 (void) mkdir_p(p, 0600);
2370
2371 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2372 if (r < 0)
2373 return log_error_errno(r, "Failed to create /run/systemd: %m");
2374
2375 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2376 if (r < 0)
2377 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2378
2379 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2382
2383 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
2384 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2385 if (r < 0)
2386 return r;
2387
2388 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2389 if (r < 0)
2390 return r;
2391
2392 /* machined will MS_MOVE into that directory, and that's only
2393 * supported for non-shared mounts. */
2394 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2395 }
2396
2397 static int setup_machine_id(const char *directory) {
2398 const char *etc_machine_id;
2399 sd_id128_t id;
2400 int r;
2401
2402 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2403 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2404 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2405 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2406 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2407 * container behaves nicely). */
2408
2409 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2410
2411 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2412 if (r < 0) {
2413 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2414 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2415
2416 if (sd_id128_is_null(arg_uuid)) {
2417 r = sd_id128_randomize(&arg_uuid);
2418 if (r < 0)
2419 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2420 }
2421 } else {
2422 if (sd_id128_is_null(id))
2423 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2424 "Machine ID in container image is zero, refusing.");
2425
2426 arg_uuid = id;
2427 }
2428
2429 return 0;
2430 }
2431
2432 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2433 int r;
2434
2435 assert(directory);
2436
2437 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2438 return 0;
2439
2440 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2441 if (r == -EOPNOTSUPP)
2442 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2443 if (r == -EBADE)
2444 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2445 if (r < 0)
2446 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2447 if (r == 0)
2448 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2449 else
2450 log_debug("Patched directory tree to match UID/GID range.");
2451
2452 return r;
2453 }
2454
2455 /*
2456 * Return values:
2457 * < 0 : wait_for_terminate() failed to get the state of the
2458 * container, the container was terminated by a signal, or
2459 * failed for an unknown reason. No change is made to the
2460 * container argument.
2461 * > 0 : The program executed in the container terminated with an
2462 * error. The exit code of the program executed in the
2463 * container is returned. The container argument has been set
2464 * to CONTAINER_TERMINATED.
2465 * 0 : The container is being rebooted, has been shut down or exited
2466 * successfully. The container argument has been set to either
2467 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2468 *
2469 * That is, success is indicated by a return value of zero, and an
2470 * error is indicated by a non-zero value.
2471 */
2472 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2473 siginfo_t status;
2474 int r;
2475
2476 r = wait_for_terminate(pid, &status);
2477 if (r < 0)
2478 return log_warning_errno(r, "Failed to wait for container: %m");
2479
2480 switch (status.si_code) {
2481
2482 case CLD_EXITED:
2483 if (status.si_status == 0)
2484 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2485 else
2486 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2487
2488 *container = CONTAINER_TERMINATED;
2489 return status.si_status;
2490
2491 case CLD_KILLED:
2492 if (status.si_status == SIGINT) {
2493 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2494 *container = CONTAINER_TERMINATED;
2495 return 0;
2496
2497 } else if (status.si_status == SIGHUP) {
2498 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2499 *container = CONTAINER_REBOOTED;
2500 return 0;
2501 }
2502
2503 _fallthrough_;
2504 case CLD_DUMPED:
2505 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2506 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2507
2508 default:
2509 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2510 "Container %s failed due to unknown reason.", arg_machine);
2511 }
2512 }
2513
2514 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2515 pid_t pid;
2516
2517 pid = PTR_TO_PID(userdata);
2518 if (pid > 0) {
2519 if (kill(pid, arg_kill_signal) >= 0) {
2520 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2521 sd_event_source_set_userdata(s, NULL);
2522 return 0;
2523 }
2524 }
2525
2526 sd_event_exit(sd_event_source_get_event(s), 0);
2527 return 0;
2528 }
2529
2530 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2531 pid_t pid;
2532
2533 assert(s);
2534 assert(ssi);
2535
2536 pid = PTR_TO_PID(userdata);
2537
2538 for (;;) {
2539 siginfo_t si = {};
2540
2541 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2542 return log_error_errno(errno, "Failed to waitid(): %m");
2543 if (si.si_pid == 0) /* No pending children. */
2544 break;
2545 if (si.si_pid == pid) {
2546 /* The main process we care for has exited. Return from
2547 * signal handler but leave the zombie. */
2548 sd_event_exit(sd_event_source_get_event(s), 0);
2549 break;
2550 }
2551
2552 /* Reap all other children. */
2553 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2554 }
2555
2556 return 0;
2557 }
2558
2559 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2560 pid_t pid;
2561
2562 assert(m);
2563
2564 pid = PTR_TO_PID(userdata);
2565
2566 if (arg_kill_signal > 0) {
2567 log_info("Container termination requested. Attempting to halt container.");
2568 (void) kill(pid, arg_kill_signal);
2569 } else {
2570 log_info("Container termination requested. Exiting.");
2571 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2572 }
2573
2574 return 0;
2575 }
2576
2577 static int determine_names(void) {
2578 int r;
2579
2580 if (arg_template && !arg_directory && arg_machine) {
2581
2582 /* If --template= was specified then we should not
2583 * search for a machine, but instead create a new one
2584 * in /var/lib/machine. */
2585
2586 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2587 if (!arg_directory)
2588 return log_oom();
2589 }
2590
2591 if (!arg_image && !arg_directory) {
2592 if (arg_machine) {
2593 _cleanup_(image_unrefp) Image *i = NULL;
2594
2595 r = image_find(IMAGE_MACHINE, arg_machine, &i);
2596 if (r == -ENOENT)
2597 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2598 if (r < 0)
2599 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2600
2601 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2602 r = free_and_strdup(&arg_image, i->path);
2603 else
2604 r = free_and_strdup(&arg_directory, i->path);
2605 if (r < 0)
2606 return log_oom();
2607
2608 if (!arg_ephemeral)
2609 arg_read_only = arg_read_only || i->read_only;
2610 } else {
2611 r = safe_getcwd(&arg_directory);
2612 if (r < 0)
2613 return log_error_errno(r, "Failed to determine current directory: %m");
2614 }
2615
2616 if (!arg_directory && !arg_image)
2617 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
2618 }
2619
2620 if (!arg_machine) {
2621 if (arg_directory && path_equal(arg_directory, "/"))
2622 arg_machine = gethostname_malloc();
2623 else {
2624 if (arg_image) {
2625 char *e;
2626
2627 arg_machine = strdup(basename(arg_image));
2628
2629 /* Truncate suffix if there is one */
2630 e = endswith(arg_machine, ".raw");
2631 if (e)
2632 *e = 0;
2633 } else
2634 arg_machine = strdup(basename(arg_directory));
2635 }
2636 if (!arg_machine)
2637 return log_oom();
2638
2639 hostname_cleanup(arg_machine);
2640 if (!machine_name_is_valid(arg_machine))
2641 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
2642
2643 if (arg_ephemeral) {
2644 char *b;
2645
2646 /* Add a random suffix when this is an
2647 * ephemeral machine, so that we can run many
2648 * instances at once without manually having
2649 * to specify -M each time. */
2650
2651 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2652 return log_oom();
2653
2654 free(arg_machine);
2655 arg_machine = b;
2656 }
2657 }
2658
2659 return 0;
2660 }
2661
/* Resolves the path stored in *p with chase_symlinks() and replaces *p by the result. A NULL *p is left
 * alone. Returns the non-negative result of chase_symlinks() on success (0 if *p was NULL), a negative
 * errno-style value otherwise. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free_and_replace(*p, resolved);

        /* r might be an fd here in case we ever use CHASE_OPEN in flags */
        return r;
}
2678
2679 static int determine_uid_shift(const char *directory) {
2680 int r;
2681
2682 if (arg_userns_mode == USER_NAMESPACE_NO) {
2683 arg_uid_shift = 0;
2684 return 0;
2685 }
2686
2687 if (arg_uid_shift == UID_INVALID) {
2688 struct stat st;
2689
2690 r = stat(directory, &st);
2691 if (r < 0)
2692 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2693
2694 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2695
2696 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2697 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2698 "UID and GID base of %s don't match.", directory);
2699
2700 arg_uid_range = UINT32_C(0x10000);
2701 }
2702
2703 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2704 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2705 "UID base too high for UID range.");
2706
2707 return 0;
2708 }
2709
2710 static unsigned long effective_clone_ns_flags(void) {
2711 unsigned long flags = arg_clone_ns_flags;
2712
2713 if (arg_private_network)
2714 flags |= CLONE_NEWNET;
2715 if (arg_use_cgns)
2716 flags |= CLONE_NEWCGROUP;
2717 if (arg_userns_mode != USER_NAMESPACE_NO)
2718 flags |= CLONE_NEWUSER;
2719
2720 return flags;
2721 }
2722
2723 static int patch_sysctl(void) {
2724
2725 /* This table is inspired by runc's sysctl() function */
2726 static const struct {
2727 const char *key;
2728 bool prefix;
2729 unsigned long clone_flags;
2730 } safe_sysctl[] = {
2731 { "kernel.hostname", false, CLONE_NEWUTS },
2732 { "kernel.domainname", false, CLONE_NEWUTS },
2733 { "kernel.msgmax", false, CLONE_NEWIPC },
2734 { "kernel.msgmnb", false, CLONE_NEWIPC },
2735 { "kernel.msgmni", false, CLONE_NEWIPC },
2736 { "kernel.sem", false, CLONE_NEWIPC },
2737 { "kernel.shmall", false, CLONE_NEWIPC },
2738 { "kernel.shmmax", false, CLONE_NEWIPC },
2739 { "kernel.shmmni", false, CLONE_NEWIPC },
2740 { "fs.mqueue.", true, CLONE_NEWIPC },
2741 { "net.", true, CLONE_NEWNET },
2742 };
2743
2744 unsigned long flags;
2745 char **k, **v;
2746 int r;
2747
2748 flags = effective_clone_ns_flags();
2749
2750 STRV_FOREACH_PAIR(k, v, arg_sysctl) {
2751 bool good = false;
2752 size_t i;
2753
2754 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
2755
2756 if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
2757 continue;
2758
2759 if (safe_sysctl[i].prefix)
2760 good = startswith(*k, safe_sysctl[i].key);
2761 else
2762 good = streq(*k, safe_sysctl[i].key);
2763
2764 if (good)
2765 break;
2766 }
2767
2768 if (!good)
2769 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
2770
2771 r = sysctl_write(*k, *v);
2772 if (r < 0)
2773 return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
2774 }
2775
2776 return 0;
2777 }
2778
2779 static int inner_child(
2780 Barrier *barrier,
2781 const char *directory,
2782 bool secondary,
2783 int kmsg_socket,
2784 int rtnl_socket,
2785 FDSet *fds) {
2786
2787 _cleanup_free_ char *home = NULL;
2788 char as_uuid[37];
2789 size_t n_env = 1;
2790 const char *envp[] = {
2791 "PATH=" DEFAULT_PATH_COMPAT,
2792 NULL, /* container */
2793 NULL, /* TERM */
2794 NULL, /* HOME */
2795 NULL, /* USER */
2796 NULL, /* LOGNAME */
2797 NULL, /* container_uuid */
2798 NULL, /* LISTEN_FDS */
2799 NULL, /* LISTEN_PID */
2800 NULL, /* NOTIFY_SOCKET */
2801 NULL
2802 };
2803 const char *exec_target;
2804 _cleanup_strv_free_ char **env_use = NULL;
2805 int r, which_failed;
2806
2807 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2808 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2809 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2810 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2811 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2812 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2813 * namespace.
2814 *
2815 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2816 * unshare(). See below. */
2817
2818 assert(barrier);
2819 assert(directory);
2820 assert(kmsg_socket >= 0);
2821
2822 log_debug("Inner child is initializing.");
2823
2824 if (arg_userns_mode != USER_NAMESPACE_NO) {
2825 /* Tell the parent, that it now can write the UID map. */
2826 (void) barrier_place(barrier); /* #1 */
2827
2828 /* Wait until the parent wrote the UID map */
2829 if (!barrier_place_and_sync(barrier)) /* #2 */
2830 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2831 "Parent died too early");
2832 }
2833
2834 r = reset_uid_gid();
2835 if (r < 0)
2836 return log_error_errno(r, "Couldn't become new root: %m");
2837
2838 r = mount_all(NULL,
2839 arg_mount_settings | MOUNT_IN_USERNS,
2840 arg_uid_shift,
2841 arg_selinux_apifs_context);
2842 if (r < 0)
2843 return r;
2844
2845 if (!arg_network_namespace_path && arg_private_network) {
2846 r = unshare(CLONE_NEWNET);
2847 if (r < 0)
2848 return log_error_errno(errno, "Failed to unshare network namespace: %m");
2849
2850 /* Tell the parent that it can setup network interfaces. */
2851 (void) barrier_place(barrier); /* #3 */
2852 }
2853
2854 r = mount_sysfs(NULL, arg_mount_settings);
2855 if (r < 0)
2856 return r;
2857
2858 /* Wait until we are cgroup-ified, so that we
2859 * can mount the right cgroup path writable */
2860 if (!barrier_place_and_sync(barrier)) /* #4 */
2861 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2862 "Parent died too early");
2863
2864 if (arg_use_cgns) {
2865 r = unshare(CLONE_NEWCGROUP);
2866 if (r < 0)
2867 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2868 r = mount_cgroups(
2869 "",
2870 arg_unified_cgroup_hierarchy,
2871 arg_userns_mode != USER_NAMESPACE_NO,
2872 arg_uid_shift,
2873 arg_uid_range,
2874 arg_selinux_apifs_context,
2875 true);
2876 if (r < 0)
2877 return r;
2878 } else {
2879 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2880 if (r < 0)
2881 return r;
2882 }
2883
2884 r = setup_boot_id();
2885 if (r < 0)
2886 return r;
2887
2888 r = setup_kmsg(kmsg_socket);
2889 if (r < 0)
2890 return r;
2891 kmsg_socket = safe_close(kmsg_socket);
2892
2893 r = mount_custom(
2894 "/",
2895 arg_custom_mounts,
2896 arg_n_custom_mounts,
2897 false,
2898 0,
2899 0,
2900 arg_selinux_apifs_context,
2901 true);
2902 if (r < 0)
2903 return r;
2904
2905 if (setsid() < 0)
2906 return log_error_errno(errno, "setsid() failed: %m");
2907
2908 if (arg_private_network)
2909 loopback_setup();
2910
2911 if (arg_expose_ports) {
2912 r = expose_port_send_rtnl(rtnl_socket);
2913 if (r < 0)
2914 return r;
2915 rtnl_socket = safe_close(rtnl_socket);
2916 }
2917
2918 r = patch_sysctl();
2919 if (r < 0)
2920 return r;
2921
2922 if (arg_oom_score_adjust_set) {
2923 r = set_oom_score_adjust(arg_oom_score_adjust);
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to adjust OOM score: %m");
2926 }
2927
2928 if (arg_cpuset)
2929 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2930 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2931
2932 (void) setup_hostname();
2933
2934 if (arg_personality != PERSONALITY_INVALID) {
2935 r = safe_personality(arg_personality);
2936 if (r < 0)
2937 return log_error_errno(r, "personality() failed: %m");
2938 } else if (secondary) {
2939 r = safe_personality(PER_LINUX32);
2940 if (r < 0)
2941 return log_error_errno(r, "personality() failed: %m");
2942 }
2943
2944 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2945 if (r < 0)
2946 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2947
2948 #if HAVE_SECCOMP
2949 if (arg_seccomp) {
2950
2951 if (is_seccomp_available()) {
2952
2953 r = seccomp_load(arg_seccomp);
2954 if (IN_SET(r, -EPERM, -EACCES))
2955 return log_error_errno(r, "Failed to install seccomp filter: %m");
2956 if (r < 0)
2957 log_debug_errno(r, "Failed to install seccomp filter: %m");
2958 }
2959 } else
2960 #endif
2961 {
2962 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
2963 if (r < 0)
2964 return r;
2965 }
2966
2967 #if HAVE_SELINUX
2968 if (arg_selinux_context)
2969 if (setexeccon(arg_selinux_context) < 0)
2970 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2971 #endif
2972
2973 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
2974 * if we need to later on. */
2975 if (prctl(PR_SET_KEEPCAPS, 1) < 0)
2976 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
2977
2978 if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
2979 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids);
2980 else
2981 r = change_uid_gid(arg_user, &home);
2982 if (r < 0)
2983 return r;
2984
2985 r = drop_capabilities(getuid());
2986 if (r < 0)
2987 return log_error_errno(r, "Dropping capabilities failed: %m");
2988
2989 if (arg_no_new_privileges)
2990 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2991 return log_error_errno(errno, "Failed to disable new privileges: %m");
2992
2993 /* LXC sets container=lxc, so follow the scheme here */
2994 envp[n_env++] = strjoina("container=", arg_container_service_name);
2995
2996 envp[n_env] = strv_find_prefix(environ, "TERM=");
2997 if (envp[n_env])
2998 n_env++;
2999
3000 if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3001 if (asprintf((char**)(envp + n_env++), "HOME=%s", home ?: "/root") < 0)
3002 return log_oom();
3003
3004 if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3005 if (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ?: "root") < 0 ||
3006 asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3007 return log_oom();
3008
3009 assert(!sd_id128_is_null(arg_uuid));
3010
3011 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
3012 return log_oom();
3013
3014 if (fdset_size(fds) > 0) {
3015 r = fdset_cloexec(fds, false);
3016 if (r < 0)
3017 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3018
3019 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3020 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3021 return log_oom();
3022 }
3023 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3024 return log_oom();
3025
3026 env_use = strv_env_merge(2, envp, arg_setenv);
3027 if (!env_use)
3028 return log_oom();
3029
3030 /* Let the parent know that we are ready and
3031 * wait until the parent is ready with the
3032 * setup, too... */
3033 if (!barrier_place_and_sync(barrier)) /* #5 */
3034 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3035 "Parent died too early");
3036
3037 if (arg_chdir)
3038 if (chdir(arg_chdir) < 0)
3039 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3040
3041 if (arg_start_mode == START_PID2) {
3042 r = stub_pid1(arg_uuid);
3043 if (r < 0)
3044 return r;
3045 }
3046
3047 log_debug("Inner child completed, invoking payload.");
3048
3049 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3050 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3051 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3052 log_close();
3053 log_set_open_when_needed(true);
3054
3055 (void) fdset_close_others(fds);
3056
3057 if (arg_start_mode == START_BOOT) {
3058 char **a;
3059 size_t m;
3060
3061 /* Automatically search for the init system */
3062
3063 m = strv_length(arg_parameters);
3064 a = newa(char*, m + 2);
3065 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3066 a[1 + m] = NULL;
3067
3068 a[0] = (char*) "/usr/lib/systemd/systemd";
3069 execve(a[0], a, env_use);
3070
3071 a[0] = (char*) "/lib/systemd/systemd";
3072 execve(a[0], a, env_use);
3073
3074 a[0] = (char*) "/sbin/init";
3075 execve(a[0], a, env_use);
3076
3077 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3078 } else if (!strv_isempty(arg_parameters)) {
3079 const char *dollar_path;
3080
3081 exec_target = arg_parameters[0];
3082
3083 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3084 * binary. */
3085 dollar_path = strv_env_get(env_use, "PATH");
3086 if (dollar_path) {
3087 if (putenv((char*) dollar_path) != 0)
3088 return log_error_errno(errno, "Failed to update $PATH: %m");
3089 }
3090
3091 execvpe(arg_parameters[0], arg_parameters, env_use);
3092 } else {
3093 if (!arg_chdir)
3094 /* If we cannot change the directory, we'll end up in /, that is expected. */
3095 (void) chdir(home ?: "/root");
3096
3097 execle("/bin/bash", "-bash", NULL, env_use);
3098 execle("/bin/sh", "-sh", NULL, env_use);
3099
3100 exec_target = "/bin/bash, /bin/sh";
3101 }
3102
3103 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3104 }
3105
3106 static int setup_sd_notify_child(void) {
3107 _cleanup_close_ int fd = -1;
3108 union sockaddr_union sa = {
3109 .un.sun_family = AF_UNIX,
3110 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3111 };
3112 int r;
3113
3114 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3115 if (fd < 0)
3116 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3117
3118 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3119 (void) sockaddr_un_unlink(&sa.un);
3120
3121 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3122 if (r < 0)
3123 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3124
3125 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3126 if (r < 0)
3127 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3128
3129 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3130 if (r < 0)
3131 return log_error_errno(r, "SO_PASSCRED failed: %m");
3132
3133 return TAKE_FD(fd);
3134 }
3135
3136 static int outer_child(
3137 Barrier *barrier,
3138 const char *directory,
3139 const char *console,
3140 DissectedImage *dissected_image,
3141 bool secondary,
3142 int pid_socket,
3143 int uuid_socket,
3144 int notify_socket,
3145 int kmsg_socket,
3146 int rtnl_socket,
3147 int uid_shift_socket,
3148 int unified_cgroup_hierarchy_socket,
3149 FDSet *fds,
3150 int netns_fd) {
3151
3152 _cleanup_close_ int fd = -1;
3153 pid_t pid;
3154 ssize_t l;
3155 int r;
3156
3157 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
3158 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
3159 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
3160 * initializations a second child (the "inner" one) is forked off it, and it exits. */
3161
3162 assert(barrier);
3163 assert(directory);
3164 assert(pid_socket >= 0);
3165 assert(uuid_socket >= 0);
3166 assert(notify_socket >= 0);
3167 assert(kmsg_socket >= 0);
3168
3169 log_debug("Outer child is initializing.");
3170
3171 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3172 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3173
3174 if (arg_console_mode != CONSOLE_PIPE) {
3175 int terminal;
3176
3177 assert(console);
3178
3179 terminal = open_terminal(console, O_RDWR);
3180 if (terminal < 0)
3181 return log_error_errno(terminal, "Failed to open console: %m");
3182
3183 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
3184 r = log_dup_console();
3185 if (r < 0)
3186 return log_error_errno(r, "Failed to duplicate stderr: %m");
3187
3188 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
3189 if (r < 0)
3190 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
3191 }
3192
3193 r = reset_audit_loginuid();
3194 if (r < 0)
3195 return r;
3196
3197 /* Mark everything as slave, so that we still
3198 * receive mounts from the real root, but don't
3199 * propagate mounts to the real root. */
3200 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3201 if (r < 0)
3202 return r;
3203
3204 if (dissected_image) {
3205 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
3206 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
3207 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
3208 * makes sure ESP partitions and userns are compatible. */
3209
3210 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3211 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
3212 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
3213 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3214 if (r < 0)
3215 return r;
3216 }
3217
3218 r = determine_uid_shift(directory);
3219 if (r < 0)
3220 return r;
3221
3222 if (arg_userns_mode != USER_NAMESPACE_NO) {
3223 /* Let the parent know which UID shift we read from the image */
3224 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3225 if (l < 0)
3226 return log_error_errno(errno, "Failed to send UID shift: %m");
3227 if (l != sizeof(arg_uid_shift))
3228 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3229 "Short write while sending UID shift.");
3230
3231 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3232 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3233 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3234 * not it will pick a different one, and send it back to us. */
3235
3236 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3237 if (l < 0)
3238 return log_error_errno(errno, "Failed to recv UID shift: %m");
3239 if (l != sizeof(arg_uid_shift))
3240 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3241 "Short read while receiving UID shift.");
3242 }
3243
3244 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3245 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3246 }
3247
3248 if (!dissected_image) {
3249 /* Turn directory into bind mount */
3250 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3251 if (r < 0)
3252 return r;
3253 }
3254
3255 r = setup_pivot_root(
3256 directory,
3257 arg_pivot_root_new,
3258 arg_pivot_root_old);
3259 if (r < 0)
3260 return r;
3261
3262 r = setup_volatile_mode(
3263 directory,
3264 arg_volatile_mode,
3265 arg_userns_mode != USER_NAMESPACE_NO,
3266 arg_uid_shift,
3267 arg_uid_range,
3268 arg_selinux_context);
3269 if (r < 0)
3270 return r;
3271
3272 if (dissected_image) {
3273 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3274 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
3275 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
3276 if (r < 0)
3277 return r;
3278 }
3279
3280 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3281 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3282
3283 r = detect_unified_cgroup_hierarchy_from_image(directory);
3284 if (r < 0)
3285 return r;
3286
3287 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3288 if (l < 0)
3289 return log_error_errno(errno, "Failed to send cgroup mode: %m");
3290 if (l != sizeof(arg_unified_cgroup_hierarchy))
3291 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3292 "Short write while sending cgroup mode.");
3293
3294 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3295 }
3296
3297 /* Mark everything as shared so our mounts get propagated down. This is
3298 * required to make new bind mounts available in systemd services
3299 * inside the containter that create a new mount namespace.
3300 * See https://github.com/systemd/systemd/issues/3860
3301 * Further submounts (such as /dev) done after this will inherit the
3302 * shared propagation mode. */
3303 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3304 if (r < 0)
3305 return r;
3306
3307 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3308 if (r < 0)
3309 return r;
3310
3311 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3312 if (r < 0)
3313 return r;
3314
3315 if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
3316 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3317 if (r < 0)
3318 return log_error_errno(r, "Failed to make tree read-only: %m");
3319 }
3320
3321 r = mount_all(directory,
3322 arg_mount_settings,
3323 arg_uid_shift,
3324 arg_selinux_apifs_context);
3325 if (r < 0)
3326 return r;
3327
3328 r = copy_devnodes(directory);
3329 if (r < 0)
3330 return r;
3331
3332 r = make_extra_nodes(directory);
3333 if (r < 0)
3334 return r;
3335
3336 (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3337 (void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
3338
3339 r = setup_pts(directory);
3340 if (r < 0)
3341 return r;
3342
3343 r = setup_propagate(directory);
3344 if (r < 0)
3345 return r;
3346
3347 r = setup_dev_console(directory, console);
3348 if (r < 0)
3349 return r;
3350
3351 r = setup_keyring();
3352 if (r < 0)
3353 return r;
3354
3355 r = setup_timezone(directory);
3356 if (r < 0)
3357 return r;
3358
3359 r = setup_resolv_conf(directory);
3360 if (r < 0)
3361 return r;
3362
3363 r = setup_machine_id(directory);
3364 if (r < 0)
3365 return r;
3366
3367 r = setup_journal(directory);
3368 if (r < 0)
3369 return r;
3370
3371 r = mount_custom(
3372 directory,
3373 arg_custom_mounts,
3374 arg_n_custom_mounts,
3375 arg_userns_mode != USER_NAMESPACE_NO,
3376 arg_uid_shift,
3377 arg_uid_range,
3378 arg_selinux_apifs_context,
3379 false);
3380 if (r < 0)
3381 return r;
3382
3383 if (!arg_use_cgns) {
3384 r = mount_cgroups(
3385 directory,
3386 arg_unified_cgroup_hierarchy,
3387 arg_userns_mode != USER_NAMESPACE_NO,
3388 arg_uid_shift,
3389 arg_uid_range,
3390 arg_selinux_apifs_context,
3391 false);
3392 if (r < 0)
3393 return r;
3394 }
3395
3396 r = mount_move_root(directory);
3397 if (r < 0)
3398 return log_error_errno(r, "Failed to move root directory: %m");
3399
3400 fd = setup_sd_notify_child();
3401 if (fd < 0)
3402 return fd;
3403
3404 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3405 arg_clone_ns_flags |
3406 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3407 if (pid < 0)
3408 return log_error_errno(errno, "Failed to fork inner child: %m");
3409 if (pid == 0) {
3410 pid_socket = safe_close(pid_socket);
3411 uuid_socket = safe_close(uuid_socket);
3412 notify_socket = safe_close(notify_socket);
3413 uid_shift_socket = safe_close(uid_shift_socket);
3414
3415 /* The inner child has all namespaces that are
3416 * requested, so that we all are owned by the user if
3417 * user namespaces are turned on. */
3418
3419 if (arg_network_namespace_path) {
3420 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3421 if (r < 0)
3422 return log_error_errno(r, "Failed to join network namespace: %m");
3423 }
3424
3425 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3426 if (r < 0)
3427 _exit(EXIT_FAILURE);
3428
3429 _exit(EXIT_SUCCESS);
3430 }
3431
3432 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3433 if (l < 0)
3434 return log_error_errno(errno, "Failed to send PID: %m");
3435 if (l != sizeof(pid))
3436 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3437 "Short write while sending PID.");
3438
3439 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3440 if (l < 0)
3441 return log_error_errno(errno, "Failed to send machine ID: %m");
3442 if (l != sizeof(arg_uuid))
3443 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3444 "Short write while sending machine ID.");
3445
3446 l = send_one_fd(notify_socket, fd, 0);
3447 if (l < 0)
3448 return log_error_errno(errno, "Failed to send notify fd: %m");
3449
3450 pid_socket = safe_close(pid_socket);
3451 uuid_socket = safe_close(uuid_socket);
3452 notify_socket = safe_close(notify_socket);
3453 kmsg_socket = safe_close(kmsg_socket);
3454 rtnl_socket = safe_close(rtnl_socket);
3455 netns_fd = safe_close(netns_fd);
3456
3457 return 0;
3458 }
3459
3460 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3461 bool tried_hashed = false;
3462 unsigned n_tries = 100;
3463 uid_t candidate;
3464 int r;
3465
3466 assert(shift);
3467 assert(ret_lock_file);
3468 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3469 assert(arg_uid_range == 0x10000U);
3470
3471 candidate = *shift;
3472
3473 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3474
3475 for (;;) {
3476 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3477 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3478
3479 if (--n_tries <= 0)
3480 return -EBUSY;
3481
3482 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3483 goto next;
3484 if ((candidate & UINT32_C(0xFFFF)) != 0)
3485 goto next;
3486
3487 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3488 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3489 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3490 goto next;
3491 if (r < 0)
3492 return r;
3493
3494 /* Make some superficial checks whether the range is currently known in the user database */
3495 if (getpwuid(candidate))
3496 goto next;
3497 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3498 goto next;
3499 if (getgrgid(candidate))
3500 goto next;
3501 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3502 goto next;
3503
3504 *ret_lock_file = lf;
3505 lf = (struct LockFile) LOCK_FILE_INIT;
3506 *shift = candidate;
3507 return 0;
3508
3509 next:
3510 if (arg_machine && !tried_hashed) {
3511 /* Try to hash the base from the container name */
3512
3513 static const uint8_t hash_key[] = {
3514 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3515 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3516 };
3517
3518 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3519
3520 tried_hashed = true;
3521 } else
3522 random_bytes(&candidate, sizeof(candidate));
3523
3524 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3525 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3526 }
3527 }
3528
3529 static int setup_uid_map(pid_t pid) {
3530 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3531 int r;
3532
3533 assert(pid > 1);
3534
3535 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3536 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3537 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3538 if (r < 0)
3539 return log_error_errno(r, "Failed to write UID map: %m");
3540
3541 /* We always assign the same UID and GID ranges */
3542 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3543 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3544 if (r < 0)
3545 return log_error_errno(r, "Failed to write GID map: %m");
3546
3547 return 0;
3548 }
3549
3550 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3551 char buf[NOTIFY_BUFFER_MAX+1];
3552 char *p = NULL;
3553 struct iovec iovec = {
3554 .iov_base = buf,
3555 .iov_len = sizeof(buf)-1,
3556 };
3557 union {
3558 struct cmsghdr cmsghdr;
3559 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3560 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3561 } control = {};
3562 struct msghdr msghdr = {
3563 .msg_iov = &iovec,
3564 .msg_iovlen = 1,
3565 .msg_control = &control,
3566 .msg_controllen = sizeof(control),
3567 };
3568 struct cmsghdr *cmsg;
3569 struct ucred *ucred = NULL;
3570 ssize_t n;
3571 pid_t inner_child_pid;
3572 _cleanup_strv_free_ char **tags = NULL;
3573
3574 assert(userdata);
3575
3576 inner_child_pid = PTR_TO_PID(userdata);
3577
3578 if (revents != EPOLLIN) {
3579 log_warning("Got unexpected poll event for notify fd.");
3580 return 0;
3581 }
3582
3583 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3584 if (n < 0) {
3585 if (IN_SET(errno, EAGAIN, EINTR))
3586 return 0;
3587
3588 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3589 }
3590 cmsg_close_all(&msghdr);
3591
3592 CMSG_FOREACH(cmsg, &msghdr) {
3593 if (cmsg->cmsg_level == SOL_SOCKET &&
3594 cmsg->cmsg_type == SCM_CREDENTIALS &&
3595 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3596
3597 ucred = (struct ucred*) CMSG_DATA(cmsg);
3598 }
3599 }
3600
3601 if (!ucred || ucred->pid != inner_child_pid) {
3602 log_debug("Received notify message without valid credentials. Ignoring.");
3603 return 0;
3604 }
3605
3606 if ((size_t) n >= sizeof(buf)) {
3607 log_warning("Received notify message exceeded maximum size. Ignoring.");
3608 return 0;
3609 }
3610
3611 buf[n] = 0;
3612 tags = strv_split(buf, "\n\r");
3613 if (!tags)
3614 return log_oom();
3615
3616 if (strv_find(tags, "READY=1"))
3617 (void) sd_notifyf(false, "READY=1\n");
3618
3619 p = strv_find_startswith(tags, "STATUS=");
3620 if (p)
3621 (void) sd_notifyf(false, "STATUS=Container running: %s", p);
3622
3623 return 0;
3624 }
3625
3626 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3627 int r;
3628
3629 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3630 if (r < 0)
3631 return log_error_errno(r, "Failed to allocate notify event source: %m");
3632
3633 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3634
3635 return 0;
3636 }
3637
3638 static int merge_settings(Settings *settings, const char *path) {
3639 int rl;
3640
3641 assert(settings);
3642 assert(path);
3643
3644 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3645 * that this steals the fields of the Settings* structure, and hence modifies it. */
3646
3647 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3648 settings->start_mode >= 0) {
3649 arg_start_mode = settings->start_mode;
3650 strv_free_and_replace(arg_parameters, settings->parameters);
3651 }
3652
3653 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3654 arg_ephemeral = settings->ephemeral;
3655
3656 if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
3657 settings->root) {
3658
3659 if (!arg_settings_trusted)
3660 log_warning("Ignoring root directory setting, file %s is not trusted.", path);
3661 else
3662 free_and_replace(arg_directory, settings->root);
3663 }
3664
3665 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3666 settings->pivot_root_new) {
3667 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3668 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3669 }
3670
3671 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3672 settings->working_directory)
3673 free_and_replace(arg_chdir, settings->working_directory);
3674
3675 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3676 settings->environment)
3677 strv_free_and_replace(arg_setenv, settings->environment);
3678
3679 if ((arg_settings_mask & SETTING_USER) == 0) {
3680
3681 if (settings->user)
3682 free_and_replace(arg_user, settings->user);
3683
3684 if (uid_is_valid(settings->uid))
3685 arg_uid = settings->uid;
3686 if (gid_is_valid(settings->gid))
3687 arg_gid = settings->gid;
3688 if (settings->n_supplementary_gids > 0) {
3689 free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
3690 arg_n_supplementary_gids = settings->n_supplementary_gids;
3691 }
3692 }
3693
3694 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3695 uint64_t plus, minus;
3696
3697 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
3698 * Settings structure */
3699
3700 plus = settings->capability;
3701 minus = settings->drop_capability;
3702
3703 if ((arg_settings_mask & SETTING_NETWORK) == 0) {
3704 if (settings_private_network(settings))
3705 plus |= UINT64_C(1) << CAP_NET_ADMIN;
3706 else
3707 minus |= UINT64_C(1) << CAP_NET_ADMIN;
3708 }
3709
3710 if (!arg_settings_trusted && plus != 0) {
3711 if (settings->capability != 0)
3712 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3713 } else
3714 arg_caps_retain |= plus;
3715
3716 arg_caps_retain &= ~minus;
3717
3718 /* Copy the full capabilities over too */
3719 if (capability_quintet_is_set(&settings->full_capabilities)) {
3720 if (!arg_settings_trusted)
3721 log_warning("Ignoring capabilitiy settings, file %s is not trusted.", path);
3722 else
3723 arg_full_capabilities = settings->full_capabilities;
3724 }
3725 }
3726
3727 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3728 settings->kill_signal > 0)
3729 arg_kill_signal = settings->kill_signal;
3730
3731 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3732 settings->personality != PERSONALITY_INVALID)
3733 arg_personality = settings->personality;
3734
3735 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3736 !sd_id128_is_null(settings->machine_id)) {
3737
3738 if (!arg_settings_trusted)
3739 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3740 else
3741 arg_uuid = settings->machine_id;
3742 }
3743
3744 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3745 settings->read_only >= 0)
3746 arg_read_only = settings->read_only;
3747
3748 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3749 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3750 arg_volatile_mode = settings->volatile_mode;
3751
3752 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3753 settings->n_custom_mounts > 0) {
3754
3755 if (!arg_settings_trusted)
3756 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3757 else {
3758 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3759 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3760 arg_n_custom_mounts = settings->n_custom_mounts;
3761 settings->n_custom_mounts = 0;
3762 }
3763 }
3764
3765 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3766 (settings->private_network >= 0 ||
3767 settings->network_veth >= 0 ||
3768 settings->network_bridge ||
3769 settings->network_zone ||
3770 settings->network_interfaces ||
3771 settings->network_macvlan ||
3772 settings->network_ipvlan ||
3773 settings->network_veth_extra ||
3774 settings->network_namespace_path)) {
3775
3776 if (!arg_settings_trusted)
3777 log_warning("Ignoring network settings, file %s is not trusted.", path);
3778 else {
3779 arg_network_veth = settings_network_veth(settings);
3780 arg_private_network = settings_private_network(settings);
3781
3782 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3783 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3784 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3785 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3786
3787 free_and_replace(arg_network_bridge, settings->network_bridge);
3788 free_and_replace(arg_network_zone, settings->network_zone);
3789
3790 free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
3791 }
3792 }
3793
3794 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3795 settings->expose_ports) {
3796
3797 if (!arg_settings_trusted)
3798 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3799 else {
3800 expose_port_free_all(arg_expose_ports);
3801 arg_expose_ports = TAKE_PTR(settings->expose_ports);
3802 }
3803 }
3804
3805 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3806 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3807
3808 if (!arg_settings_trusted)
3809 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3810 else {
3811 arg_userns_mode = settings->userns_mode;
3812 arg_uid_shift = settings->uid_shift;
3813 arg_uid_range = settings->uid_range;
3814 arg_userns_chown = settings->userns_chown;
3815 }
3816 }
3817
3818 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3819 arg_notify_ready = settings->notify_ready;
3820
3821 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3822
3823 if (!arg_settings_trusted && !strv_isempty(settings->syscall_whitelist))
3824 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3825 else {
3826 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3827 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3828 }
3829
3830 #if HAVE_SECCOMP
3831 if (!arg_settings_trusted && settings->seccomp)
3832 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
3833 else {
3834 seccomp_release(arg_seccomp);
3835 arg_seccomp = TAKE_PTR(settings->seccomp);
3836 }
3837 #endif
3838 }
3839
3840 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3841 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3842 continue;
3843
3844 if (!settings->rlimit[rl])
3845 continue;
3846
3847 if (!arg_settings_trusted) {
3848 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3849 continue;
3850 }
3851
3852 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3853 }
3854
3855 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3856 settings->hostname)
3857 free_and_replace(arg_hostname, settings->hostname);
3858
3859 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3860 settings->no_new_privileges >= 0)
3861 arg_no_new_privileges = settings->no_new_privileges;
3862
3863 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3864 settings->oom_score_adjust_set) {
3865
3866 if (!arg_settings_trusted)
3867 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3868 else {
3869 arg_oom_score_adjust = settings->oom_score_adjust;
3870 arg_oom_score_adjust_set = true;
3871 }
3872 }
3873
3874 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3875 settings->cpuset) {
3876
3877 if (!arg_settings_trusted)
3878 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3879 else {
3880 if (arg_cpuset)
3881 CPU_FREE(arg_cpuset);
3882 arg_cpuset = TAKE_PTR(settings->cpuset);
3883 arg_cpuset_ncpus = settings->cpuset_ncpus;
3884 }
3885 }
3886
3887 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3888 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3889 arg_resolv_conf = settings->resolv_conf;
3890
3891 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3892 settings->link_journal != _LINK_JOURNAL_INVALID) {
3893
3894 if (!arg_settings_trusted)
3895 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3896 else {
3897 arg_link_journal = settings->link_journal;
3898 arg_link_journal_try = settings->link_journal_try;
3899 }
3900 }
3901
3902 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3903 settings->timezone != _TIMEZONE_MODE_INVALID)
3904 arg_timezone = settings->timezone;
3905
3906 if ((arg_settings_mask & SETTING_SLICE) == 0 &&
3907 settings->slice) {
3908
3909 if (!arg_settings_trusted)
3910 log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
3911 else
3912 free_and_replace(arg_slice, settings->slice);
3913 }
3914
3915 if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
3916 settings->use_cgns >= 0) {
3917
3918 if (!arg_settings_trusted)
3919 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
3920 else
3921 arg_use_cgns = settings->use_cgns;
3922 }
3923
3924 if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
3925 settings->clone_ns_flags != (unsigned long) -1) {
3926
3927 if (!arg_settings_trusted)
3928 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
3929 else
3930 arg_clone_ns_flags = settings->clone_ns_flags;
3931 }
3932
3933 if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
3934 settings->console_mode >= 0) {
3935
3936 if (!arg_settings_trusted)
3937 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
3938 else
3939 arg_console_mode = settings->console_mode;
3940 }
3941
3942 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
3943 * don't consult arg_settings_mask for them. */
3944
3945 sd_bus_message_unref(arg_property_message);
3946 arg_property_message = TAKE_PTR(settings->properties);
3947
3948 arg_console_width = settings->console_width;
3949 arg_console_height = settings->console_height;
3950
3951 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
3952 arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
3953 arg_n_extra_nodes = settings->n_extra_nodes;
3954
3955 return 0;
3956 }
3957
3958 static int load_settings(void) {
3959 _cleanup_(settings_freep) Settings *settings = NULL;
3960 _cleanup_fclose_ FILE *f = NULL;
3961 _cleanup_free_ char *p = NULL;
3962 const char *fn, *i;
3963 int r;
3964
3965 if (arg_oci_bundle)
3966 return 0;
3967
3968 /* If all settings are masked, there's no point in looking for
3969 * the settings file */
3970 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3971 return 0;
3972
3973 fn = strjoina(arg_machine, ".nspawn");
3974
3975 /* We first look in the admin's directories in /etc and /run */
3976 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3977 _cleanup_free_ char *j = NULL;
3978
3979 j = strjoin(i, "/", fn);
3980 if (!j)
3981 return log_oom();
3982
3983 f = fopen(j, "re");
3984 if (f) {
3985 p = TAKE_PTR(j);
3986
3987 /* By default, we trust configuration from /etc and /run */
3988 if (arg_settings_trusted < 0)
3989 arg_settings_trusted = true;
3990
3991 break;
3992 }
3993
3994 if (errno != ENOENT)
3995 return log_error_errno(errno, "Failed to open %s: %m", j);
3996 }
3997
3998 if (!f) {
3999 /* After that, let's look for a file next to the
4000 * actual image we shall boot. */
4001
4002 if (arg_image) {
4003 p = file_in_same_dir(arg_image, fn);
4004 if (!p)
4005 return log_oom();
4006 } else if (arg_directory) {
4007 p = file_in_same_dir(arg_directory, fn);
4008 if (!p)
4009 return log_oom();
4010 }
4011
4012 if (p) {
4013 f = fopen(p, "re");
4014 if (!f && errno != ENOENT)
4015 return log_error_errno(errno, "Failed to open %s: %m", p);
4016
4017 /* By default, we do not trust configuration from /var/lib/machines */
4018 if (arg_settings_trusted < 0)
4019 arg_settings_trusted = false;
4020 }
4021 }
4022
4023 if (!f)
4024 return 0;
4025
4026 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4027
4028 r = settings_load(f, p, &settings);
4029 if (r < 0)
4030 return r;
4031
4032 return merge_settings(settings, p);
4033 }
4034
4035 static int load_oci_bundle(void) {
4036 _cleanup_(settings_freep) Settings *settings = NULL;
4037 int r;
4038
4039 if (!arg_oci_bundle)
4040 return 0;
4041
4042 /* By default let's trust OCI bundles */
4043 if (arg_settings_trusted < 0)
4044 arg_settings_trusted = true;
4045
4046 r = oci_load(NULL, arg_oci_bundle, &settings);
4047 if (r < 0)
4048 return r;
4049
4050 return merge_settings(settings, arg_oci_bundle);
4051 }
4052
4053 static int run_container(int master,
4054 const char* console,
4055 DissectedImage *dissected_image,
4056 bool secondary,
4057 FDSet *fds,
4058 char veth_name[IFNAMSIZ], bool *veth_created,
4059 union in_addr_union *exposed,
4060 pid_t *pid, int *ret) {
4061
4062 static const struct sigaction sa = {
4063 .sa_handler = nop_signal_handler,
4064 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4065 };
4066
4067 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4068 _cleanup_close_ int etc_passwd_lock = -1;
4069 _cleanup_close_pair_ int
4070 kmsg_socket_pair[2] = { -1, -1 },
4071 rtnl_socket_pair[2] = { -1, -1 },
4072 pid_socket_pair[2] = { -1, -1 },
4073 uuid_socket_pair[2] = { -1, -1 },
4074 notify_socket_pair[2] = { -1, -1 },
4075 uid_shift_socket_pair[2] = { -1, -1 },
4076 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4077
4078 _cleanup_close_ int notify_socket= -1;
4079 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4080 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4081 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4082 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4083 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4084 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4085 ContainerStatus container_status = 0;
4086 int ifi = 0, r;
4087 ssize_t l;
4088 sigset_t mask_chld;
4089 _cleanup_close_ int netns_fd = -1;
4090
4091 assert_se(sigemptyset(&mask_chld) == 0);
4092 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4093
4094 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4095 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4096 * check with getpwuid() if the specific user already exists. Note that /etc might be
4097 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4098 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4099 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4100 * really ours. */
4101
4102 etc_passwd_lock = take_etc_passwd_lock(NULL);
4103 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4104 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4105 }
4106
4107 r = barrier_create(&barrier);
4108 if (r < 0)
4109 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4110
4111 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4112 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4113
4114 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4115 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4116
4117 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4118 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4119
4120 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4121 return log_error_errno(errno, "Failed to create id socket pair: %m");
4122
4123 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4124 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4125
4126 if (arg_userns_mode != USER_NAMESPACE_NO)
4127 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4128 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4129
4130 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4131 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4132 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4133
4134 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4135 * parent's blocking calls and give it a chance to call wait() and terminate. */
4136 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4137 if (r < 0)
4138 return log_error_errno(errno, "Failed to change the signal mask: %m");
4139
4140 r = sigaction(SIGCHLD, &sa, NULL);
4141 if (r < 0)
4142 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4143
4144 if (arg_network_namespace_path) {
4145 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4146 if (netns_fd < 0)
4147 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4148
4149 r = fd_is_network_ns(netns_fd);
4150 if (r == -EUCLEAN)
4151 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4152 else if (r < 0)
4153 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4154 else if (r == 0)
4155 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4156 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4157 }
4158
4159 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4160 if (*pid < 0)
4161 return log_error_errno(errno, "clone() failed%s: %m",
4162 errno == EINVAL ?
4163 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4164
4165 if (*pid == 0) {
4166 /* The outer child only has a file system namespace. */
4167 barrier_set_role(&barrier, BARRIER_CHILD);
4168
4169 master = safe_close(master);
4170
4171 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4172 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4173 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4174 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4175 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4176 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4177 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4178
4179 (void) reset_all_signal_handlers();
4180 (void) reset_signal_mask();
4181
4182 r = outer_child(&barrier,
4183 arg_directory,
4184 console,
4185 dissected_image,
4186 secondary,
4187 pid_socket_pair[1],
4188 uuid_socket_pair[1],
4189 notify_socket_pair[1],
4190 kmsg_socket_pair[1],
4191 rtnl_socket_pair[1],
4192 uid_shift_socket_pair[1],
4193 unified_cgroup_hierarchy_socket_pair[1],
4194 fds,
4195 netns_fd);
4196 if (r < 0)
4197 _exit(EXIT_FAILURE);
4198
4199 _exit(EXIT_SUCCESS);
4200 }
4201
4202 barrier_set_role(&barrier, BARRIER_PARENT);
4203
4204 fdset_close(fds);
4205
4206 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4207 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4208 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4209 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4210 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4211 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4212 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4213
4214 if (arg_userns_mode != USER_NAMESPACE_NO) {
4215 /* The child just let us know the UID shift it might have read from the image. */
4216 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4217 if (l < 0)
4218 return log_error_errno(errno, "Failed to read UID shift: %m");
4219 if (l != sizeof arg_uid_shift)
4220 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4221
4222 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4223 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4224 * image, but if that's already in use, pick a new one, and report back to the child,
4225 * which one we now picked. */
4226
4227 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4228 if (r < 0)
4229 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4230
4231 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4232 if (l < 0)
4233 return log_error_errno(errno, "Failed to send UID shift: %m");
4234 if (l != sizeof arg_uid_shift)
4235 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4236 }
4237 }
4238
4239 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4240 /* The child let us know the support cgroup mode it might have read from the image. */
4241 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4242 if (l < 0)
4243 return log_error_errno(errno, "Failed to read cgroup mode: %m");
4244 if (l != sizeof(arg_unified_cgroup_hierarchy))
4245 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4246 l, l == 0 ? " The child is most likely dead." : "");
4247 }
4248
4249 /* Wait for the outer child. */
4250 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4251 if (r < 0)
4252 return r;
4253 if (r != EXIT_SUCCESS)
4254 return -EIO;
4255
4256 /* And now retrieve the PID of the inner child. */
4257 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4258 if (l < 0)
4259 return log_error_errno(errno, "Failed to read inner child PID: %m");
4260 if (l != sizeof *pid)
4261 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4262
4263 /* We also retrieve container UUID in case it was generated by outer child */
4264 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4265 if (l < 0)
4266 return log_error_errno(errno, "Failed to read container machine ID: %m");
4267 if (l != sizeof(arg_uuid))
4268 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4269
4270 /* We also retrieve the socket used for notifications generated by outer child */
4271 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4272 if (notify_socket < 0)
4273 return log_error_errno(notify_socket,
4274 "Failed to receive notification socket from the outer child: %m");
4275
4276 log_debug("Init process invoked as PID "PID_FMT, *pid);
4277
4278 if (arg_userns_mode != USER_NAMESPACE_NO) {
4279 if (!barrier_place_and_sync(&barrier)) /* #1 */
4280 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4281
4282 r = setup_uid_map(*pid);
4283 if (r < 0)
4284 return r;
4285
4286 (void) barrier_place(&barrier); /* #2 */
4287 }
4288
4289 if (arg_private_network) {
4290 if (!arg_network_namespace_path) {
4291 /* Wait until the child has unshared its network namespace. */
4292 if (!barrier_place_and_sync(&barrier)) /* #3 */
4293 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4294 }
4295
4296 r = move_network_interfaces(*pid, arg_network_interfaces);
4297 if (r < 0)
4298 return r;
4299
4300 if (arg_network_veth) {
4301 r = setup_veth(arg_machine, *pid, veth_name,
4302 arg_network_bridge || arg_network_zone);
4303 if (r < 0)
4304 return r;
4305 else if (r > 0)
4306 ifi = r;
4307
4308 if (arg_network_bridge) {
4309 /* Add the interface to a bridge */
4310 r = setup_bridge(veth_name, arg_network_bridge, false);
4311 if (r < 0)
4312 return r;
4313 if (r > 0)
4314 ifi = r;
4315 } else if (arg_network_zone) {
4316 /* Add the interface to a bridge, possibly creating it */
4317 r = setup_bridge(veth_name, arg_network_zone, true);
4318 if (r < 0)
4319 return r;
4320 if (r > 0)
4321 ifi = r;
4322 }
4323 }
4324
4325 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
4326 if (r < 0)
4327 return r;
4328
4329 /* We created the primary and extra veth links now; let's remember this, so that we know to
4330 remove them later on. Note that we don't bother with removing veth links that were created
4331 here when their setup failed half-way, because in that case the kernel should be able to
4332 remove them on its own, since they cannot be referenced by anything yet. */
4333 *veth_created = true;
4334
4335 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
4336 if (r < 0)
4337 return r;
4338
4339 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
4340 if (r < 0)
4341 return r;
4342 }
4343
4344 if (arg_register || !arg_keep_unit) {
4345 r = sd_bus_default_system(&bus);
4346 if (r < 0)
4347 return log_error_errno(r, "Failed to open system bus: %m");
4348
4349 r = sd_bus_set_close_on_exit(bus, false);
4350 if (r < 0)
4351 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
4352 }
4353
4354 if (!arg_keep_unit) {
4355 /* When a new scope is created for this container, then we'll be registered as its controller, in which
4356 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
4357 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
4358
4359 r = sd_bus_match_signal_async(
4360 bus,
4361 NULL,
4362 "org.freedesktop.systemd1",
4363 NULL,
4364 "org.freedesktop.systemd1.Scope",
4365 "RequestStop",
4366 on_request_stop, NULL, PID_TO_PTR(*pid));
4367 if (r < 0)
4368 return log_error_errno(r, "Failed to request RequestStop match: %m");
4369 }
4370
4371 if (arg_register) {
4372 r = register_machine(
4373 bus,
4374 arg_machine,
4375 *pid,
4376 arg_directory,
4377 arg_uuid,
4378 ifi,
4379 arg_slice,
4380 arg_custom_mounts, arg_n_custom_mounts,
4381 arg_kill_signal,
4382 arg_property,
4383 arg_property_message,
4384 arg_keep_unit,
4385 arg_container_service_name);
4386 if (r < 0)
4387 return r;
4388
4389 } else if (!arg_keep_unit) {
4390 r = allocate_scope(
4391 bus,
4392 arg_machine,
4393 *pid,
4394 arg_slice,
4395 arg_custom_mounts, arg_n_custom_mounts,
4396 arg_kill_signal,
4397 arg_property,
4398 arg_property_message);
4399 if (r < 0)
4400 return r;
4401
4402 } else if (arg_slice || arg_property)
4403 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
4404
4405 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
4406 if (r < 0)
4407 return r;
4408
4409 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4410 if (r < 0)
4411 return r;
4412
4413 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
4414 if (r < 0)
4415 return r;
4416
4417 /* Notify the child that the parent is ready with all
4418 * its setup (including cgroup-ification), and that
4419 * the child can now hand over control to the code to
4420 * run inside the container. */
4421 (void) barrier_place(&barrier); /* #4 */
4422
4423 /* Block SIGCHLD here, before notifying child.
4424 * process_pty() will handle it with the other signals. */
4425 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4426
4427 /* Reset signal to default */
4428 r = default_signals(SIGCHLD, -1);
4429 if (r < 0)
4430 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4431
4432 r = sd_event_new(&event);
4433 if (r < 0)
4434 return log_error_errno(r, "Failed to get default event source: %m");
4435
4436 (void) sd_event_set_watchdog(event, true);
4437
4438 if (bus) {
4439 r = sd_bus_attach_event(bus, event, 0);
4440 if (r < 0)
4441 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4442 }
4443
4444 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
4445 if (r < 0)
4446 return r;
4447
4448 /* Let the child know that we are ready and wait that the child is completely ready now. */
4449 if (!barrier_place_and_sync(&barrier)) /* #5 */
4450 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4451
4452 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4453 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4454 etc_passwd_lock = safe_close(etc_passwd_lock);
4455
4456 (void) sd_notifyf(false,
4457 "STATUS=Container running.\n"
4458 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4459 if (!arg_notify_ready)
4460 (void) sd_notify(false, "READY=1\n");
4461
4462 if (arg_kill_signal > 0) {
4463 /* Try to kill the init system on SIGINT or SIGTERM */
4464 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4465 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4466 } else {
4467 /* Immediately exit */
4468 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4469 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4470 }
4471
4472 /* Exit when the child exits */
4473 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4474
4475 if (arg_expose_ports) {
4476 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4477 if (r < 0)
4478 return r;
4479
4480 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4481 }
4482
4483 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4484
4485 if (IN_SET(arg_console_mode, CONSOLE_INTERACTIVE, CONSOLE_READ_ONLY)) {
4486 assert(master >= 0);
4487
4488 r = pty_forward_new(event, master,
4489 PTY_FORWARD_IGNORE_VHANGUP | (arg_console_mode == CONSOLE_READ_ONLY ? PTY_FORWARD_READ_ONLY : 0),
4490 &forward);
4491 if (r < 0)
4492 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4493
4494 if (arg_console_width != (unsigned) -1 || arg_console_height != (unsigned) -1)
4495 (void) pty_forward_set_width_height(forward, arg_console_width, arg_console_height);
4496 }
4497
4498 r = sd_event_loop(event);
4499 if (r < 0)
4500 return log_error_errno(r, "Failed to run event loop: %m");
4501
4502 if (forward) {
4503 char last_char = 0;
4504
4505 (void) pty_forward_get_last_char(forward, &last_char);
4506 forward = pty_forward_free(forward);
4507
4508 if (!arg_quiet && last_char != '\n')
4509 putc('\n', stdout);
4510 }
4511
4512 /* Kill if it is not dead yet anyway */
4513 if (bus) {
4514 if (arg_register)
4515 terminate_machine(bus, arg_machine);
4516 else if (!arg_keep_unit)
4517 terminate_scope(bus, arg_machine);
4518 }
4519
4520 /* Normally redundant, but better safe than sorry */
4521 (void) kill(*pid, SIGKILL);
4522
4523 r = wait_for_container(*pid, &container_status);
4524 *pid = 0;
4525
4526 if (r < 0)
4527 /* We failed to wait for the container, or the container exited abnormally. */
4528 return r;
4529 if (r > 0 || container_status == CONTAINER_TERMINATED) {
4530 /* r > 0 → The container exited with a non-zero status.
4531 * As a special case, we need to replace 133 with a different value,
4532 * because 133 is special-cased in the service file to reboot the container.
4533 * otherwise → The container exited with zero status and a reboot was not requested.
4534 */
4535 if (r == EXIT_FORCE_RESTART)
4536 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4537 *ret = r;
4538 return 0; /* finito */
4539 }
4540
4541 /* CONTAINER_REBOOTED, loop again */
4542
4543 if (arg_keep_unit) {
4544 /* Special handling if we are running as a service: instead of simply
4545 * restarting the machine we want to restart the entire service, so let's
4546 * inform systemd about this with the special exit code 133. The service
4547 * file uses RestartForceExitStatus=133 so that this results in a full
4548 * nspawn restart. This is necessary since we might have cgroup parameters
4549 * set we want to have flushed out. */
4550 *ret = EXIT_FORCE_RESTART;
4551 return 0; /* finito */
4552 }
4553
4554 expose_port_flush(arg_expose_ports, exposed);
4555
4556 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4557 *veth_created = false;
4558 return 1; /* loop again */
4559 }
4560
4561 static int initialize_rlimits(void) {
4562 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4563 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4564 * container execution environments. */
4565
4566 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4567 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4568 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4569 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4570 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4571 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4572 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4573 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4574 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4575 [RLIMIT_NICE] = { 0, 0 },
4576 [RLIMIT_NOFILE] = { 1024, 4096 },
4577 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4578 [RLIMIT_RTPRIO] = { 0, 0 },
4579 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4580 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4581
4582 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4583 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4584 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4585 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4586 * that PID 1 changes a number of other resource limits during early initialization which is why we
4587 * don't read the other limits from PID 1 but prefer the static table above. */
4588 };
4589
4590 int rl;
4591
4592 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4593 /* Let's only fill in what the user hasn't explicitly configured anyway */
4594 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4595 const struct rlimit *v;
4596 struct rlimit buffer;
4597
4598 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4599 /* For these two let's read the limits off PID 1. See above for an explanation. */
4600
4601 if (prlimit(1, rl, NULL, &buffer) < 0)
4602 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4603
4604 v = &buffer;
4605 } else
4606 v = kernel_defaults + rl;
4607
4608 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4609 if (!arg_rlimit[rl])
4610 return log_oom();
4611 }
4612
4613 if (DEBUG_LOGGING) {
4614 _cleanup_free_ char *k = NULL;
4615
4616 (void) rlimit_format(arg_rlimit[rl], &k);
4617 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4618 }
4619 }
4620
4621 return 0;
4622 }
4623
4624 static int run(int argc, char *argv[]) {
4625 _cleanup_free_ char *console = NULL;
4626 _cleanup_close_ int master = -1;
4627 _cleanup_fdset_free_ FDSet *fds = NULL;
4628 int r, n_fd_passed, ret = EXIT_SUCCESS;
4629 char veth_name[IFNAMSIZ] = "";
4630 bool secondary = false, remove_directory = false, remove_image = false;
4631 pid_t pid = 0;
4632 union in_addr_union exposed = {};
4633 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4634 bool veth_created = false, remove_tmprootdir = false;
4635 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4636 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4637 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4638 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4639
4640 log_parse_environment();
4641 log_open();
4642
4643 r = parse_argv(argc, argv);
4644 if (r <= 0)
4645 goto finish;
4646
4647 r = must_be_root();
4648 if (r < 0)
4649 goto finish;
4650
4651 r = initialize_rlimits();
4652 if (r < 0)
4653 goto finish;
4654
4655 r = load_oci_bundle();
4656 if (r < 0)
4657 goto finish;
4658
4659 r = determine_names();
4660 if (r < 0)
4661 goto finish;
4662
4663 r = load_settings();
4664 if (r < 0)
4665 goto finish;
4666
4667 r = cg_unified_flush();
4668 if (r < 0) {
4669 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4670 goto finish;
4671 }
4672
4673 r = verify_arguments();
4674 if (r < 0)
4675 goto finish;
4676
4677 r = detect_unified_cgroup_hierarchy_from_environment();
4678 if (r < 0)
4679 goto finish;
4680
4681 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4682 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4683 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4684 (void) ignore_signals(SIGPIPE, -1);
4685
4686 n_fd_passed = sd_listen_fds(false);
4687 if (n_fd_passed > 0) {
4688 r = fdset_new_listen_fds(&fds, false);
4689 if (r < 0) {
4690 log_error_errno(r, "Failed to collect file descriptors: %m");
4691 goto finish;
4692 }
4693 }
4694
4695 /* The "default" umask. This is appropriate for most file and directory
4696 * operations performed by nspawn, and is the umask that will be used for
4697 * the child. Functions like copy_devnodes() change the umask temporarily. */
4698 umask(0022);
4699
4700 if (arg_directory) {
4701 assert(!arg_image);
4702
4703 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4704 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4705 r = -EINVAL;
4706 goto finish;
4707 }
4708
4709 if (arg_ephemeral) {
4710 _cleanup_free_ char *np = NULL;
4711
4712 r = chase_symlinks_and_update(&arg_directory, 0);
4713 if (r < 0)
4714 goto finish;
4715
4716 /* If the specified path is a mount point we
4717 * generate the new snapshot immediately
4718 * inside it under a random name. However if
4719 * the specified is not a mount point we
4720 * create the new snapshot in the parent
4721 * directory, just next to it. */
4722 r = path_is_mount_point(arg_directory, NULL, 0);
4723 if (r < 0) {
4724 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4725 goto finish;
4726 }
4727 if (r > 0)
4728 r = tempfn_random_child(arg_directory, "machine.", &np);
4729 else
4730 r = tempfn_random(arg_directory, "machine.", &np);
4731 if (r < 0) {
4732 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4733 goto finish;
4734 }
4735
4736 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4737 if (r < 0) {
4738 log_error_errno(r, "Failed to lock %s: %m", np);
4739 goto finish;
4740 }
4741
4742 r = btrfs_subvol_snapshot(arg_directory, np,
4743 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4744 BTRFS_SNAPSHOT_FALLBACK_COPY |
4745 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4746 BTRFS_SNAPSHOT_RECURSIVE |
4747 BTRFS_SNAPSHOT_QUOTA);
4748 if (r < 0) {
4749 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4750 goto finish;
4751 }
4752
4753 free_and_replace(arg_directory, np);
4754
4755 remove_directory = true;
4756
4757 } else {
4758 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4759 if (r < 0)
4760 goto finish;
4761
4762 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4763 if (r == -EBUSY) {
4764 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4765 goto finish;
4766 }
4767 if (r < 0) {
4768 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4769 goto finish;
4770 }
4771
4772 if (arg_template) {
4773 r = chase_symlinks_and_update(&arg_template, 0);
4774 if (r < 0)
4775 goto finish;
4776
4777 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4778 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4779 BTRFS_SNAPSHOT_FALLBACK_COPY |
4780 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4781 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4782 BTRFS_SNAPSHOT_RECURSIVE |
4783 BTRFS_SNAPSHOT_QUOTA);
4784 if (r == -EEXIST)
4785 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4786 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4787 else if (r < 0) {
4788 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4789 goto finish;
4790 } else
4791 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4792 "Populated %s from template %s.", arg_directory, arg_template);
4793 }
4794 }
4795
4796 if (arg_start_mode == START_BOOT) {
4797 const char *p;
4798
4799 if (arg_pivot_root_new)
4800 p = prefix_roota(arg_directory, arg_pivot_root_new);
4801 else
4802 p = arg_directory;
4803
4804 if (path_is_os_tree(p) <= 0) {
4805 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4806 r = -EINVAL;
4807 goto finish;
4808 }
4809 } else {
4810 const char *p, *q;
4811
4812 if (arg_pivot_root_new)
4813 p = prefix_roota(arg_directory, arg_pivot_root_new);
4814 else
4815 p = arg_directory;
4816
4817 q = strjoina(p, "/usr/");
4818
4819 if (laccess(q, F_OK) < 0) {
4820 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4821 r = -EINVAL;
4822 goto finish;
4823 }
4824 }
4825
4826 } else {
4827 assert(arg_image);
4828 assert(!arg_template);
4829
4830 r = chase_symlinks_and_update(&arg_image, 0);
4831 if (r < 0)
4832 goto finish;
4833
4834 if (arg_ephemeral) {
4835 _cleanup_free_ char *np = NULL;
4836
4837 r = tempfn_random(arg_image, "machine.", &np);
4838 if (r < 0) {
4839 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4840 goto finish;
4841 }
4842
4843 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4844 if (r < 0) {
4845 r = log_error_errno(r, "Failed to create image lock: %m");
4846 goto finish;
4847 }
4848
4849 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
4850 if (r < 0) {
4851 r = log_error_errno(r, "Failed to copy image file: %m");
4852 goto finish;
4853 }
4854
4855 free_and_replace(arg_image, np);
4856
4857 remove_image = true;
4858 } else {
4859 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4860 if (r == -EBUSY) {
4861 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4862 goto finish;
4863 }
4864 if (r < 0) {
4865 r = log_error_errno(r, "Failed to create image lock: %m");
4866 goto finish;
4867 }
4868
4869 if (!arg_root_hash) {
4870 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4871 if (r < 0) {
4872 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4873 goto finish;
4874 }
4875 }
4876 }
4877
4878 if (!mkdtemp(tmprootdir)) {
4879 r = log_error_errno(errno, "Failed to create temporary directory: %m");
4880 goto finish;
4881 }
4882
4883 remove_tmprootdir = true;
4884
4885 arg_directory = strdup(tmprootdir);
4886 if (!arg_directory) {
4887 r = log_oom();
4888 goto finish;
4889 }
4890
4891 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4892 if (r < 0) {
4893 log_error_errno(r, "Failed to set up loopback block device: %m");
4894 goto finish;
4895 }
4896
4897 r = dissect_image_and_warn(
4898 loop->fd,
4899 arg_image,
4900 arg_root_hash, arg_root_hash_size,
4901 DISSECT_IMAGE_REQUIRE_ROOT,
4902 &dissected_image);
4903 if (r == -ENOPKG) {
4904 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
4905 log_notice("Note that the disk image needs to\n"
4906 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4907 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4908 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4909 " d) or contain a file system without a partition table\n"
4910 "in order to be bootable with systemd-nspawn.");
4911 goto finish;
4912 }
4913 if (r < 0)
4914 goto finish;
4915
4916 if (!arg_root_hash && dissected_image->can_verity)
4917 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4918
4919 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
4920 if (r < 0)
4921 goto finish;
4922
4923 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4924 if (remove_image && unlink(arg_image) >= 0)
4925 remove_image = false;
4926 }
4927
4928 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
4929 if (r < 0)
4930 goto finish;
4931
4932 if (arg_console_mode < 0)
4933 arg_console_mode =
4934 isatty(STDIN_FILENO) > 0 &&
4935 isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
4936
4937 if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
4938 arg_quiet = true;
4939
4940 if (arg_console_mode != CONSOLE_PIPE) {
4941 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
4942 if (master < 0) {
4943 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4944 goto finish;
4945 }
4946
4947 r = ptsname_malloc(master, &console);
4948 if (r < 0) {
4949 r = log_error_errno(r, "Failed to determine tty name: %m");
4950 goto finish;
4951 }
4952
4953 if (arg_selinux_apifs_context) {
4954 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4955 if (r < 0)
4956 goto finish;
4957 }
4958
4959 if (unlockpt(master) < 0) {
4960 r = log_error_errno(errno, "Failed to unlock tty: %m");
4961 goto finish;
4962 }
4963 }
4964
4965 if (!arg_quiet)
4966 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4967 arg_machine, arg_image ?: arg_directory);
4968
4969 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4970
4971 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
4972 r = log_error_errno(errno, "Failed to become subreaper: %m");
4973 goto finish;
4974 }
4975
4976 for (;;) {
4977 r = run_container(master,
4978 console,
4979 dissected_image,
4980 secondary,
4981 fds,
4982 veth_name, &veth_created,
4983 &exposed,
4984 &pid, &ret);
4985 if (r <= 0)
4986 break;
4987 }
4988
4989 finish:
4990 (void) sd_notify(false,
4991 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4992 "STOPPING=1\nSTATUS=Terminating...");
4993
4994 if (pid > 0)
4995 (void) kill(pid, SIGKILL);
4996
4997 /* Try to flush whatever is still queued in the pty */
4998 if (master >= 0) {
4999 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
5000 master = safe_close(master);
5001 }
5002
5003 if (pid > 0)
5004 (void) wait_for_terminate(pid, NULL);
5005
5006 pager_close();
5007
5008 if (remove_directory && arg_directory) {
5009 int k;
5010
5011 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5012 if (k < 0)
5013 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5014 }
5015
5016 if (remove_image && arg_image) {
5017 if (unlink(arg_image) < 0)
5018 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5019 }
5020
5021 if (remove_tmprootdir) {
5022 if (rmdir(tmprootdir) < 0)
5023 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5024 }
5025
5026 if (arg_machine) {
5027 const char *p;
5028
5029 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5030 (void) rm_rf(p, REMOVE_ROOT);
5031 }
5032
5033 expose_port_flush(arg_expose_ports, &exposed);
5034
5035 if (veth_created)
5036 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5037 (void) remove_bridge(arg_network_zone);
5038
5039 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5040 expose_port_free_all(arg_expose_ports);
5041 rlimit_free_all(arg_rlimit);
5042 device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5043
5044 if (r < 0)
5045 return r;
5046
5047 return ret;
5048 }
5049
/* Hand run() to the main-function helper macro. run() returns a negative errno-style value when it
 * failed internally, and otherwise the value of 'ret' (which it passes by pointer to run_container()).
 * NOTE(review): presumably the macro generates main() and treats positive return values as failure
 * exit codes too, as its name suggests — confirm against main-func.h. */
5050 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);