]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: add --volatile=overlay support
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_BLKID
4 #include <blkid.h>
5 #endif
6 #include <errno.h>
7 #include <getopt.h>
8 #include <grp.h>
9 #include <linux/fs.h>
10 #include <linux/loop.h>
11 #include <pwd.h>
12 #include <sched.h>
13 #if HAVE_SELINUX
14 #include <selinux/selinux.h>
15 #endif
16 #include <signal.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/personality.h>
22 #include <sys/prctl.h>
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 #include <unistd.h>
26
27 #include "sd-bus.h"
28 #include "sd-daemon.h"
29 #include "sd-id128.h"
30
31 #include "alloc-util.h"
32 #include "barrier.h"
33 #include "base-filesystem.h"
34 #include "blkid-util.h"
35 #include "btrfs-util.h"
36 #include "bus-error.h"
37 #include "bus-util.h"
38 #include "cap-list.h"
39 #include "capability-util.h"
40 #include "cgroup-util.h"
41 #include "copy.h"
42 #include "cpu-set-util.h"
43 #include "dev-setup.h"
44 #include "dissect-image.h"
45 #include "env-util.h"
46 #include "fd-util.h"
47 #include "fdset.h"
48 #include "fileio.h"
49 #include "format-util.h"
50 #include "fs-util.h"
51 #include "gpt.h"
52 #include "hexdecoct.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "log.h"
56 #include "loop-util.h"
57 #include "loopback-setup.h"
58 #include "machine-image.h"
59 #include "macro.h"
60 #include "missing.h"
61 #include "mkdir.h"
62 #include "mount-util.h"
63 #include "mountpoint-util.h"
64 #include "netlink-util.h"
65 #include "nspawn-cgroup.h"
66 #include "nspawn-def.h"
67 #include "nspawn-expose-ports.h"
68 #include "nspawn-mount.h"
69 #include "nspawn-network.h"
70 #include "nspawn-patch-uid.h"
71 #include "nspawn-register.h"
72 #include "nspawn-seccomp.h"
73 #include "nspawn-settings.h"
74 #include "nspawn-setuid.h"
75 #include "nspawn-stub-pid1.h"
76 #include "os-util.h"
77 #include "pager.h"
78 #include "parse-util.h"
79 #include "path-util.h"
80 #include "pretty-print.h"
81 #include "process-util.h"
82 #include "ptyfwd.h"
83 #include "random-util.h"
84 #include "raw-clone.h"
85 #include "rlimit-util.h"
86 #include "rm-rf.h"
87 #include "selinux-util.h"
88 #include "signal-util.h"
89 #include "socket-util.h"
90 #include "stat-util.h"
91 #include "stdio-util.h"
92 #include "string-table.h"
93 #include "string-util.h"
94 #include "strv.h"
95 #include "terminal-util.h"
96 #include "tmpfile-util.h"
97 #include "umask-util.h"
98 #include "user-util.h"
99 #include "util.h"
100
/* Path of the static resolv.conf; the prefix depends on whether the build uses a split /usr. */
#if HAVE_SPLIT_USR
#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
#else
#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
#endif

/* nspawn listens on the socket at NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container; the
 * container's init process can send messages to nspawn over this socket, following the sd_notify(3)
 * protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

/* NOTE(review): presumably a process exit status with special meaning to nspawn; its use is not visible
 * in this part of the file -- confirm against the code that checks it. */
#define EXIT_FORCE_RESTART 133
113
/* Result states for a container run. The numeric values are the ones the compiler would assign
 * implicitly; they are only spelled out here for readability. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
118
119 static char *arg_directory = NULL;
120 static char *arg_template = NULL;
121 static char *arg_chdir = NULL;
122 static char *arg_pivot_root_new = NULL;
123 static char *arg_pivot_root_old = NULL;
124 static char *arg_user = NULL;
125 static sd_id128_t arg_uuid = {};
126 static char *arg_machine = NULL; /* The name used by the host to refer to this */
127 static char *arg_hostname = NULL; /* The name the payload sees by default */
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static StartMode arg_start_mode = START_PID1;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
137 static uint64_t arg_caps_retain =
138 (1ULL << CAP_AUDIT_CONTROL) |
139 (1ULL << CAP_AUDIT_WRITE) |
140 (1ULL << CAP_CHOWN) |
141 (1ULL << CAP_DAC_OVERRIDE) |
142 (1ULL << CAP_DAC_READ_SEARCH) |
143 (1ULL << CAP_FOWNER) |
144 (1ULL << CAP_FSETID) |
145 (1ULL << CAP_IPC_OWNER) |
146 (1ULL << CAP_KILL) |
147 (1ULL << CAP_LEASE) |
148 (1ULL << CAP_LINUX_IMMUTABLE) |
149 (1ULL << CAP_MKNOD) |
150 (1ULL << CAP_NET_BIND_SERVICE) |
151 (1ULL << CAP_NET_BROADCAST) |
152 (1ULL << CAP_NET_RAW) |
153 (1ULL << CAP_SETFCAP) |
154 (1ULL << CAP_SETGID) |
155 (1ULL << CAP_SETPCAP) |
156 (1ULL << CAP_SETUID) |
157 (1ULL << CAP_SYS_ADMIN) |
158 (1ULL << CAP_SYS_BOOT) |
159 (1ULL << CAP_SYS_CHROOT) |
160 (1ULL << CAP_SYS_NICE) |
161 (1ULL << CAP_SYS_PTRACE) |
162 (1ULL << CAP_SYS_RESOURCE) |
163 (1ULL << CAP_SYS_TTY_CONFIG);
164 static CustomMount *arg_custom_mounts = NULL;
165 static size_t arg_n_custom_mounts = 0;
166 static char **arg_setenv = NULL;
167 static bool arg_quiet = false;
168 static bool arg_register = true;
169 static bool arg_keep_unit = false;
170 static char **arg_network_interfaces = NULL;
171 static char **arg_network_macvlan = NULL;
172 static char **arg_network_ipvlan = NULL;
173 static bool arg_network_veth = false;
174 static char **arg_network_veth_extra = NULL;
175 static char *arg_network_bridge = NULL;
176 static char *arg_network_zone = NULL;
177 static char *arg_network_namespace_path = NULL;
178 static unsigned long arg_personality = PERSONALITY_INVALID;
179 static char *arg_image = NULL;
180 static VolatileMode arg_volatile_mode = VOLATILE_NO;
181 static ExposePort *arg_expose_ports = NULL;
182 static char **arg_property = NULL;
183 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
184 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
185 static bool arg_userns_chown = false;
186 static int arg_kill_signal = 0;
187 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
188 static SettingsMask arg_settings_mask = 0;
189 static int arg_settings_trusted = -1;
190 static char **arg_parameters = NULL;
191 static const char *arg_container_service_name = "systemd-nspawn";
192 static bool arg_notify_ready = false;
193 static bool arg_use_cgns = true;
194 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
195 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
196 static void *arg_root_hash = NULL;
197 static size_t arg_root_hash_size = 0;
198 static char **arg_syscall_whitelist = NULL;
199 static char **arg_syscall_blacklist = NULL;
200 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
201 static bool arg_no_new_privileges = false;
202 static int arg_oom_score_adjust = 0;
203 static bool arg_oom_score_adjust_set = false;
204 static cpu_set_t *arg_cpuset = NULL;
205 static unsigned arg_cpuset_ncpus = 0;
206 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
207 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
208
209 static int help(void) {
210 _cleanup_free_ char *link = NULL;
211 int r;
212
213 (void) pager_open(false);
214
215 r = terminal_urlify_man("systemd-nspawn", "1", &link);
216 if (r < 0)
217 return log_oom();
218
219 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
220 "Spawn a command or OS in a light-weight container.\n\n"
221 " -h --help Show this help\n"
222 " --version Print version string\n"
223 " -q --quiet Do not show status information\n"
224 " -D --directory=PATH Root directory for the container\n"
225 " --template=PATH Initialize root directory from template directory,\n"
226 " if missing\n"
227 " -x --ephemeral Run container with snapshot of root directory, and\n"
228 " remove it after exit\n"
229 " -i --image=PATH File system device or disk image for the container\n"
230 " --root-hash=HASH Specify verity root hash\n"
231 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
232 " -b --boot Boot up full system (i.e. invoke init)\n"
233 " --chdir=PATH Set working directory in the container\n"
234 " --pivot-root=PATH[:PATH]\n"
235 " Pivot root to given directory in the container\n"
236 " -u --user=USER Run the command under specified user or uid\n"
237 " -M --machine=NAME Set the machine name for the container\n"
238 " --hostname=NAME Override the hostname for the container\n"
239 " --uuid=UUID Set a specific machine UUID for the container\n"
240 " -S --slice=SLICE Place the container in the specified slice\n"
241 " --property=NAME=VALUE Set scope unit property\n"
242 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
243 " --private-users[=UIDBASE[:NUIDS]]\n"
244 " Similar, but with user configured UID/GID range\n"
245 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
246 " --private-network Disable network in container\n"
247 " --network-interface=INTERFACE\n"
248 " Assign an existing network interface to the\n"
249 " container\n"
250 " --network-macvlan=INTERFACE\n"
251 " Create a macvlan network interface based on an\n"
252 " existing network interface to the container\n"
253 " --network-ipvlan=INTERFACE\n"
254 " Create a ipvlan network interface based on an\n"
255 " existing network interface to the container\n"
256 " -n --network-veth Add a virtual Ethernet connection between host\n"
257 " and container\n"
258 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
259 " Add an additional virtual Ethernet link between\n"
260 " host and container\n"
261 " --network-bridge=INTERFACE\n"
262 " Add a virtual Ethernet connection to the container\n"
263 " and attach it to an existing bridge on the host\n"
264 " --network-zone=NAME Similar, but attach the new interface to an\n"
265 " an automatically managed bridge interface\n"
266 " --network-namespace-path=PATH\n"
267 " Set network namespace to the one represented by\n"
268 " the specified kernel namespace file node\n"
269 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
270 " Expose a container IP port on the host\n"
271 " -Z --selinux-context=SECLABEL\n"
272 " Set the SELinux security context to be used by\n"
273 " processes in the container\n"
274 " -L --selinux-apifs-context=SECLABEL\n"
275 " Set the SELinux security context to be used by\n"
276 " API/tmpfs file systems in the container\n"
277 " --capability=CAP In addition to the default, retain specified\n"
278 " capability\n"
279 " --drop-capability=CAP Drop the specified capability from the default set\n"
280 " --system-call-filter=LIST|~LIST\n"
281 " Permit/prohibit specific system calls\n"
282 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
283 " --oom-score-adjust=VALUE\n"
284 " Adjust the OOM score value for the payload\n"
285 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
286 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
287 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
288 " host, try-guest, try-host\n"
289 " -j Equivalent to --link-journal=try-guest\n"
290 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
291 " --timezone=MODE Select mode of /etc/localtime initialization\n"
292 " --read-only Mount the root directory read-only\n"
293 " --bind=PATH[:PATH[:OPTIONS]]\n"
294 " Bind mount a file or directory from the host into\n"
295 " the container\n"
296 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
297 " Similar, but creates a read-only bind mount\n"
298 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
299 " --overlay=PATH[:PATH...]:PATH\n"
300 " Create an overlay mount from the host to \n"
301 " the container\n"
302 " --overlay-ro=PATH[:PATH...]:PATH\n"
303 " Similar, but creates a read-only overlay mount\n"
304 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
305 " --register=BOOLEAN Register container as machine\n"
306 " --keep-unit Do not register a scope for the machine, reuse\n"
307 " the service unit nspawn is running in\n"
308 " --volatile[=MODE] Run the system in volatile mode\n"
309 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
310 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
311 "\nSee the %s for details.\n"
312 , program_invocation_short_name
313 , link
314 );
315
316 return 0;
317 }
318
319 static int custom_mount_check_all(void) {
320 size_t i;
321
322 for (i = 0; i < arg_n_custom_mounts; i++) {
323 CustomMount *m = &arg_custom_mounts[i];
324
325 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
326 if (arg_userns_chown)
327 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
328 "--private-users-chown may not be combined with custom root mounts.");
329 else if (arg_uid_shift == UID_INVALID)
330 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
331 "--private-users with automatic UID shift may not be combined with custom root mounts.");
332 }
333 }
334
335 return 0;
336 }
337
338 static int detect_unified_cgroup_hierarchy_from_environment(void) {
339 const char *e;
340 int r;
341
342 /* Allow the user to control whether the unified hierarchy is used */
343 e = getenv("UNIFIED_CGROUP_HIERARCHY");
344 if (e) {
345 r = parse_boolean(e);
346 if (r < 0)
347 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
348 if (r > 0)
349 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
350 else
351 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
352 }
353
354 return 0;
355 }
356
357 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
358 int r;
359
360 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
361 * image actually supports. */
362 r = cg_all_unified();
363 if (r < 0)
364 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
365 if (r > 0) {
366 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
367 * routine only detects 231, so we'll have a false negative here for 230. */
368 r = systemd_installation_has_version(directory, 230);
369 if (r < 0)
370 return log_error_errno(r, "Failed to determine systemd version in container: %m");
371 if (r > 0)
372 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
373 else
374 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
375 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
376 /* Mixed cgroup hierarchy support was added in 233 */
377 r = systemd_installation_has_version(directory, 233);
378 if (r < 0)
379 return log_error_errno(r, "Failed to determine systemd version in container: %m");
380 if (r > 0)
381 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
382 else
383 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
384 } else
385 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
386
387 log_debug("Using %s hierarchy for container.",
388 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
389 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
390
391 return 0;
392 }
393
394 static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
395 int r;
396
397 r = getenv_bool(name);
398 if (r == -ENXIO)
399 return;
400 if (r < 0)
401 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
402 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
403 }
404
405 static void parse_mount_settings_env(void) {
406 const char *e;
407 int r;
408
409 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
410 if (r >= 0)
411 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
412 else if (r != -ENXIO)
413 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
414
415 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
416 if (!e)
417 return;
418
419 if (streq(e, "network")) {
420 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
421 return;
422 }
423
424 r = parse_boolean(e);
425 if (r < 0) {
426 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
427 return;
428 }
429
430 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
431 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
432 }
433
434 static void parse_environment(void) {
435 const char *e;
436 int r;
437
438 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
439 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
440 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
441 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
442
443 parse_mount_settings_env();
444
445 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
446 * even if it is supported. If not supported, it has no effect. */
447 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
448 if (r == 0 || !cg_ns_supported())
449 arg_use_cgns = false;
450
451 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
452 if (e)
453 arg_container_service_name = e;
454
455 detect_unified_cgroup_hierarchy_from_environment();
456 }
457
458 static int parse_argv(int argc, char *argv[]) {
459 enum {
460 ARG_VERSION = 0x100,
461 ARG_PRIVATE_NETWORK,
462 ARG_UUID,
463 ARG_READ_ONLY,
464 ARG_CAPABILITY,
465 ARG_DROP_CAPABILITY,
466 ARG_LINK_JOURNAL,
467 ARG_BIND,
468 ARG_BIND_RO,
469 ARG_TMPFS,
470 ARG_OVERLAY,
471 ARG_OVERLAY_RO,
472 ARG_SHARE_SYSTEM,
473 ARG_REGISTER,
474 ARG_KEEP_UNIT,
475 ARG_NETWORK_INTERFACE,
476 ARG_NETWORK_MACVLAN,
477 ARG_NETWORK_IPVLAN,
478 ARG_NETWORK_BRIDGE,
479 ARG_NETWORK_ZONE,
480 ARG_NETWORK_VETH_EXTRA,
481 ARG_NETWORK_NAMESPACE_PATH,
482 ARG_PERSONALITY,
483 ARG_VOLATILE,
484 ARG_TEMPLATE,
485 ARG_PROPERTY,
486 ARG_PRIVATE_USERS,
487 ARG_KILL_SIGNAL,
488 ARG_SETTINGS,
489 ARG_CHDIR,
490 ARG_PIVOT_ROOT,
491 ARG_PRIVATE_USERS_CHOWN,
492 ARG_NOTIFY_READY,
493 ARG_ROOT_HASH,
494 ARG_SYSTEM_CALL_FILTER,
495 ARG_RLIMIT,
496 ARG_HOSTNAME,
497 ARG_NO_NEW_PRIVILEGES,
498 ARG_OOM_SCORE_ADJUST,
499 ARG_CPU_AFFINITY,
500 ARG_RESOLV_CONF,
501 ARG_TIMEZONE,
502 };
503
504 static const struct option options[] = {
505 { "help", no_argument, NULL, 'h' },
506 { "version", no_argument, NULL, ARG_VERSION },
507 { "directory", required_argument, NULL, 'D' },
508 { "template", required_argument, NULL, ARG_TEMPLATE },
509 { "ephemeral", no_argument, NULL, 'x' },
510 { "user", required_argument, NULL, 'u' },
511 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
512 { "as-pid2", no_argument, NULL, 'a' },
513 { "boot", no_argument, NULL, 'b' },
514 { "uuid", required_argument, NULL, ARG_UUID },
515 { "read-only", no_argument, NULL, ARG_READ_ONLY },
516 { "capability", required_argument, NULL, ARG_CAPABILITY },
517 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
518 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
519 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
520 { "bind", required_argument, NULL, ARG_BIND },
521 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
522 { "tmpfs", required_argument, NULL, ARG_TMPFS },
523 { "overlay", required_argument, NULL, ARG_OVERLAY },
524 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
525 { "machine", required_argument, NULL, 'M' },
526 { "hostname", required_argument, NULL, ARG_HOSTNAME },
527 { "slice", required_argument, NULL, 'S' },
528 { "setenv", required_argument, NULL, 'E' },
529 { "selinux-context", required_argument, NULL, 'Z' },
530 { "selinux-apifs-context", required_argument, NULL, 'L' },
531 { "quiet", no_argument, NULL, 'q' },
532 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
533 { "register", required_argument, NULL, ARG_REGISTER },
534 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
535 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
536 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
537 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
538 { "network-veth", no_argument, NULL, 'n' },
539 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
540 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
541 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
542 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
543 { "personality", required_argument, NULL, ARG_PERSONALITY },
544 { "image", required_argument, NULL, 'i' },
545 { "volatile", optional_argument, NULL, ARG_VOLATILE },
546 { "port", required_argument, NULL, 'p' },
547 { "property", required_argument, NULL, ARG_PROPERTY },
548 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
549 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
550 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
551 { "settings", required_argument, NULL, ARG_SETTINGS },
552 { "chdir", required_argument, NULL, ARG_CHDIR },
553 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
554 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
555 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
556 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
557 { "rlimit", required_argument, NULL, ARG_RLIMIT },
558 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
559 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
560 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
561 { "timezone", required_argument, NULL, ARG_TIMEZONE },
562 {}
563 };
564
565 int c, r;
566 const char *p;
567 uint64_t plus = 0, minus = 0;
568 bool mask_all_settings = false, mask_no_settings = false;
569
570 assert(argc >= 0);
571 assert(argv);
572
573 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
574 switch (c) {
575
576 case 'h':
577 return help();
578
579 case ARG_VERSION:
580 return version();
581
582 case 'D':
583 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
584 if (r < 0)
585 return r;
586 break;
587
588 case ARG_TEMPLATE:
589 r = parse_path_argument_and_warn(optarg, false, &arg_template);
590 if (r < 0)
591 return r;
592 break;
593
594 case 'i':
595 r = parse_path_argument_and_warn(optarg, false, &arg_image);
596 if (r < 0)
597 return r;
598 break;
599
600 case 'x':
601 arg_ephemeral = true;
602 arg_settings_mask |= SETTING_EPHEMERAL;
603 break;
604
605 case 'u':
606 r = free_and_strdup(&arg_user, optarg);
607 if (r < 0)
608 return log_oom();
609
610 arg_settings_mask |= SETTING_USER;
611 break;
612
613 case ARG_NETWORK_ZONE: {
614 char *j;
615
616 j = strappend("vz-", optarg);
617 if (!j)
618 return log_oom();
619
620 if (!ifname_valid(j)) {
621 log_error("Network zone name not valid: %s", j);
622 free(j);
623 return -EINVAL;
624 }
625
626 free_and_replace(arg_network_zone, j);
627
628 arg_network_veth = true;
629 arg_private_network = true;
630 arg_settings_mask |= SETTING_NETWORK;
631 break;
632 }
633
634 case ARG_NETWORK_BRIDGE:
635
636 if (!ifname_valid(optarg))
637 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
638 "Bridge interface name not valid: %s", optarg);
639
640 r = free_and_strdup(&arg_network_bridge, optarg);
641 if (r < 0)
642 return log_oom();
643
644 _fallthrough_;
645 case 'n':
646 arg_network_veth = true;
647 arg_private_network = true;
648 arg_settings_mask |= SETTING_NETWORK;
649 break;
650
651 case ARG_NETWORK_VETH_EXTRA:
652 r = veth_extra_parse(&arg_network_veth_extra, optarg);
653 if (r < 0)
654 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
655
656 arg_private_network = true;
657 arg_settings_mask |= SETTING_NETWORK;
658 break;
659
660 case ARG_NETWORK_INTERFACE:
661 if (!ifname_valid(optarg))
662 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
663 "Network interface name not valid: %s", optarg);
664
665 if (strv_extend(&arg_network_interfaces, optarg) < 0)
666 return log_oom();
667
668 arg_private_network = true;
669 arg_settings_mask |= SETTING_NETWORK;
670 break;
671
672 case ARG_NETWORK_MACVLAN:
673
674 if (!ifname_valid(optarg))
675 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
676 "MACVLAN network interface name not valid: %s", optarg);
677
678 if (strv_extend(&arg_network_macvlan, optarg) < 0)
679 return log_oom();
680
681 arg_private_network = true;
682 arg_settings_mask |= SETTING_NETWORK;
683 break;
684
685 case ARG_NETWORK_IPVLAN:
686
687 if (!ifname_valid(optarg))
688 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
689 "IPVLAN network interface name not valid: %s", optarg);
690
691 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
692 return log_oom();
693
694 _fallthrough_;
695 case ARG_PRIVATE_NETWORK:
696 arg_private_network = true;
697 arg_settings_mask |= SETTING_NETWORK;
698 break;
699
700 case ARG_NETWORK_NAMESPACE_PATH:
701 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
702 if (r < 0)
703 return r;
704
705 break;
706
707 case 'b':
708 if (arg_start_mode == START_PID2)
709 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
710 "--boot and --as-pid2 may not be combined.");
711
712 arg_start_mode = START_BOOT;
713 arg_settings_mask |= SETTING_START_MODE;
714 break;
715
716 case 'a':
717 if (arg_start_mode == START_BOOT)
718 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
719 "--boot and --as-pid2 may not be combined.");
720
721 arg_start_mode = START_PID2;
722 arg_settings_mask |= SETTING_START_MODE;
723 break;
724
725 case ARG_UUID:
726 r = sd_id128_from_string(optarg, &arg_uuid);
727 if (r < 0)
728 return log_error_errno(r, "Invalid UUID: %s", optarg);
729
730 if (sd_id128_is_null(arg_uuid))
731 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
732 "Machine UUID may not be all zeroes.");
733
734 arg_settings_mask |= SETTING_MACHINE_ID;
735 break;
736
737 case 'S':
738 arg_slice = optarg;
739 break;
740
741 case 'M':
742 if (isempty(optarg))
743 arg_machine = mfree(arg_machine);
744 else {
745 if (!machine_name_is_valid(optarg))
746 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
747 "Invalid machine name: %s", optarg);
748
749 r = free_and_strdup(&arg_machine, optarg);
750 if (r < 0)
751 return log_oom();
752 }
753 break;
754
755 case ARG_HOSTNAME:
756 if (isempty(optarg))
757 arg_hostname = mfree(arg_hostname);
758 else {
759 if (!hostname_is_valid(optarg, false))
760 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
761 "Invalid hostname: %s", optarg);
762
763 r = free_and_strdup(&arg_hostname, optarg);
764 if (r < 0)
765 return log_oom();
766 }
767
768 arg_settings_mask |= SETTING_HOSTNAME;
769 break;
770
771 case 'Z':
772 arg_selinux_context = optarg;
773 break;
774
775 case 'L':
776 arg_selinux_apifs_context = optarg;
777 break;
778
779 case ARG_READ_ONLY:
780 arg_read_only = true;
781 arg_settings_mask |= SETTING_READ_ONLY;
782 break;
783
784 case ARG_CAPABILITY:
785 case ARG_DROP_CAPABILITY: {
786 p = optarg;
787 for (;;) {
788 _cleanup_free_ char *t = NULL;
789
790 r = extract_first_word(&p, &t, ",", 0);
791 if (r < 0)
792 return log_error_errno(r, "Failed to parse capability %s.", t);
793
794 if (r == 0)
795 break;
796
797 if (streq(t, "all")) {
798 if (c == ARG_CAPABILITY)
799 plus = (uint64_t) -1;
800 else
801 minus = (uint64_t) -1;
802 } else {
803 r = capability_from_name(t);
804 if (r < 0)
805 return log_error_errno(r, "Failed to parse capability %s.", t);
806
807 if (c == ARG_CAPABILITY)
808 plus |= 1ULL << r;
809 else
810 minus |= 1ULL << r;
811 }
812 }
813
814 arg_settings_mask |= SETTING_CAPABILITY;
815 break;
816 }
817
818 case ARG_NO_NEW_PRIVILEGES:
819 r = parse_boolean(optarg);
820 if (r < 0)
821 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
822
823 arg_no_new_privileges = r;
824 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
825 break;
826
827 case 'j':
828 arg_link_journal = LINK_GUEST;
829 arg_link_journal_try = true;
830 arg_settings_mask |= SETTING_LINK_JOURNAL;
831 break;
832
833 case ARG_LINK_JOURNAL:
834 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
835 if (r < 0) {
836 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
837 return -EINVAL;
838 }
839
840 arg_settings_mask |= SETTING_LINK_JOURNAL;
841 break;
842
843 case ARG_BIND:
844 case ARG_BIND_RO:
845 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
846 if (r < 0)
847 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
848
849 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
850 break;
851
852 case ARG_TMPFS:
853 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
854 if (r < 0)
855 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
856
857 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
858 break;
859
860 case ARG_OVERLAY:
861 case ARG_OVERLAY_RO:
862 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
863 if (r == -EADDRNOTAVAIL)
864 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
865 if (r < 0)
866 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
867
868 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
869 break;
870
871 case 'E': {
872 char **n;
873
874 if (!env_assignment_is_valid(optarg))
875 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
876 "Environment variable assignment '%s' is not valid.", optarg);
877
878 n = strv_env_set(arg_setenv, optarg);
879 if (!n)
880 return log_oom();
881
882 strv_free_and_replace(arg_setenv, n);
883 arg_settings_mask |= SETTING_ENVIRONMENT;
884 break;
885 }
886
887 case 'q':
888 arg_quiet = true;
889 break;
890
891 case ARG_SHARE_SYSTEM:
892 /* We don't officially support this anymore, except for compat reasons. People should use the
893 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
894 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
895 arg_clone_ns_flags = 0;
896 break;
897
898 case ARG_REGISTER:
899 r = parse_boolean(optarg);
900 if (r < 0) {
901 log_error("Failed to parse --register= argument: %s", optarg);
902 return r;
903 }
904
905 arg_register = r;
906 break;
907
908 case ARG_KEEP_UNIT:
909 arg_keep_unit = true;
910 break;
911
912 case ARG_PERSONALITY:
913
914 arg_personality = personality_from_string(optarg);
915 if (arg_personality == PERSONALITY_INVALID)
916 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
917 "Unknown or unsupported personality '%s'.", optarg);
918
919 arg_settings_mask |= SETTING_PERSONALITY;
920 break;
921
922 case ARG_VOLATILE:
923
924 if (!optarg)
925 arg_volatile_mode = VOLATILE_YES;
926 else if (streq(optarg, "help")) {
927 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
928 return 0;
929 } else {
930 VolatileMode m;
931
932 m = volatile_mode_from_string(optarg);
933 if (m < 0)
934 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
935 "Failed to parse --volatile= argument: %s", optarg);
936 else
937 arg_volatile_mode = m;
938 }
939
940 arg_settings_mask |= SETTING_VOLATILE_MODE;
941 break;
942
943 case 'p':
944 r = expose_port_parse(&arg_expose_ports, optarg);
945 if (r == -EEXIST)
946 return log_error_errno(r, "Duplicate port specification: %s", optarg);
947 if (r < 0)
948 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
949
950 arg_settings_mask |= SETTING_EXPOSE_PORTS;
951 break;
952
953 case ARG_PROPERTY:
954 if (strv_extend(&arg_property, optarg) < 0)
955 return log_oom();
956
957 break;
958
959 case ARG_PRIVATE_USERS: {
960 int boolean = -1;
961
962 if (!optarg)
963 boolean = true;
964 else if (!in_charset(optarg, DIGITS))
965 /* do *not* parse numbers as booleans */
966 boolean = parse_boolean(optarg);
967
968 if (boolean == false) {
969 /* no: User namespacing off */
970 arg_userns_mode = USER_NAMESPACE_NO;
971 arg_uid_shift = UID_INVALID;
972 arg_uid_range = UINT32_C(0x10000);
973 } else if (boolean == true) {
974 /* yes: User namespacing on, UID range is read from root dir */
975 arg_userns_mode = USER_NAMESPACE_FIXED;
976 arg_uid_shift = UID_INVALID;
977 arg_uid_range = UINT32_C(0x10000);
978 } else if (streq(optarg, "pick")) {
979 /* pick: User namespacing on, UID range is picked randomly */
980 arg_userns_mode = USER_NAMESPACE_PICK;
981 arg_uid_shift = UID_INVALID;
982 arg_uid_range = UINT32_C(0x10000);
983 } else {
984 _cleanup_free_ char *buffer = NULL;
985 const char *range, *shift;
986
987 /* anything else: User namespacing on, UID range is explicitly configured */
988
989 range = strchr(optarg, ':');
990 if (range) {
991 buffer = strndup(optarg, range - optarg);
992 if (!buffer)
993 return log_oom();
994 shift = buffer;
995
996 range++;
997 r = safe_atou32(range, &arg_uid_range);
998 if (r < 0)
999 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1000 } else
1001 shift = optarg;
1002
1003 r = parse_uid(shift, &arg_uid_shift);
1004 if (r < 0)
1005 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1006
1007 arg_userns_mode = USER_NAMESPACE_FIXED;
1008 }
1009
1010 if (arg_uid_range <= 0)
1011 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1012 "UID range cannot be 0.");
1013
1014 arg_settings_mask |= SETTING_USERNS;
1015 break;
1016 }
1017
1018 case 'U':
1019 if (userns_supported()) {
1020 arg_userns_mode = USER_NAMESPACE_PICK;
1021 arg_uid_shift = UID_INVALID;
1022 arg_uid_range = UINT32_C(0x10000);
1023
1024 arg_settings_mask |= SETTING_USERNS;
1025 }
1026
1027 break;
1028
1029 case ARG_PRIVATE_USERS_CHOWN:
1030 arg_userns_chown = true;
1031
1032 arg_settings_mask |= SETTING_USERNS;
1033 break;
1034
1035 case ARG_KILL_SIGNAL:
1036 if (streq(optarg, "help")) {
1037 DUMP_STRING_TABLE(signal, int, _NSIG);
1038 return 0;
1039 }
1040
1041 arg_kill_signal = signal_from_string(optarg);
1042 if (arg_kill_signal < 0)
1043 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1044 "Cannot parse signal: %s", optarg);
1045
1046 arg_settings_mask |= SETTING_KILL_SIGNAL;
1047 break;
1048
1049 case ARG_SETTINGS:
1050
1051 /* no → do not read files
1052 * yes → read files, do not override cmdline, trust only subset
1053 * override → read files, override cmdline, trust only subset
1054 * trusted → read files, do not override cmdline, trust all
1055 */
1056
1057 r = parse_boolean(optarg);
1058 if (r < 0) {
1059 if (streq(optarg, "trusted")) {
1060 mask_all_settings = false;
1061 mask_no_settings = false;
1062 arg_settings_trusted = true;
1063
1064 } else if (streq(optarg, "override")) {
1065 mask_all_settings = false;
1066 mask_no_settings = true;
1067 arg_settings_trusted = -1;
1068 } else
1069 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1070 } else if (r > 0) {
1071 /* yes */
1072 mask_all_settings = false;
1073 mask_no_settings = false;
1074 arg_settings_trusted = -1;
1075 } else {
1076 /* no */
1077 mask_all_settings = true;
1078 mask_no_settings = false;
1079 arg_settings_trusted = false;
1080 }
1081
1082 break;
1083
1084 case ARG_CHDIR:
1085 if (!path_is_absolute(optarg))
1086 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1087 "Working directory %s is not an absolute path.", optarg);
1088
1089 r = free_and_strdup(&arg_chdir, optarg);
1090 if (r < 0)
1091 return log_oom();
1092
1093 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1094 break;
1095
1096 case ARG_PIVOT_ROOT:
1097 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1098 if (r < 0)
1099 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1100
1101 arg_settings_mask |= SETTING_PIVOT_ROOT;
1102 break;
1103
1104 case ARG_NOTIFY_READY:
1105 r = parse_boolean(optarg);
1106 if (r < 0)
1107 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1108 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1109 arg_notify_ready = r;
1110 arg_settings_mask |= SETTING_NOTIFY_READY;
1111 break;
1112
1113 case ARG_ROOT_HASH: {
1114 void *k;
1115 size_t l;
1116
1117 r = unhexmem(optarg, strlen(optarg), &k, &l);
1118 if (r < 0)
1119 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1120 if (l < sizeof(sd_id128_t)) {
1121 log_error("Root hash must be at least 128bit long: %s", optarg);
1122 free(k);
1123 return -EINVAL;
1124 }
1125
1126 free(arg_root_hash);
1127 arg_root_hash = k;
1128 arg_root_hash_size = l;
1129 break;
1130 }
1131
1132 case ARG_SYSTEM_CALL_FILTER: {
1133 bool negative;
1134 const char *items;
1135
1136 negative = optarg[0] == '~';
1137 items = negative ? optarg + 1 : optarg;
1138
1139 for (;;) {
1140 _cleanup_free_ char *word = NULL;
1141
1142 r = extract_first_word(&items, &word, NULL, 0);
1143 if (r == 0)
1144 break;
1145 if (r == -ENOMEM)
1146 return log_oom();
1147 if (r < 0)
1148 return log_error_errno(r, "Failed to parse system call filter: %m");
1149
1150 if (negative)
1151 r = strv_extend(&arg_syscall_blacklist, word);
1152 else
1153 r = strv_extend(&arg_syscall_whitelist, word);
1154 if (r < 0)
1155 return log_oom();
1156 }
1157
1158 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1159 break;
1160 }
1161
1162 case ARG_RLIMIT: {
1163 const char *eq;
1164 char *name;
1165 int rl;
1166
1167 if (streq(optarg, "help")) {
1168 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1169 return 0;
1170 }
1171
1172 eq = strchr(optarg, '=');
1173 if (!eq)
1174 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1175 "--rlimit= expects an '=' assignment.");
1176
1177 name = strndup(optarg, eq - optarg);
1178 if (!name)
1179 return log_oom();
1180
1181 rl = rlimit_from_string_harder(name);
1182 if (rl < 0)
1183 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1184 "Unknown resource limit: %s", name);
1185
1186 if (!arg_rlimit[rl]) {
1187 arg_rlimit[rl] = new0(struct rlimit, 1);
1188 if (!arg_rlimit[rl])
1189 return log_oom();
1190 }
1191
1192 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1193 if (r < 0)
1194 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1195
1196 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1197 break;
1198 }
1199
1200 case ARG_OOM_SCORE_ADJUST:
1201 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1202 if (r < 0)
1203 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1204
1205 arg_oom_score_adjust_set = true;
1206 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1207 break;
1208
1209 case ARG_CPU_AFFINITY: {
1210 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1211
1212 r = parse_cpu_set(optarg, &cpuset);
1213 if (r < 0)
1214 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1215
1216 if (arg_cpuset)
1217 CPU_FREE(arg_cpuset);
1218
1219 arg_cpuset = TAKE_PTR(cpuset);
1220 arg_cpuset_ncpus = r;
1221 arg_settings_mask |= SETTING_CPU_AFFINITY;
1222 break;
1223 }
1224
1225 case ARG_RESOLV_CONF:
1226 if (streq(optarg, "help")) {
1227 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1228 return 0;
1229 }
1230
1231 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1232 if (arg_resolv_conf < 0)
1233 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1234 "Failed to parse /etc/resolv.conf mode: %s", optarg);
1235
1236 arg_settings_mask |= SETTING_RESOLV_CONF;
1237 break;
1238
1239 case ARG_TIMEZONE:
1240 if (streq(optarg, "help")) {
1241 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1242 return 0;
1243 }
1244
1245 arg_timezone = timezone_mode_from_string(optarg);
1246 if (arg_timezone < 0)
1247 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1248 "Failed to parse /etc/localtime mode: %s", optarg);
1249
1250 arg_settings_mask |= SETTING_TIMEZONE;
1251 break;
1252
1253 case '?':
1254 return -EINVAL;
1255
1256 default:
1257 assert_not_reached("Unhandled option");
1258 }
1259
1260 if (argc > optind) {
1261 strv_free(arg_parameters);
1262 arg_parameters = strv_copy(argv + optind);
1263 if (!arg_parameters)
1264 return log_oom();
1265
1266 arg_settings_mask |= SETTING_START_MODE;
1267 }
1268
1269 if (arg_ephemeral && arg_template && !arg_directory)
1270 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1271 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1272 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1273 * --directory=". */
1274 arg_directory = TAKE_PTR(arg_template);
1275
1276 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1277
1278 /* Load all settings from .nspawn files */
1279 if (mask_no_settings)
1280 arg_settings_mask = 0;
1281
1282 /* Don't load any settings from .nspawn files */
1283 if (mask_all_settings)
1284 arg_settings_mask = _SETTINGS_MASK_ALL;
1285
1286 return 1;
1287 }
1288
1289 static int verify_arguments(void) {
1290 int r;
1291
1292 if (arg_userns_mode != USER_NAMESPACE_NO)
1293 arg_mount_settings |= MOUNT_USE_USERNS;
1294
1295 if (arg_private_network)
1296 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1297
1298 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1299 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1300 arg_register = false;
1301 if (arg_start_mode != START_PID1)
1302 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1303 }
1304
1305 if (arg_userns_mode == USER_NAMESPACE_PICK)
1306 arg_userns_chown = true;
1307
1308 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1309 arg_kill_signal = SIGRTMIN+3;
1310
1311 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1312 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1313 * The latter is not technically a user session, but we don't need to labour the point. */
1314 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1315
1316 if (arg_directory && arg_image)
1317 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1318
1319 if (arg_template && arg_image)
1320 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1321
1322 if (arg_template && !(arg_directory || arg_machine))
1323 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1324
1325 if (arg_ephemeral && arg_template)
1326 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1327
1328 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1329 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1330
1331 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1332 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1333
1334 if (arg_userns_chown && arg_read_only)
1335 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
1336
1337 /* If --network-namespace-path is given with any other network-related option,
1338 * we need to error out, to avoid conflicts between different network options. */
1339 if (arg_network_namespace_path &&
1340 (arg_network_interfaces || arg_network_macvlan ||
1341 arg_network_ipvlan || arg_network_veth_extra ||
1342 arg_network_bridge || arg_network_zone ||
1343 arg_network_veth || arg_private_network))
1344 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path cannot be combined with other network options.");
1345
1346 if (arg_network_bridge && arg_network_zone)
1347 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-bridge= and --network-zone= may not be combined.");
1348
1349 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1350 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1351
1352 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1353 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1354
1355 if (arg_volatile_mode != VOLATILE_NO && arg_read_only)
1356 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1357
1358 if (arg_expose_ports && !arg_private_network)
1359 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1360
1361 #if ! HAVE_LIBIPTC
1362 if (arg_expose_ports)
1363 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--port= is not supported, compiled without libiptc support.");
1364 #endif
1365
1366 r = custom_mount_check_all();
1367 if (r < 0)
1368 return r;
1369
1370 return 0;
1371 }
1372
1373 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1374 assert(p);
1375
1376 if (arg_userns_mode == USER_NAMESPACE_NO)
1377 return 0;
1378
1379 if (uid == UID_INVALID && gid == GID_INVALID)
1380 return 0;
1381
1382 if (uid != UID_INVALID) {
1383 uid += arg_uid_shift;
1384
1385 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1386 return -EOVERFLOW;
1387 }
1388
1389 if (gid != GID_INVALID) {
1390 gid += (gid_t) arg_uid_shift;
1391
1392 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1393 return -EOVERFLOW;
1394 }
1395
1396 if (lchown(p, uid, gid) < 0)
1397 return -errno;
1398
1399 return 0;
1400 }
1401
/* Creates the directory 'path' below 'root' with the given mode and hands it to the (shifted) uid/gid
 * via userns_lchown(). A directory that already exists is left untouched, including its ownership,
 * and counts as success. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full = prefix_roota(root, path);
        int r = mkdir_errno_wrapper(full, mode);

        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1415
/* Matches 'path' against the two zoneinfo prefixes we accept for /etc/localtime symlink targets (the
 * relative and the absolute one) and returns whatever PATH_STARTSWITH_SET() yields for it. Callers in
 * this file treat NULL as "does not point into the zoneinfo tree" and use a non-NULL result as the
 * zone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1422
/* Makes the host's timezone configuration available inside the container tree rooted at 'dest',
 * following the mode selected in arg_timezone. Most failures are only logged and 0 is returned; the
 * one result that is passed through to the caller is that of the read-only remount in the bind mount
 * path. */
static int setup_timezone(const char *dest) {
        _cleanup_free_ char *p = NULL, *etc = NULL;
        const char *where, *check;
        TimezoneMode m;
        int r;

        assert(dest);

        /* Determine the effective mode 'm'. The host's /etc/localtime symlink is only read for the
         * "auto" and "symlink" modes; its target is stored in 'p'. In "auto" mode a read-only tree
         * that is not made writable through a volatile mode gets the non-modifying variant
         * (off/bind), everything else the modifying one (delete/copy/symlink). */
        if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
                r = readlink_malloc("/etc/localtime", &p);
                if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
                        m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_OFF : TIMEZONE_DELETE;
                else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
                        m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_BIND : TIMEZONE_COPY;
                else if (r < 0) {
                        log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
                        /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
                         * file.
                         *
                         * Example:
                         * ln -s /usr/share/zoneinfo/UTC /etc/localtime
                         */
                        return 0;
                } else if (arg_timezone == TIMEZONE_AUTO)
                        m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? TIMEZONE_BIND : TIMEZONE_SYMLINK;
                else
                        m = arg_timezone;
        } else
                m = arg_timezone;

        if (m == TIMEZONE_OFF)
                return 0;

        /* Resolve /etc inside the container tree, so that 'where' names the container's
         * /etc/localtime. */
        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
        if (r < 0) {
                log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                return 0;
        }

        where = strjoina(etc, "/localtime");

        /* The cases below are ordered so that a mode that cannot be applied falls through to the next
         * one: symlink -> bind -> copy. A 'break' leads to the ownership fix-up at the end. */
        switch (m) {

        case TIMEZONE_DELETE:
                if (unlink(where) < 0)
                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);

                return 0;

        case TIMEZONE_SYMLINK: {
                _cleanup_free_ char *q = NULL;
                const char *z, *what;

                /* 'z' is the zone name taken from the host's symlink target */
                z = timezone_from_path(p);
                if (!z) {
                        log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
                        return 0;
                }

                r = readlink_malloc(where, &q);
                if (r >= 0 && streq_ptr(timezone_from_path(q), z))
                        return 0; /* Already pointing to the right place? Then do nothing .. */

                /* Only create the symlink if the zone file can be resolved inside the container;
                 * otherwise fall through to the bind mount logic below. */
                check = strjoina(dest, "/usr/share/zoneinfo/", z);
                r = chase_symlinks(check, dest, 0, NULL);
                if (r < 0)
                        log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                else {
                        if (unlink(where) < 0 && errno != ENOENT) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
                                               errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
                                return 0;
                        }

                        what = strjoina("../usr/share/zoneinfo/", z);
                        if (symlink(what, where) < 0) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
                                               errno, "Failed to correct timezone of container, ignoring: %m");
                                return 0;
                        }

                        break;
                }

                _fallthrough_;
        }

        case TIMEZONE_BIND: {
                _cleanup_free_ char *resolved = NULL;
                int found;

                /* Follow symlinks at the destination within the container tree; a missing final
                 * component is accepted (CHASE_NONEXISTENT) and created as mount target below. */
                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
                if (found < 0) {
                        log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                        return 0;
                }

                if (found == 0) /* missing? */
                        (void) touch(resolved);

                /* Bind mount the host file, then remount it read-only. Only if the first mount fails
                 * do we continue with the copy fallback. */
                r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
                if (r >= 0)
                        return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);

                _fallthrough_;
        }

        case TIMEZONE_COPY:
                /* If mounting failed, try to copy */
                r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
                if (r < 0) {
                        log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to copy /etc/localtime to %s, ignoring: %m", where);
                        return 0;
                }

                break;

        default:
                assert_not_reached("unexpected mode");
        }

        /* Fix permissions of the symlink or file copy we just created */
        r = userns_lchown(where, 0, 0);
        if (r < 0)
                log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");

        return 0;
}
1552
/* Tells whether 'path' exists, as determined by access(F_OK): 1 if it does, 0 if it is missing
 * (ENOENT). Any other access() failure is logged at debug level and the result of log_debug_errno()
 * is returned. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1565
1566 static int resolved_listening(void) {
1567 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
1568 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1569 _cleanup_free_ char *dns_stub_listener_mode = NULL;
1570 int r;
1571
1572 /* Check if resolved is listening */
1573
1574 r = sd_bus_open_system(&bus);
1575 if (r < 0)
1576 return log_debug_errno(r, "Failed to open system bus: %m");
1577
1578 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1579 if (r < 0)
1580 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1581 if (r == 0)
1582 return 0;
1583
1584 r = sd_bus_get_property_string(bus,
1585 "org.freedesktop.resolve1",
1586 "/org/freedesktop/resolve1",
1587 "org.freedesktop.resolve1.Manager",
1588 "DNSStubListener",
1589 &error,
1590 &dns_stub_listener_mode);
1591 if (r < 0)
1592 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
1593
1594 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
1595 }
1596
1597 static int setup_resolv_conf(const char *dest) {
1598 _cleanup_free_ char *etc = NULL;
1599 const char *where, *what;
1600 ResolvConfMode m;
1601 int r;
1602
1603 assert(dest);
1604
1605 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1606 if (arg_private_network)
1607 m = RESOLV_CONF_OFF;
1608 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1609 m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC;
1610 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1611 m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
1612 else
1613 m = arg_read_only && IN_SET(arg_volatile_mode, VOLATILE_NO, VOLATILE_STATE) ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
1614 } else
1615 m = arg_resolv_conf;
1616
1617 if (m == RESOLV_CONF_OFF)
1618 return 0;
1619
1620 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1621 if (r < 0) {
1622 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1623 return 0;
1624 }
1625
1626 where = strjoina(etc, "/resolv.conf");
1627
1628 if (m == RESOLV_CONF_DELETE) {
1629 if (unlink(where) < 0)
1630 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1631
1632 return 0;
1633 }
1634
1635 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1636 what = STATIC_RESOLV_CONF;
1637 else
1638 what = "/etc/resolv.conf";
1639
1640 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1641 _cleanup_free_ char *resolved = NULL;
1642 int found;
1643
1644 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1645 if (found < 0) {
1646 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1647 return 0;
1648 }
1649
1650 if (found == 0) /* missing? */
1651 (void) touch(resolved);
1652
1653 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
1654 if (r >= 0)
1655 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1656 }
1657
1658 /* If that didn't work, let's copy the file */
1659 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
1660 if (r < 0) {
1661 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1662 * resolved or something similar runs inside and the symlink points there.
1663 *
1664 * If the disk image is read-only, there's also no point in complaining.
1665 */
1666 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1667 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1668 return 0;
1669 }
1670
1671 r = userns_lchown(where, 0, 0);
1672 if (r < 0)
1673 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1674
1675 return 0;
1676 }
1677
1678 static int setup_boot_id(void) {
1679 _cleanup_(unlink_and_freep) char *from = NULL;
1680 _cleanup_free_ char *path = NULL;
1681 sd_id128_t rnd = SD_ID128_NULL;
1682 const char *to;
1683 int r;
1684
1685 /* Generate a new randomized boot ID, so that each boot-up of
1686 * the container gets a new one */
1687
1688 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1689 if (r < 0)
1690 return log_error_errno(r, "Failed to generate random boot ID path: %m");
1691
1692 r = sd_id128_randomize(&rnd);
1693 if (r < 0)
1694 return log_error_errno(r, "Failed to generate random boot id: %m");
1695
1696 r = id128_write(path, ID128_UUID, rnd, false);
1697 if (r < 0)
1698 return log_error_errno(r, "Failed to write boot id: %m");
1699
1700 from = TAKE_PTR(path);
1701 to = "/proc/sys/kernel/random/boot_id";
1702
1703 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1704 if (r < 0)
1705 return r;
1706
1707 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
1708 }
1709
1710 static int copy_devnodes(const char *dest) {
1711 static const char devnodes[] =
1712 "null\0"
1713 "zero\0"
1714 "full\0"
1715 "random\0"
1716 "urandom\0"
1717 "tty\0"
1718 "net/tun\0";
1719
1720 const char *d;
1721 int r = 0;
1722 _cleanup_umask_ mode_t u;
1723
1724 assert(dest);
1725
1726 u = umask(0000);
1727
1728 /* Create /dev/net, so that we can create /dev/net/tun in it */
1729 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1730 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1731
1732 NULSTR_FOREACH(d, devnodes) {
1733 _cleanup_free_ char *from = NULL, *to = NULL;
1734 struct stat st;
1735
1736 from = strappend("/dev/", d);
1737 if (!from)
1738 return log_oom();
1739
1740 to = prefix_root(dest, from);
1741 if (!to)
1742 return log_oom();
1743
1744 if (stat(from, &st) < 0) {
1745
1746 if (errno != ENOENT)
1747 return log_error_errno(errno, "Failed to stat %s: %m", from);
1748
1749 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1750 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1751 "%s is not a char or block device, cannot copy.", from);
1752 else {
1753 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1754
1755 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1756 /* Explicitly warn the user when /dev is already populated. */
1757 if (errno == EEXIST)
1758 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
1759 if (errno != EPERM)
1760 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1761
1762 /* Some systems abusively restrict mknod but allow bind mounts. */
1763 r = touch(to);
1764 if (r < 0)
1765 return log_error_errno(r, "touch (%s) failed: %m", to);
1766 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1767 if (r < 0)
1768 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1769 }
1770
1771 r = userns_lchown(to, 0, 0);
1772 if (r < 0)
1773 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1774
1775 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1776 if (!dn)
1777 return log_oom();
1778
1779 r = userns_mkdir(dest, dn, 0755, 0, 0);
1780 if (r < 0)
1781 return log_error_errno(r, "Failed to create '%s': %m", dn);
1782
1783 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1784 return log_oom();
1785
1786 prefixed = prefix_root(dest, sl);
1787 if (!prefixed)
1788 return log_oom();
1789
1790 t = strjoin("../", d);
1791 if (!t)
1792 return log_oom();
1793
1794 if (symlink(t, prefixed) < 0)
1795 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
1796 }
1797 }
1798
1799 return r;
1800 }
1801
1802 static int setup_pts(const char *dest) {
1803 _cleanup_free_ char *options = NULL;
1804 const char *p;
1805 int r;
1806
1807 #if HAVE_SELINUX
1808 if (arg_selinux_apifs_context)
1809 (void) asprintf(&options,
1810 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1811 arg_uid_shift + TTY_GID,
1812 arg_selinux_apifs_context);
1813 else
1814 #endif
1815 (void) asprintf(&options,
1816 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1817 arg_uid_shift + TTY_GID);
1818
1819 if (!options)
1820 return log_oom();
1821
1822 /* Mount /dev/pts itself */
1823 p = prefix_roota(dest, "/dev/pts");
1824 r = mkdir_errno_wrapper(p, 0755);
1825 if (r < 0)
1826 return log_error_errno(r, "Failed to create /dev/pts: %m");
1827
1828 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1829 if (r < 0)
1830 return r;
1831 r = userns_lchown(p, 0, 0);
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1834
1835 /* Create /dev/ptmx symlink */
1836 p = prefix_roota(dest, "/dev/ptmx");
1837 if (symlink("pts/ptmx", p) < 0)
1838 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1839 r = userns_lchown(p, 0, 0);
1840 if (r < 0)
1841 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1842
1843 /* And fix /dev/pts/ptmx ownership */
1844 p = prefix_roota(dest, "/dev/pts/ptmx");
1845 r = userns_lchown(p, 0, 0);
1846 if (r < 0)
1847 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1848
1849 return 0;
1850 }
1851
1852 static int setup_dev_console(const char *dest, const char *console) {
1853 _cleanup_umask_ mode_t u;
1854 const char *to;
1855 int r;
1856
1857 assert(dest);
1858 assert(console);
1859
1860 u = umask(0000);
1861
1862 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1863 if (r < 0)
1864 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1865
1866 /* We need to bind mount the right tty to /dev/console since
1867 * ptys can only exist on pts file systems. To have something
1868 * to bind mount things on we create a empty regular file. */
1869
1870 to = prefix_roota(dest, "/dev/console");
1871 r = touch(to);
1872 if (r < 0)
1873 return log_error_errno(r, "touch() for /dev/console failed: %m");
1874
1875 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
1876 }
1877
1878 static int setup_keyring(void) {
1879 key_serial_t keyring;
1880
1881 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1882 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1883 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1884 * these system calls let's make sure we don't leak anything into the container. */
1885
1886 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1887 if (keyring == -1) {
1888 if (errno == ENOSYS)
1889 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1890 else if (IN_SET(errno, EACCES, EPERM))
1891 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1892 else
1893 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1894 }
1895
1896 return 0;
1897 }
1898
1899 static int setup_kmsg(int kmsg_socket) {
1900 _cleanup_(unlink_and_freep) char *from = NULL;
1901 _cleanup_free_ char *fifo = NULL;
1902 _cleanup_close_ int fd = -1;
1903 _cleanup_umask_ mode_t u;
1904 const char *to;
1905 int r;
1906
1907 assert(kmsg_socket >= 0);
1908
1909 u = umask(0000);
1910
1911 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1912 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1913 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1914 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1915
1916 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1917 if (r < 0)
1918 return log_error_errno(r, "Failed to generate kmsg path: %m");
1919
1920 if (mkfifo(fifo, 0600) < 0)
1921 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1922
1923 from = TAKE_PTR(fifo);
1924 to = "/proc/kmsg";
1925
1926 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1927 if (r < 0)
1928 return r;
1929
1930 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
1931 if (fd < 0)
1932 return log_error_errno(errno, "Failed to open fifo: %m");
1933
1934 /* Store away the fd in the socket, so that it stays open as long as we run the child */
1935 r = send_one_fd(kmsg_socket, fd, 0);
1936 if (r < 0)
1937 return log_error_errno(r, "Failed to send FIFO fd: %m");
1938
1939 return 0;
1940 }
1941
1942 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1943 union in_addr_union *exposed = userdata;
1944
1945 assert(rtnl);
1946 assert(m);
1947 assert(exposed);
1948
1949 expose_port_execute(rtnl, arg_expose_ports, exposed);
1950 return 0;
1951 }
1952
1953 static int setup_hostname(void) {
1954 int r;
1955
1956 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
1957 return 0;
1958
1959 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1960 if (r < 0)
1961 return log_error_errno(r, "Failed to set hostname: %m");
1962
1963 return 0;
1964 }
1965
1966 static int setup_journal(const char *directory) {
1967 _cleanup_free_ char *d = NULL;
1968 const char *dirname, *p, *q;
1969 sd_id128_t this_id;
1970 char id[33];
1971 bool try;
1972 int r;
1973
1974 /* Don't link journals in ephemeral mode */
1975 if (arg_ephemeral)
1976 return 0;
1977
1978 if (arg_link_journal == LINK_NO)
1979 return 0;
1980
1981 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1982
1983 r = sd_id128_get_machine(&this_id);
1984 if (r < 0)
1985 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1986
1987 if (sd_id128_equal(arg_uuid, this_id)) {
1988 log_full(try ? LOG_WARNING : LOG_ERR,
1989 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
1990 if (try)
1991 return 0;
1992 return -EEXIST;
1993 }
1994
1995 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
1996 r = userns_mkdir(directory, dirname, 0755, 0, 0);
1997 if (r < 0) {
1998 bool ignore = r == -EROFS && try;
1999 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2000 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2001 return ignore ? 0 : r;
2002 }
2003 }
2004
2005 (void) sd_id128_to_string(arg_uuid, id);
2006
2007 p = strjoina("/var/log/journal/", id);
2008 q = prefix_roota(directory, p);
2009
2010 if (path_is_mount_point(p, NULL, 0) > 0) {
2011 if (try)
2012 return 0;
2013
2014 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2015 "%s: already a mount point, refusing to use for journal", p);
2016 }
2017
2018 if (path_is_mount_point(q, NULL, 0) > 0) {
2019 if (try)
2020 return 0;
2021
2022 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2023 "%s: already a mount point, refusing to use for journal", q);
2024 }
2025
2026 r = readlink_and_make_absolute(p, &d);
2027 if (r >= 0) {
2028 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2029 path_equal(d, q)) {
2030
2031 r = userns_mkdir(directory, p, 0755, 0, 0);
2032 if (r < 0)
2033 log_warning_errno(r, "Failed to create directory %s: %m", q);
2034 return 0;
2035 }
2036
2037 if (unlink(p) < 0)
2038 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2039 } else if (r == -EINVAL) {
2040
2041 if (arg_link_journal == LINK_GUEST &&
2042 rmdir(p) < 0) {
2043
2044 if (errno == ENOTDIR) {
2045 log_error("%s already exists and is neither a symlink nor a directory", p);
2046 return r;
2047 } else
2048 return log_error_errno(errno, "Failed to remove %s: %m", p);
2049 }
2050 } else if (r != -ENOENT)
2051 return log_error_errno(r, "readlink(%s) failed: %m", p);
2052
2053 if (arg_link_journal == LINK_GUEST) {
2054
2055 if (symlink(q, p) < 0) {
2056 if (try) {
2057 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2058 return 0;
2059 } else
2060 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2061 }
2062
2063 r = userns_mkdir(directory, p, 0755, 0, 0);
2064 if (r < 0)
2065 log_warning_errno(r, "Failed to create directory %s: %m", q);
2066 return 0;
2067 }
2068
2069 if (arg_link_journal == LINK_HOST) {
2070 /* don't create parents here — if the host doesn't have
2071 * permanent journal set up, don't force it here */
2072
2073 r = mkdir_errno_wrapper(p, 0755);
2074 if (r < 0 && r != -EEXIST) {
2075 if (try) {
2076 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2077 return 0;
2078 } else
2079 return log_error_errno(r, "Failed to create %s: %m", p);
2080 }
2081
2082 } else if (access(p, F_OK) < 0)
2083 return 0;
2084
2085 if (dir_is_empty(q) == 0)
2086 log_warning("%s is not empty, proceeding anyway.", q);
2087
2088 r = userns_mkdir(directory, p, 0755, 0, 0);
2089 if (r < 0)
2090 return log_error_errno(r, "Failed to create %s: %m", q);
2091
2092 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2093 if (r < 0)
2094 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2095
2096 return 0;
2097 }
2098
2099 static int drop_capabilities(void) {
2100 return capability_bounding_set_drop(arg_caps_retain, false);
2101 }
2102
2103 static int reset_audit_loginuid(void) {
2104 _cleanup_free_ char *p = NULL;
2105 int r;
2106
2107 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2108 return 0;
2109
2110 r = read_one_line_file("/proc/self/loginuid", &p);
2111 if (r == -ENOENT)
2112 return 0;
2113 if (r < 0)
2114 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2115
2116 /* Already reset? */
2117 if (streq(p, "4294967295"))
2118 return 0;
2119
2120 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2121 if (r < 0) {
2122 log_error_errno(r,
2123 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2124 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2125 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2126 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2127 "using systemd-nspawn. Sleeping for 5s... (%m)");
2128
2129 sleep(5);
2130 }
2131
2132 return 0;
2133 }
2134
2135 static int setup_propagate(const char *root) {
2136 const char *p, *q;
2137 int r;
2138
2139 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2140 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2141 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2142 (void) mkdir_p(p, 0600);
2143
2144 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2145 if (r < 0)
2146 return log_error_errno(r, "Failed to create /run/systemd: %m");
2147
2148 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2149 if (r < 0)
2150 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
2151
2152 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2153 if (r < 0)
2154 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
2155
2156 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
2157 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2158 if (r < 0)
2159 return r;
2160
2161 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2162 if (r < 0)
2163 return r;
2164
2165 /* machined will MS_MOVE into that directory, and that's only
2166 * supported for non-shared mounts. */
2167 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2168 }
2169
2170 static int setup_machine_id(const char *directory) {
2171 const char *etc_machine_id;
2172 sd_id128_t id;
2173 int r;
2174
2175 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2176 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2177 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2178 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2179 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2180 * container behaves nicely). */
2181
2182 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2183
2184 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2185 if (r < 0) {
2186 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2187 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2188
2189 if (sd_id128_is_null(arg_uuid)) {
2190 r = sd_id128_randomize(&arg_uuid);
2191 if (r < 0)
2192 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2193 }
2194 } else {
2195 if (sd_id128_is_null(id))
2196 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2197 "Machine ID in container image is zero, refusing.");
2198
2199 arg_uuid = id;
2200 }
2201
2202 return 0;
2203 }
2204
2205 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2206 int r;
2207
2208 assert(directory);
2209
2210 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2211 return 0;
2212
2213 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2214 if (r == -EOPNOTSUPP)
2215 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2216 if (r == -EBADE)
2217 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2220 if (r == 0)
2221 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2222 else
2223 log_debug("Patched directory tree to match UID/GID range.");
2224
2225 return r;
2226 }
2227
2228 /*
2229 * Return values:
2230 * < 0 : wait_for_terminate() failed to get the state of the
2231 * container, the container was terminated by a signal, or
2232 * failed for an unknown reason. No change is made to the
2233 * container argument.
2234 * > 0 : The program executed in the container terminated with an
2235 * error. The exit code of the program executed in the
2236 * container is returned. The container argument has been set
2237 * to CONTAINER_TERMINATED.
2238 * 0 : The container is being rebooted, has been shut down or exited
2239 * successfully. The container argument has been set to either
2240 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2241 *
2242 * That is, success is indicated by a return value of zero, and an
2243 * error is indicated by a non-zero value.
2244 */
2245 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2246 siginfo_t status;
2247 int r;
2248
2249 r = wait_for_terminate(pid, &status);
2250 if (r < 0)
2251 return log_warning_errno(r, "Failed to wait for container: %m");
2252
2253 switch (status.si_code) {
2254
2255 case CLD_EXITED:
2256 if (status.si_status == 0)
2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2258 else
2259 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2260
2261 *container = CONTAINER_TERMINATED;
2262 return status.si_status;
2263
2264 case CLD_KILLED:
2265 if (status.si_status == SIGINT) {
2266 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2267 *container = CONTAINER_TERMINATED;
2268 return 0;
2269
2270 } else if (status.si_status == SIGHUP) {
2271 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2272 *container = CONTAINER_REBOOTED;
2273 return 0;
2274 }
2275
2276 _fallthrough_;
2277 case CLD_DUMPED:
2278 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2279 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2280
2281 default:
2282 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2283 "Container %s failed due to unknown reason.", arg_machine);
2284 }
2285 }
2286
2287 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2288 pid_t pid;
2289
2290 pid = PTR_TO_PID(userdata);
2291 if (pid > 0) {
2292 if (kill(pid, arg_kill_signal) >= 0) {
2293 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2294 sd_event_source_set_userdata(s, NULL);
2295 return 0;
2296 }
2297 }
2298
2299 sd_event_exit(sd_event_source_get_event(s), 0);
2300 return 0;
2301 }
2302
2303 static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2304 pid_t pid;
2305
2306 assert(s);
2307 assert(ssi);
2308
2309 pid = PTR_TO_PID(userdata);
2310
2311 for (;;) {
2312 siginfo_t si = {};
2313
2314 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2315 return log_error_errno(errno, "Failed to waitid(): %m");
2316 if (si.si_pid == 0) /* No pending children. */
2317 break;
2318 if (si.si_pid == pid) {
2319 /* The main process we care for has exited. Return from
2320 * signal handler but leave the zombie. */
2321 sd_event_exit(sd_event_source_get_event(s), 0);
2322 break;
2323 }
2324
2325 /* Reap all other children. */
2326 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2327 }
2328
2329 return 0;
2330 }
2331
2332 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2333 pid_t pid;
2334
2335 assert(m);
2336
2337 pid = PTR_TO_PID(userdata);
2338
2339 if (arg_kill_signal > 0) {
2340 log_info("Container termination requested. Attempting to halt container.");
2341 (void) kill(pid, arg_kill_signal);
2342 } else {
2343 log_info("Container termination requested. Exiting.");
2344 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2345 }
2346
2347 return 0;
2348 }
2349
2350 static int determine_names(void) {
2351 int r;
2352
2353 if (arg_template && !arg_directory && arg_machine) {
2354
2355 /* If --template= was specified then we should not
2356 * search for a machine, but instead create a new one
2357 * in /var/lib/machine. */
2358
2359 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2360 if (!arg_directory)
2361 return log_oom();
2362 }
2363
2364 if (!arg_image && !arg_directory) {
2365 if (arg_machine) {
2366 _cleanup_(image_unrefp) Image *i = NULL;
2367
2368 r = image_find(IMAGE_MACHINE, arg_machine, &i);
2369 if (r == -ENOENT)
2370 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
2371 if (r < 0)
2372 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2373
2374 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
2375 r = free_and_strdup(&arg_image, i->path);
2376 else
2377 r = free_and_strdup(&arg_directory, i->path);
2378 if (r < 0)
2379 return log_oom();
2380
2381 if (!arg_ephemeral)
2382 arg_read_only = arg_read_only || i->read_only;
2383 } else {
2384 r = safe_getcwd(&arg_directory);
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to determine current directory: %m");
2387 }
2388
2389 if (!arg_directory && !arg_image) {
2390 log_error("Failed to determine path, please use -D or -i.");
2391 return -EINVAL;
2392 }
2393 }
2394
2395 if (!arg_machine) {
2396 if (arg_directory && path_equal(arg_directory, "/"))
2397 arg_machine = gethostname_malloc();
2398 else {
2399 if (arg_image) {
2400 char *e;
2401
2402 arg_machine = strdup(basename(arg_image));
2403
2404 /* Truncate suffix if there is one */
2405 e = endswith(arg_machine, ".raw");
2406 if (e)
2407 *e = 0;
2408 } else
2409 arg_machine = strdup(basename(arg_directory));
2410 }
2411 if (!arg_machine)
2412 return log_oom();
2413
2414 hostname_cleanup(arg_machine);
2415 if (!machine_name_is_valid(arg_machine)) {
2416 log_error("Failed to determine machine name automatically, please use -M.");
2417 return -EINVAL;
2418 }
2419
2420 if (arg_ephemeral) {
2421 char *b;
2422
2423 /* Add a random suffix when this is an
2424 * ephemeral machine, so that we can run many
2425 * instances at once without manually having
2426 * to specify -M each time. */
2427
2428 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2429 return log_oom();
2430
2431 free(arg_machine);
2432 arg_machine = b;
2433 }
2434 }
2435
2436 return 0;
2437 }
2438
/* Resolves the path stored in *p with chase_symlinks() and, on success, replaces *p by the resolved path
 * (freeing the old string). A NULL *p is left alone.
 *
 * Returns a negative errno-style value on failure, otherwise whatever chase_symlinks() returned (0 if *p
 * was NULL). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free_and_replace(*p, resolved);

        /* r might be an fd here in case we ever use CHASE_OPEN in flags */
        return r;
}
2455
2456 static int determine_uid_shift(const char *directory) {
2457 int r;
2458
2459 if (arg_userns_mode == USER_NAMESPACE_NO) {
2460 arg_uid_shift = 0;
2461 return 0;
2462 }
2463
2464 if (arg_uid_shift == UID_INVALID) {
2465 struct stat st;
2466
2467 r = stat(directory, &st);
2468 if (r < 0)
2469 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2470
2471 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2472
2473 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2474 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2475 "UID and GID base of %s don't match.", directory);
2476
2477 arg_uid_range = UINT32_C(0x10000);
2478 }
2479
2480 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2481 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2482 "UID base too high for UID range.");
2483
2484 return 0;
2485 }
2486
2487 static int inner_child(
2488 Barrier *barrier,
2489 const char *directory,
2490 bool secondary,
2491 int kmsg_socket,
2492 int rtnl_socket,
2493 FDSet *fds) {
2494
2495 _cleanup_free_ char *home = NULL;
2496 char as_uuid[37];
2497 size_t n_env = 1;
2498 const char *envp[] = {
2499 "PATH=" DEFAULT_PATH_COMPAT,
2500 NULL, /* container */
2501 NULL, /* TERM */
2502 NULL, /* HOME */
2503 NULL, /* USER */
2504 NULL, /* LOGNAME */
2505 NULL, /* container_uuid */
2506 NULL, /* LISTEN_FDS */
2507 NULL, /* LISTEN_PID */
2508 NULL, /* NOTIFY_SOCKET */
2509 NULL
2510 };
2511 const char *exec_target;
2512 _cleanup_strv_free_ char **env_use = NULL;
2513 int r;
2514
2515 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2516 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2517 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2518 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2519 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2520 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2521 * namespace.
2522 *
2523 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2524 * unshare(). See below. */
2525
2526 assert(barrier);
2527 assert(directory);
2528 assert(kmsg_socket >= 0);
2529
2530 if (arg_userns_mode != USER_NAMESPACE_NO) {
2531 /* Tell the parent, that it now can write the UID map. */
2532 (void) barrier_place(barrier); /* #1 */
2533
2534 /* Wait until the parent wrote the UID map */
2535 if (!barrier_place_and_sync(barrier)) /* #2 */
2536 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2537 "Parent died too early");
2538 }
2539
2540 r = reset_uid_gid();
2541 if (r < 0)
2542 return log_error_errno(r, "Couldn't become new root: %m");
2543
2544 r = mount_all(NULL,
2545 arg_mount_settings | MOUNT_IN_USERNS,
2546 arg_uid_shift,
2547 arg_selinux_apifs_context);
2548 if (r < 0)
2549 return r;
2550
2551 if (!arg_network_namespace_path && arg_private_network) {
2552 r = unshare(CLONE_NEWNET);
2553 if (r < 0)
2554 return log_error_errno(errno, "Failed to unshare network namespace: %m");
2555
2556 /* Tell the parent that it can setup network interfaces. */
2557 (void) barrier_place(barrier); /* #3 */
2558 }
2559
2560 r = mount_sysfs(NULL, arg_mount_settings);
2561 if (r < 0)
2562 return r;
2563
2564 /* Wait until we are cgroup-ified, so that we
2565 * can mount the right cgroup path writable */
2566 if (!barrier_place_and_sync(barrier)) /* #4 */
2567 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2568 "Parent died too early");
2569
2570 if (arg_use_cgns) {
2571 r = unshare(CLONE_NEWCGROUP);
2572 if (r < 0)
2573 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
2574 r = mount_cgroups(
2575 "",
2576 arg_unified_cgroup_hierarchy,
2577 arg_userns_mode != USER_NAMESPACE_NO,
2578 arg_uid_shift,
2579 arg_uid_range,
2580 arg_selinux_apifs_context,
2581 true);
2582 if (r < 0)
2583 return r;
2584 } else {
2585 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2586 if (r < 0)
2587 return r;
2588 }
2589
2590 r = setup_boot_id();
2591 if (r < 0)
2592 return r;
2593
2594 r = setup_kmsg(kmsg_socket);
2595 if (r < 0)
2596 return r;
2597 kmsg_socket = safe_close(kmsg_socket);
2598
2599 if (setsid() < 0)
2600 return log_error_errno(errno, "setsid() failed: %m");
2601
2602 if (arg_private_network)
2603 loopback_setup();
2604
2605 if (arg_expose_ports) {
2606 r = expose_port_send_rtnl(rtnl_socket);
2607 if (r < 0)
2608 return r;
2609 rtnl_socket = safe_close(rtnl_socket);
2610 }
2611
2612 if (arg_oom_score_adjust_set) {
2613 r = set_oom_score_adjust(arg_oom_score_adjust);
2614 if (r < 0)
2615 return log_error_errno(r, "Failed to adjust OOM score: %m");
2616 }
2617
2618 if (arg_cpuset)
2619 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2620 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2621
2622 r = drop_capabilities();
2623 if (r < 0)
2624 return log_error_errno(r, "drop_capabilities() failed: %m");
2625
2626 (void) setup_hostname();
2627
2628 if (arg_personality != PERSONALITY_INVALID) {
2629 r = safe_personality(arg_personality);
2630 if (r < 0)
2631 return log_error_errno(r, "personality() failed: %m");
2632 } else if (secondary) {
2633 r = safe_personality(PER_LINUX32);
2634 if (r < 0)
2635 return log_error_errno(r, "personality() failed: %m");
2636 }
2637
2638 #if HAVE_SELINUX
2639 if (arg_selinux_context)
2640 if (setexeccon(arg_selinux_context) < 0)
2641 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2642 #endif
2643
2644 r = change_uid_gid(arg_user, &home);
2645 if (r < 0)
2646 return r;
2647
2648 if (arg_no_new_privileges)
2649 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2650 return log_error_errno(errno, "Failed to disable new privileges: %m");
2651
2652 /* LXC sets container=lxc, so follow the scheme here */
2653 envp[n_env++] = strjoina("container=", arg_container_service_name);
2654
2655 envp[n_env] = strv_find_prefix(environ, "TERM=");
2656 if (envp[n_env])
2657 n_env++;
2658
2659 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2660 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2661 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2662 return log_oom();
2663
2664 assert(!sd_id128_is_null(arg_uuid));
2665
2666 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
2667 return log_oom();
2668
2669 if (fdset_size(fds) > 0) {
2670 r = fdset_cloexec(fds, false);
2671 if (r < 0)
2672 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2673
2674 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2675 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2676 return log_oom();
2677 }
2678 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2679 return log_oom();
2680
2681 env_use = strv_env_merge(2, envp, arg_setenv);
2682 if (!env_use)
2683 return log_oom();
2684
2685 /* Let the parent know that we are ready and
2686 * wait until the parent is ready with the
2687 * setup, too... */
2688 if (!barrier_place_and_sync(barrier)) /* #5 */
2689 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2690 "Parent died too early");
2691
2692 if (arg_chdir)
2693 if (chdir(arg_chdir) < 0)
2694 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2695
2696 if (arg_start_mode == START_PID2) {
2697 r = stub_pid1(arg_uuid);
2698 if (r < 0)
2699 return r;
2700 }
2701
2702 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
2703 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
2704 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
2705 log_close();
2706 log_set_open_when_needed(true);
2707
2708 (void) fdset_close_others(fds);
2709
2710 if (arg_start_mode == START_BOOT) {
2711 char **a;
2712 size_t m;
2713
2714 /* Automatically search for the init system */
2715
2716 m = strv_length(arg_parameters);
2717 a = newa(char*, m + 2);
2718 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2719 a[1 + m] = NULL;
2720
2721 a[0] = (char*) "/usr/lib/systemd/systemd";
2722 execve(a[0], a, env_use);
2723
2724 a[0] = (char*) "/lib/systemd/systemd";
2725 execve(a[0], a, env_use);
2726
2727 a[0] = (char*) "/sbin/init";
2728 execve(a[0], a, env_use);
2729
2730 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
2731 } else if (!strv_isempty(arg_parameters)) {
2732 const char *dollar_path;
2733
2734 exec_target = arg_parameters[0];
2735
2736 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
2737 * binary. */
2738 dollar_path = strv_env_get(env_use, "PATH");
2739 if (dollar_path) {
2740 if (putenv((char*) dollar_path) != 0)
2741 return log_error_errno(errno, "Failed to update $PATH: %m");
2742 }
2743
2744 execvpe(arg_parameters[0], arg_parameters, env_use);
2745 } else {
2746 if (!arg_chdir)
2747 /* If we cannot change the directory, we'll end up in /, that is expected. */
2748 (void) chdir(home ?: "/root");
2749
2750 execle("/bin/bash", "-bash", NULL, env_use);
2751 execle("/bin/sh", "-sh", NULL, env_use);
2752
2753 exec_target = "/bin/bash, /bin/sh";
2754 }
2755
2756 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
2757 }
2758
2759 static int setup_sd_notify_child(void) {
2760 _cleanup_close_ int fd = -1;
2761 union sockaddr_union sa = {
2762 .un.sun_family = AF_UNIX,
2763 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
2764 };
2765 int r;
2766
2767 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2768 if (fd < 0)
2769 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2770
2771 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2772 (void) sockaddr_un_unlink(&sa.un);
2773
2774 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2775 if (r < 0)
2776 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
2777
2778 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2779 if (r < 0)
2780 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2781
2782 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
2783 if (r < 0)
2784 return log_error_errno(r, "SO_PASSCRED failed: %m");
2785
2786 return TAKE_FD(fd);
2787 }
2788
2789 static int outer_child(
2790 Barrier *barrier,
2791 const char *directory,
2792 const char *console,
2793 DissectedImage *dissected_image,
2794 bool interactive,
2795 bool secondary,
2796 int pid_socket,
2797 int uuid_socket,
2798 int notify_socket,
2799 int kmsg_socket,
2800 int rtnl_socket,
2801 int uid_shift_socket,
2802 int unified_cgroup_hierarchy_socket,
2803 FDSet *fds,
2804 int netns_fd) {
2805
2806 _cleanup_close_ int fd = -1;
2807 int r, which_failed;
2808 pid_t pid;
2809 ssize_t l;
2810
2811 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
2812 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
2813 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
2814 * initializations a second child (the "inner" one) is forked off it, and it exits. */
2815
2816 assert(barrier);
2817 assert(directory);
2818 assert(console);
2819 assert(pid_socket >= 0);
2820 assert(uuid_socket >= 0);
2821 assert(notify_socket >= 0);
2822 assert(kmsg_socket >= 0);
2823
2824 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2825 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2826
2827 if (interactive) {
2828 int terminal;
2829
2830 terminal = open_terminal(console, O_RDWR);
2831 if (terminal < 0)
2832 return log_error_errno(terminal, "Failed to open console: %m");
2833
2834 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
2835 r = log_dup_console();
2836 if (r < 0)
2837 return log_error_errno(r, "Failed to duplicate stderr: %m");
2838
2839 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2840 if (r < 0)
2841 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2842 }
2843
2844 r = reset_audit_loginuid();
2845 if (r < 0)
2846 return r;
2847
2848 /* Mark everything as slave, so that we still
2849 * receive mounts from the real root, but don't
2850 * propagate mounts to the real root. */
2851 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2852 if (r < 0)
2853 return r;
2854
2855 if (dissected_image) {
2856 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2857 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2858 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2859 * makes sure ESP partitions and userns are compatible. */
2860
2861 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2862 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
2863 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
2864 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2865 if (r < 0)
2866 return r;
2867 }
2868
2869 r = determine_uid_shift(directory);
2870 if (r < 0)
2871 return r;
2872
2873 if (arg_userns_mode != USER_NAMESPACE_NO) {
2874 /* Let the parent know which UID shift we read from the image */
2875 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2876 if (l < 0)
2877 return log_error_errno(errno, "Failed to send UID shift: %m");
2878 if (l != sizeof(arg_uid_shift))
2879 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2880 "Short write while sending UID shift.");
2881
2882 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2883 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2884 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2885 * not it will pick a different one, and send it back to us. */
2886
2887 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2888 if (l < 0)
2889 return log_error_errno(errno, "Failed to recv UID shift: %m");
2890 if (l != sizeof(arg_uid_shift))
2891 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2892 "Short read while receiving UID shift.");
2893 }
2894
2895 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
2896 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2897 }
2898
2899 if (dissected_image) {
2900 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2901 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2902 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2903 if (r < 0)
2904 return r;
2905 }
2906
2907 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2908 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2909
2910 r = detect_unified_cgroup_hierarchy_from_image(directory);
2911 if (r < 0)
2912 return r;
2913
2914 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2915 if (l < 0)
2916 return log_error_errno(errno, "Failed to send cgroup mode: %m");
2917 if (l != sizeof(arg_unified_cgroup_hierarchy))
2918 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2919 "Short write while sending cgroup mode.");
2920
2921 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2922 }
2923
2924 /* Turn directory into bind mount */
2925 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2926 if (r < 0)
2927 return r;
2928
2929 r = setup_pivot_root(
2930 directory,
2931 arg_pivot_root_new,
2932 arg_pivot_root_old);
2933 if (r < 0)
2934 return r;
2935
2936 r = setup_volatile_mode(
2937 directory,
2938 arg_volatile_mode,
2939 arg_userns_mode != USER_NAMESPACE_NO,
2940 arg_uid_shift,
2941 arg_uid_range,
2942 arg_selinux_context);
2943 if (r < 0)
2944 return r;
2945
2946 /* Mark everything as shared so our mounts get propagated down. This is
2947 * required to make new bind mounts available in systemd services
2948 * inside the containter that create a new mount namespace.
2949 * See https://github.com/systemd/systemd/issues/3860
2950 * Further submounts (such as /dev) done after this will inherit the
2951 * shared propagation mode. */
2952 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2953 if (r < 0)
2954 return r;
2955
2956 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2957 if (r < 0)
2958 return r;
2959
2960 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2961 if (r < 0)
2962 return r;
2963
2964 if (arg_read_only) {
2965 r = bind_remount_recursive(directory, true, NULL);
2966 if (r < 0)
2967 return log_error_errno(r, "Failed to make tree read-only: %m");
2968 }
2969
2970 r = mount_all(directory,
2971 arg_mount_settings,
2972 arg_uid_shift,
2973 arg_selinux_apifs_context);
2974 if (r < 0)
2975 return r;
2976
2977 r = copy_devnodes(directory);
2978 if (r < 0)
2979 return r;
2980
2981 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2982
2983 r = setup_pts(directory);
2984 if (r < 0)
2985 return r;
2986
2987 r = setup_propagate(directory);
2988 if (r < 0)
2989 return r;
2990
2991 r = setup_dev_console(directory, console);
2992 if (r < 0)
2993 return r;
2994
2995 r = setup_keyring();
2996 if (r < 0)
2997 return r;
2998
2999 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
3000 if (r < 0)
3001 return r;
3002
3003 r = setup_timezone(directory);
3004 if (r < 0)
3005 return r;
3006
3007 r = setup_resolv_conf(directory);
3008 if (r < 0)
3009 return r;
3010
3011 r = setup_machine_id(directory);
3012 if (r < 0)
3013 return r;
3014
3015 r = setup_journal(directory);
3016 if (r < 0)
3017 return r;
3018
3019 r = mount_custom(
3020 directory,
3021 arg_custom_mounts,
3022 arg_n_custom_mounts,
3023 arg_userns_mode != USER_NAMESPACE_NO,
3024 arg_uid_shift,
3025 arg_uid_range,
3026 arg_selinux_apifs_context);
3027 if (r < 0)
3028 return r;
3029
3030 if (!arg_use_cgns) {
3031 r = mount_cgroups(
3032 directory,
3033 arg_unified_cgroup_hierarchy,
3034 arg_userns_mode != USER_NAMESPACE_NO,
3035 arg_uid_shift,
3036 arg_uid_range,
3037 arg_selinux_apifs_context,
3038 false);
3039 if (r < 0)
3040 return r;
3041 }
3042
3043 r = mount_move_root(directory);
3044 if (r < 0)
3045 return log_error_errno(r, "Failed to move root directory: %m");
3046
3047 fd = setup_sd_notify_child();
3048 if (fd < 0)
3049 return fd;
3050
3051 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3052 if (r < 0)
3053 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3054
3055 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3056 arg_clone_ns_flags |
3057 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3058 if (pid < 0)
3059 return log_error_errno(errno, "Failed to fork inner child: %m");
3060 if (pid == 0) {
3061 pid_socket = safe_close(pid_socket);
3062 uuid_socket = safe_close(uuid_socket);
3063 notify_socket = safe_close(notify_socket);
3064 uid_shift_socket = safe_close(uid_shift_socket);
3065
3066 /* The inner child has all namespaces that are
3067 * requested, so that we all are owned by the user if
3068 * user namespaces are turned on. */
3069
3070 if (arg_network_namespace_path) {
3071 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3072 if (r < 0)
3073 return log_error_errno(r, "Failed to join network namespace: %m");
3074 }
3075
3076 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3077 if (r < 0)
3078 _exit(EXIT_FAILURE);
3079
3080 _exit(EXIT_SUCCESS);
3081 }
3082
3083 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3084 if (l < 0)
3085 return log_error_errno(errno, "Failed to send PID: %m");
3086 if (l != sizeof(pid))
3087 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3088 "Short write while sending PID.");
3089
3090 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3091 if (l < 0)
3092 return log_error_errno(errno, "Failed to send machine ID: %m");
3093 if (l != sizeof(arg_uuid))
3094 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3095 "Short write while sending machine ID.");
3096
3097 l = send_one_fd(notify_socket, fd, 0);
3098 if (l < 0)
3099 return log_error_errno(errno, "Failed to send notify fd: %m");
3100
3101 pid_socket = safe_close(pid_socket);
3102 uuid_socket = safe_close(uuid_socket);
3103 notify_socket = safe_close(notify_socket);
3104 kmsg_socket = safe_close(kmsg_socket);
3105 rtnl_socket = safe_close(rtnl_socket);
3106 netns_fd = safe_close(netns_fd);
3107
3108 return 0;
3109 }
3110
3111 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3112 bool tried_hashed = false;
3113 unsigned n_tries = 100;
3114 uid_t candidate;
3115 int r;
3116
3117 assert(shift);
3118 assert(ret_lock_file);
3119 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3120 assert(arg_uid_range == 0x10000U);
3121
3122 candidate = *shift;
3123
3124 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3125
3126 for (;;) {
3127 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3128 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
3129
3130 if (--n_tries <= 0)
3131 return -EBUSY;
3132
3133 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
3134 goto next;
3135 if ((candidate & UINT32_C(0xFFFF)) != 0)
3136 goto next;
3137
3138 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3139 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3140 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3141 goto next;
3142 if (r < 0)
3143 return r;
3144
3145 /* Make some superficial checks whether the range is currently known in the user database */
3146 if (getpwuid(candidate))
3147 goto next;
3148 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3149 goto next;
3150 if (getgrgid(candidate))
3151 goto next;
3152 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3153 goto next;
3154
3155 *ret_lock_file = lf;
3156 lf = (struct LockFile) LOCK_FILE_INIT;
3157 *shift = candidate;
3158 return 0;
3159
3160 next:
3161 if (arg_machine && !tried_hashed) {
3162 /* Try to hash the base from the container name */
3163
3164 static const uint8_t hash_key[] = {
3165 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3166 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3167 };
3168
3169 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3170
3171 tried_hashed = true;
3172 } else
3173 random_bytes(&candidate, sizeof(candidate));
3174
3175 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
3176 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3177 }
3178 }
3179
3180 static int setup_uid_map(pid_t pid) {
3181 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3182 int r;
3183
3184 assert(pid > 1);
3185
3186 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3187 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3188 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3189 if (r < 0)
3190 return log_error_errno(r, "Failed to write UID map: %m");
3191
3192 /* We always assign the same UID and GID ranges */
3193 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3194 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
3195 if (r < 0)
3196 return log_error_errno(r, "Failed to write GID map: %m");
3197
3198 return 0;
3199 }
3200
3201 static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3202 char buf[NOTIFY_BUFFER_MAX+1];
3203 char *p = NULL;
3204 struct iovec iovec = {
3205 .iov_base = buf,
3206 .iov_len = sizeof(buf)-1,
3207 };
3208 union {
3209 struct cmsghdr cmsghdr;
3210 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3211 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3212 } control = {};
3213 struct msghdr msghdr = {
3214 .msg_iov = &iovec,
3215 .msg_iovlen = 1,
3216 .msg_control = &control,
3217 .msg_controllen = sizeof(control),
3218 };
3219 struct cmsghdr *cmsg;
3220 struct ucred *ucred = NULL;
3221 ssize_t n;
3222 pid_t inner_child_pid;
3223 _cleanup_strv_free_ char **tags = NULL;
3224
3225 assert(userdata);
3226
3227 inner_child_pid = PTR_TO_PID(userdata);
3228
3229 if (revents != EPOLLIN) {
3230 log_warning("Got unexpected poll event for notify fd.");
3231 return 0;
3232 }
3233
3234 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3235 if (n < 0) {
3236 if (IN_SET(errno, EAGAIN, EINTR))
3237 return 0;
3238
3239 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3240 }
3241 cmsg_close_all(&msghdr);
3242
3243 CMSG_FOREACH(cmsg, &msghdr) {
3244 if (cmsg->cmsg_level == SOL_SOCKET &&
3245 cmsg->cmsg_type == SCM_CREDENTIALS &&
3246 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3247
3248 ucred = (struct ucred*) CMSG_DATA(cmsg);
3249 }
3250 }
3251
3252 if (!ucred || ucred->pid != inner_child_pid) {
3253 log_debug("Received notify message without valid credentials. Ignoring.");
3254 return 0;
3255 }
3256
3257 if ((size_t) n >= sizeof(buf)) {
3258 log_warning("Received notify message exceeded maximum size. Ignoring.");
3259 return 0;
3260 }
3261
3262 buf[n] = 0;
3263 tags = strv_split(buf, "\n\r");
3264 if (!tags)
3265 return log_oom();
3266
3267 if (strv_find(tags, "READY=1"))
3268 sd_notifyf(false, "READY=1\n");
3269
3270 p = strv_find_startswith(tags, "STATUS=");
3271 if (p)
3272 sd_notifyf(false, "STATUS=Container running: %s", p);
3273
3274 return 0;
3275 }
3276
3277 static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
3278 int r;
3279
3280 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3281 if (r < 0)
3282 return log_error_errno(r, "Failed to allocate notify event source: %m");
3283
3284 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
3285
3286 return 0;
3287 }
3288
3289 static int merge_settings(Settings *settings, const char *path) {
3290 int rl;
3291
3292 assert(settings);
3293 assert(path);
3294
3295 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3296 * that this steals the fields of the Settings* structure, and hence modifies it. */
3297
3298 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3299 settings->start_mode >= 0) {
3300 arg_start_mode = settings->start_mode;
3301 strv_free_and_replace(arg_parameters, settings->parameters);
3302 }
3303
3304 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3305 arg_ephemeral = settings->ephemeral;
3306
3307 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3308 settings->pivot_root_new) {
3309 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3310 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3311 }
3312
3313 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3314 settings->working_directory)
3315 free_and_replace(arg_chdir, settings->working_directory);
3316
3317 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3318 settings->environment)
3319 strv_free_and_replace(arg_setenv, settings->environment);
3320
3321 if ((arg_settings_mask & SETTING_USER) == 0 &&
3322 settings->user)
3323 free_and_replace(arg_user, settings->user);
3324
3325 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3326 uint64_t plus;
3327
3328 plus = settings->capability;
3329 if (settings_private_network(settings))
3330 plus |= (1ULL << CAP_NET_ADMIN);
3331
3332 if (!arg_settings_trusted && plus != 0) {
3333 if (settings->capability != 0)
3334 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
3335 } else
3336 arg_caps_retain |= plus;
3337
3338 arg_caps_retain &= ~settings->drop_capability;
3339 }
3340
3341 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3342 settings->kill_signal > 0)
3343 arg_kill_signal = settings->kill_signal;
3344
3345 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3346 settings->personality != PERSONALITY_INVALID)
3347 arg_personality = settings->personality;
3348
3349 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3350 !sd_id128_is_null(settings->machine_id)) {
3351
3352 if (!arg_settings_trusted)
3353 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
3354 else
3355 arg_uuid = settings->machine_id;
3356 }
3357
3358 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3359 settings->read_only >= 0)
3360 arg_read_only = settings->read_only;
3361
3362 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3363 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3364 arg_volatile_mode = settings->volatile_mode;
3365
3366 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3367 settings->n_custom_mounts > 0) {
3368
3369 if (!arg_settings_trusted)
3370 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
3371 else {
3372 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3373 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
3374 arg_n_custom_mounts = settings->n_custom_mounts;
3375 settings->n_custom_mounts = 0;
3376 }
3377 }
3378
3379 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3380 (settings->private_network >= 0 ||
3381 settings->network_veth >= 0 ||
3382 settings->network_bridge ||
3383 settings->network_zone ||
3384 settings->network_interfaces ||
3385 settings->network_macvlan ||
3386 settings->network_ipvlan ||
3387 settings->network_veth_extra)) {
3388
3389 if (!arg_settings_trusted)
3390 log_warning("Ignoring network settings, file %s is not trusted.", path);
3391 else {
3392 arg_network_veth = settings_network_veth(settings);
3393 arg_private_network = settings_private_network(settings);
3394
3395 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3396 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3397 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3398 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
3399
3400 free_and_replace(arg_network_bridge, settings->network_bridge);
3401 free_and_replace(arg_network_zone, settings->network_zone);
3402 }
3403 }
3404
3405 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3406 settings->expose_ports) {
3407
3408 if (!arg_settings_trusted)
3409 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
3410 else {
3411 expose_port_free_all(arg_expose_ports);
3412 arg_expose_ports = TAKE_PTR(settings->expose_ports);
3413 }
3414 }
3415
3416 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3417 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3418
3419 if (!arg_settings_trusted)
3420 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
3421 else {
3422 arg_userns_mode = settings->userns_mode;
3423 arg_uid_shift = settings->uid_shift;
3424 arg_uid_range = settings->uid_range;
3425 arg_userns_chown = settings->userns_chown;
3426 }
3427 }
3428
3429 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3430 arg_notify_ready = settings->notify_ready;
3431
3432 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3433
3434 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
3435 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
3436 else {
3437 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3438 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
3439 }
3440 }
3441
3442 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3443 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3444 continue;
3445
3446 if (!settings->rlimit[rl])
3447 continue;
3448
3449 if (!arg_settings_trusted) {
3450 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
3451 continue;
3452 }
3453
3454 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3455 }
3456
3457 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3458 settings->hostname)
3459 free_and_replace(arg_hostname, settings->hostname);
3460
3461 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3462 settings->no_new_privileges >= 0)
3463 arg_no_new_privileges = settings->no_new_privileges;
3464
3465 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3466 settings->oom_score_adjust_set) {
3467
3468 if (!arg_settings_trusted)
3469 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
3470 else {
3471 arg_oom_score_adjust = settings->oom_score_adjust;
3472 arg_oom_score_adjust_set = true;
3473 }
3474 }
3475
3476 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3477 settings->cpuset) {
3478
3479 if (!arg_settings_trusted)
3480 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
3481 else {
3482 if (arg_cpuset)
3483 CPU_FREE(arg_cpuset);
3484 arg_cpuset = TAKE_PTR(settings->cpuset);
3485 arg_cpuset_ncpus = settings->cpuset_ncpus;
3486 }
3487 }
3488
3489 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3490 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3491 arg_resolv_conf = settings->resolv_conf;
3492
3493 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3494 settings->link_journal != _LINK_JOURNAL_INVALID) {
3495
3496 if (!arg_settings_trusted)
3497 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3498 else {
3499 arg_link_journal = settings->link_journal;
3500 arg_link_journal_try = settings->link_journal_try;
3501 }
3502 }
3503
3504 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3505 settings->timezone != _TIMEZONE_MODE_INVALID)
3506 arg_timezone = settings->timezone;
3507
3508 return 0;
3509 }
3510
3511 static int load_settings(void) {
3512 _cleanup_(settings_freep) Settings *settings = NULL;
3513 _cleanup_fclose_ FILE *f = NULL;
3514 _cleanup_free_ char *p = NULL;
3515 const char *fn, *i;
3516 int r;
3517
3518 /* If all settings are masked, there's no point in looking for
3519 * the settings file */
3520 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3521 return 0;
3522
3523 fn = strjoina(arg_machine, ".nspawn");
3524
3525 /* We first look in the admin's directories in /etc and /run */
3526 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3527 _cleanup_free_ char *j = NULL;
3528
3529 j = strjoin(i, "/", fn);
3530 if (!j)
3531 return log_oom();
3532
3533 f = fopen(j, "re");
3534 if (f) {
3535 p = TAKE_PTR(j);
3536
3537 /* By default, we trust configuration from /etc and /run */
3538 if (arg_settings_trusted < 0)
3539 arg_settings_trusted = true;
3540
3541 break;
3542 }
3543
3544 if (errno != ENOENT)
3545 return log_error_errno(errno, "Failed to open %s: %m", j);
3546 }
3547
3548 if (!f) {
3549 /* After that, let's look for a file next to the
3550 * actual image we shall boot. */
3551
3552 if (arg_image) {
3553 p = file_in_same_dir(arg_image, fn);
3554 if (!p)
3555 return log_oom();
3556 } else if (arg_directory) {
3557 p = file_in_same_dir(arg_directory, fn);
3558 if (!p)
3559 return log_oom();
3560 }
3561
3562 if (p) {
3563 f = fopen(p, "re");
3564 if (!f && errno != ENOENT)
3565 return log_error_errno(errno, "Failed to open %s: %m", p);
3566
3567 /* By default, we do not trust configuration from /var/lib/machines */
3568 if (arg_settings_trusted < 0)
3569 arg_settings_trusted = false;
3570 }
3571 }
3572
3573 if (!f)
3574 return 0;
3575
3576 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3577
3578 r = settings_load(f, p, &settings);
3579 if (r < 0)
3580 return r;
3581
3582 return merge_settings(settings, p);
3583 }
3584
3585 static int run(int master,
3586 const char* console,
3587 DissectedImage *dissected_image,
3588 bool interactive,
3589 bool secondary,
3590 FDSet *fds,
3591 char veth_name[IFNAMSIZ], bool *veth_created,
3592 union in_addr_union *exposed,
3593 pid_t *pid, int *ret) {
3594
3595 static const struct sigaction sa = {
3596 .sa_handler = nop_signal_handler,
3597 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
3598 };
3599
3600 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
3601 _cleanup_close_ int etc_passwd_lock = -1;
3602 _cleanup_close_pair_ int
3603 kmsg_socket_pair[2] = { -1, -1 },
3604 rtnl_socket_pair[2] = { -1, -1 },
3605 pid_socket_pair[2] = { -1, -1 },
3606 uuid_socket_pair[2] = { -1, -1 },
3607 notify_socket_pair[2] = { -1, -1 },
3608 uid_shift_socket_pair[2] = { -1, -1 },
3609 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3610
3611 _cleanup_close_ int notify_socket= -1;
3612 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3613 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
3614 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3615 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3616 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3617 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
3618 ContainerStatus container_status = 0;
3619 char last_char = 0;
3620 int ifi = 0, r;
3621 ssize_t l;
3622 sigset_t mask_chld;
3623 _cleanup_close_ int netns_fd = -1;
3624
3625 assert_se(sigemptyset(&mask_chld) == 0);
3626 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3627
3628 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3629 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3630 * check with getpwuid() if the specific user already exists. Note that /etc might be
3631 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3632 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3633 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3634 * really ours. */
3635
3636 etc_passwd_lock = take_etc_passwd_lock(NULL);
3637 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3638 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3639 }
3640
3641 r = barrier_create(&barrier);
3642 if (r < 0)
3643 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3644
3645 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3646 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3647
3648 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3649 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3650
3651 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3652 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3653
3654 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3655 return log_error_errno(errno, "Failed to create id socket pair: %m");
3656
3657 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3658 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3659
3660 if (arg_userns_mode != USER_NAMESPACE_NO)
3661 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3662 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3663
3664 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3665 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3666 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3667
3668 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3669 * parent's blocking calls and give it a chance to call wait() and terminate. */
3670 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3671 if (r < 0)
3672 return log_error_errno(errno, "Failed to change the signal mask: %m");
3673
3674 r = sigaction(SIGCHLD, &sa, NULL);
3675 if (r < 0)
3676 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3677
3678 if (arg_network_namespace_path) {
3679 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3680 if (netns_fd < 0)
3681 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3682
3683 r = fd_is_network_ns(netns_fd);
3684 if (r == -EUCLEAN)
3685 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
3686 else if (r < 0)
3687 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
3688 else if (r == 0) {
3689 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
3690 return -EINVAL;
3691 }
3692 }
3693
3694 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3695 if (*pid < 0)
3696 return log_error_errno(errno, "clone() failed%s: %m",
3697 errno == EINVAL ?
3698 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3699
3700 if (*pid == 0) {
3701 /* The outer child only has a file system namespace. */
3702 barrier_set_role(&barrier, BARRIER_CHILD);
3703
3704 master = safe_close(master);
3705
3706 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3707 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3708 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3709 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3710 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3711 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3712 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
3713
3714 (void) reset_all_signal_handlers();
3715 (void) reset_signal_mask();
3716
3717 r = outer_child(&barrier,
3718 arg_directory,
3719 console,
3720 dissected_image,
3721 interactive,
3722 secondary,
3723 pid_socket_pair[1],
3724 uuid_socket_pair[1],
3725 notify_socket_pair[1],
3726 kmsg_socket_pair[1],
3727 rtnl_socket_pair[1],
3728 uid_shift_socket_pair[1],
3729 unified_cgroup_hierarchy_socket_pair[1],
3730 fds,
3731 netns_fd);
3732 if (r < 0)
3733 _exit(EXIT_FAILURE);
3734
3735 _exit(EXIT_SUCCESS);
3736 }
3737
3738 barrier_set_role(&barrier, BARRIER_PARENT);
3739
3740 fds = fdset_free(fds);
3741
3742 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3743 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3744 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3745 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3746 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3747 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3748 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
3749
3750 if (arg_userns_mode != USER_NAMESPACE_NO) {
3751 /* The child just let us know the UID shift it might have read from the image. */
3752 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3753 if (l < 0)
3754 return log_error_errno(errno, "Failed to read UID shift: %m");
3755 if (l != sizeof arg_uid_shift) {
3756 log_error("Short read while reading UID shift.");
3757 return -EIO;
3758 }
3759
3760 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3761 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3762 * image, but if that's already in use, pick a new one, and report back to the child,
3763 * which one we now picked. */
3764
3765 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3766 if (r < 0)
3767 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3768
3769 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3770 if (l < 0)
3771 return log_error_errno(errno, "Failed to send UID shift: %m");
3772 if (l != sizeof arg_uid_shift) {
3773 log_error("Short write while writing UID shift.");
3774 return -EIO;
3775 }
3776 }
3777 }
3778
3779 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3780 /* The child let us know the support cgroup mode it might have read from the image. */
3781 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3782 if (l < 0)
3783 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3784 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
3785 log_error("Short read while reading cgroup mode (%zu bytes).%s",
3786 l, l == 0 ? " The child is most likely dead." : "");
3787 return -EIO;
3788 }
3789 }
3790
3791 /* Wait for the outer child. */
3792 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3793 if (r < 0)
3794 return r;
3795 if (r != EXIT_SUCCESS)
3796 return -EIO;
3797
3798 /* And now retrieve the PID of the inner child. */
3799 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3800 if (l < 0)
3801 return log_error_errno(errno, "Failed to read inner child PID: %m");
3802 if (l != sizeof *pid) {
3803 log_error("Short read while reading inner child PID.");
3804 return -EIO;
3805 }
3806
3807 /* We also retrieve container UUID in case it was generated by outer child */
3808 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3809 if (l < 0)
3810 return log_error_errno(errno, "Failed to read container machine ID: %m");
3811 if (l != sizeof(arg_uuid)) {
3812 log_error("Short read while reading container machined ID.");
3813 return -EIO;
3814 }
3815
3816 /* We also retrieve the socket used for notifications generated by outer child */
3817 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3818 if (notify_socket < 0)
3819 return log_error_errno(notify_socket,
3820 "Failed to receive notification socket from the outer child: %m");
3821
3822 log_debug("Init process invoked as PID "PID_FMT, *pid);
3823
3824 if (arg_userns_mode != USER_NAMESPACE_NO) {
3825 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3826 log_error("Child died too early.");
3827 return -ESRCH;
3828 }
3829
3830 r = setup_uid_map(*pid);
3831 if (r < 0)
3832 return r;
3833
3834 (void) barrier_place(&barrier); /* #2 */
3835 }
3836
3837 if (arg_private_network) {
3838 if (!arg_network_namespace_path) {
3839 /* Wait until the child has unshared its network namespace. */
3840 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3841 log_error("Child died too early");
3842 return -ESRCH;
3843 }
3844 }
3845
3846 r = move_network_interfaces(*pid, arg_network_interfaces);
3847 if (r < 0)
3848 return r;
3849
3850 if (arg_network_veth) {
3851 r = setup_veth(arg_machine, *pid, veth_name,
3852 arg_network_bridge || arg_network_zone);
3853 if (r < 0)
3854 return r;
3855 else if (r > 0)
3856 ifi = r;
3857
3858 if (arg_network_bridge) {
3859 /* Add the interface to a bridge */
3860 r = setup_bridge(veth_name, arg_network_bridge, false);
3861 if (r < 0)
3862 return r;
3863 if (r > 0)
3864 ifi = r;
3865 } else if (arg_network_zone) {
3866 /* Add the interface to a bridge, possibly creating it */
3867 r = setup_bridge(veth_name, arg_network_zone, true);
3868 if (r < 0)
3869 return r;
3870 if (r > 0)
3871 ifi = r;
3872 }
3873 }
3874
3875 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3876 if (r < 0)
3877 return r;
3878
3879 /* We created the primary and extra veth links now; let's remember this, so that we know to
3880 remove them later on. Note that we don't bother with removing veth links that were created
3881 here when their setup failed half-way, because in that case the kernel should be able to
3882 remove them on its own, since they cannot be referenced by anything yet. */
3883 *veth_created = true;
3884
3885 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3886 if (r < 0)
3887 return r;
3888
3889 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3890 if (r < 0)
3891 return r;
3892 }
3893
3894 if (arg_register || !arg_keep_unit) {
3895 r = sd_bus_default_system(&bus);
3896 if (r < 0)
3897 return log_error_errno(r, "Failed to open system bus: %m");
3898
3899 r = sd_bus_set_close_on_exit(bus, false);
3900 if (r < 0)
3901 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
3902 }
3903
3904 if (!arg_keep_unit) {
3905 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3906 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3907 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3908
3909 r = sd_bus_match_signal_async(
3910 bus,
3911 NULL,
3912 "org.freedesktop.systemd1",
3913 NULL,
3914 "org.freedesktop.systemd1.Scope",
3915 "RequestStop",
3916 on_request_stop, NULL, PID_TO_PTR(*pid));
3917 if (r < 0)
3918 return log_error_errno(r, "Failed to request RequestStop match: %m");
3919 }
3920
3921 if (arg_register) {
3922 r = register_machine(
3923 bus,
3924 arg_machine,
3925 *pid,
3926 arg_directory,
3927 arg_uuid,
3928 ifi,
3929 arg_slice,
3930 arg_custom_mounts, arg_n_custom_mounts,
3931 arg_kill_signal,
3932 arg_property,
3933 arg_keep_unit,
3934 arg_container_service_name);
3935 if (r < 0)
3936 return r;
3937
3938 } else if (!arg_keep_unit) {
3939 r = allocate_scope(
3940 bus,
3941 arg_machine,
3942 *pid,
3943 arg_slice,
3944 arg_custom_mounts, arg_n_custom_mounts,
3945 arg_kill_signal,
3946 arg_property);
3947 if (r < 0)
3948 return r;
3949
3950 } else if (arg_slice || arg_property)
3951 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
3952
3953 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
3954 if (r < 0)
3955 return r;
3956
3957 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
3958 if (r < 0)
3959 return r;
3960
3961 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
3962 if (r < 0)
3963 return r;
3964
3965 /* Notify the child that the parent is ready with all
3966 * its setup (including cgroup-ification), and that
3967 * the child can now hand over control to the code to
3968 * run inside the container. */
3969 (void) barrier_place(&barrier); /* #4 */
3970
3971 /* Block SIGCHLD here, before notifying child.
3972 * process_pty() will handle it with the other signals. */
3973 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3974
3975 /* Reset signal to default */
3976 r = default_signals(SIGCHLD, -1);
3977 if (r < 0)
3978 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3979
3980 r = sd_event_new(&event);
3981 if (r < 0)
3982 return log_error_errno(r, "Failed to get default event source: %m");
3983
3984 (void) sd_event_set_watchdog(event, true);
3985
3986 if (bus) {
3987 r = sd_bus_attach_event(bus, event, 0);
3988 if (r < 0)
3989 return log_error_errno(r, "Failed to attach bus to event loop: %m");
3990 }
3991
3992 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
3993 if (r < 0)
3994 return r;
3995
3996 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3997 if (!barrier_place_and_sync(&barrier)) { /* #5 */
3998 log_error("Child died too early.");
3999 return -ESRCH;
4000 }
4001
4002 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4003 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4004 etc_passwd_lock = safe_close(etc_passwd_lock);
4005
4006 sd_notifyf(false,
4007 "STATUS=Container running.\n"
4008 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4009 if (!arg_notify_ready)
4010 (void) sd_notify(false, "READY=1\n");
4011
4012 if (arg_kill_signal > 0) {
4013 /* Try to kill the init system on SIGINT or SIGTERM */
4014 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4015 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
4016 } else {
4017 /* Immediately exit */
4018 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4019 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4020 }
4021
4022 /* Exit when the child exits */
4023 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
4024
4025 if (arg_expose_ports) {
4026 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4027 if (r < 0)
4028 return r;
4029
4030 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4031 }
4032
4033 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4034
4035 r = pty_forward_new(event, master,
4036 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
4037 &forward);
4038 if (r < 0)
4039 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4040
4041 r = sd_event_loop(event);
4042 if (r < 0)
4043 return log_error_errno(r, "Failed to run event loop: %m");
4044
4045 pty_forward_get_last_char(forward, &last_char);
4046
4047 forward = pty_forward_free(forward);
4048
4049 if (!arg_quiet && last_char != '\n')
4050 putc('\n', stdout);
4051
4052 /* Kill if it is not dead yet anyway */
4053 if (bus) {
4054 if (arg_register)
4055 terminate_machine(bus, arg_machine);
4056 else if (!arg_keep_unit)
4057 terminate_scope(bus, arg_machine);
4058 }
4059
4060 /* Normally redundant, but better safe than sorry */
4061 (void) kill(*pid, SIGKILL);
4062
4063 r = wait_for_container(*pid, &container_status);
4064 *pid = 0;
4065
4066 if (r < 0)
4067 /* We failed to wait for the container, or the container exited abnormally. */
4068 return r;
4069 if (r > 0 || container_status == CONTAINER_TERMINATED) {
4070 /* r > 0 → The container exited with a non-zero status.
4071 * As a special case, we need to replace 133 with a different value,
4072 * because 133 is special-cased in the service file to reboot the container.
4073 * otherwise → The container exited with zero status and a reboot was not requested.
4074 */
4075 if (r == EXIT_FORCE_RESTART)
4076 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4077 *ret = r;
4078 return 0; /* finito */
4079 }
4080
4081 /* CONTAINER_REBOOTED, loop again */
4082
4083 if (arg_keep_unit) {
4084 /* Special handling if we are running as a service: instead of simply
4085 * restarting the machine we want to restart the entire service, so let's
4086 * inform systemd about this with the special exit code 133. The service
4087 * file uses RestartForceExitStatus=133 so that this results in a full
4088 * nspawn restart. This is necessary since we might have cgroup parameters
4089 * set we want to have flushed out. */
4090 *ret = EXIT_FORCE_RESTART;
4091 return 0; /* finito */
4092 }
4093
4094 expose_port_flush(arg_expose_ports, exposed);
4095
4096 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4097 *veth_created = false;
4098 return 1; /* loop again */
4099 }
4100
4101 static int initialize_rlimits(void) {
4102 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4103 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4104 * container execution environments. */
4105
4106 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4107 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4108 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4109 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4110 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4111 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4112 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4113 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4114 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4115 [RLIMIT_NICE] = { 0, 0 },
4116 [RLIMIT_NOFILE] = { 1024, 4096 },
4117 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4118 [RLIMIT_RTPRIO] = { 0, 0 },
4119 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4120 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4121
4122 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4123 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4124 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4125 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4126 * that PID 1 changes a number of other resource limits during early initialization which is why we
4127 * don't read the other limits from PID 1 but prefer the static table above. */
4128 };
4129
4130 int rl;
4131
4132 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4133 /* Let's only fill in what the user hasn't explicitly configured anyway */
4134 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4135 const struct rlimit *v;
4136 struct rlimit buffer;
4137
4138 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4139 /* For these two let's read the limits off PID 1. See above for an explanation. */
4140
4141 if (prlimit(1, rl, NULL, &buffer) < 0)
4142 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4143
4144 v = &buffer;
4145 } else
4146 v = kernel_defaults + rl;
4147
4148 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4149 if (!arg_rlimit[rl])
4150 return log_oom();
4151 }
4152
4153 if (DEBUG_LOGGING) {
4154 _cleanup_free_ char *k = NULL;
4155
4156 (void) rlimit_format(arg_rlimit[rl], &k);
4157 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4158 }
4159 }
4160
4161 return 0;
4162 }
4163
4164 int main(int argc, char *argv[]) {
4165 _cleanup_free_ char *console = NULL;
4166 _cleanup_close_ int master = -1;
4167 _cleanup_fdset_free_ FDSet *fds = NULL;
4168 int r, n_fd_passed, ret = EXIT_SUCCESS;
4169 char veth_name[IFNAMSIZ] = "";
4170 bool secondary = false, remove_directory = false, remove_image = false;
4171 pid_t pid = 0;
4172 union in_addr_union exposed = {};
4173 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4174 bool interactive, veth_created = false, remove_tmprootdir = false;
4175 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
4176 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
4177 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4178 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
4179
4180 log_parse_environment();
4181 log_open();
4182
4183 /* Make sure rename_process() in the stub init process can work */
4184 saved_argv = argv;
4185 saved_argc = argc;
4186
4187 r = parse_argv(argc, argv);
4188 if (r <= 0)
4189 goto finish;
4190
4191 r = must_be_root();
4192 if (r < 0)
4193 goto finish;
4194
4195 r = initialize_rlimits();
4196 if (r < 0)
4197 goto finish;
4198
4199 r = determine_names();
4200 if (r < 0)
4201 goto finish;
4202
4203 r = load_settings();
4204 if (r < 0)
4205 goto finish;
4206
4207 parse_environment();
4208
4209 r = cg_unified_flush();
4210 if (r < 0) {
4211 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
4212 goto finish;
4213 }
4214
4215 r = verify_arguments();
4216 if (r < 0)
4217 goto finish;
4218
4219 r = detect_unified_cgroup_hierarchy_from_environment();
4220 if (r < 0)
4221 goto finish;
4222
4223 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
4224 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
4225 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
4226 (void) ignore_signals(SIGPIPE, -1);
4227
4228 n_fd_passed = sd_listen_fds(false);
4229 if (n_fd_passed > 0) {
4230 r = fdset_new_listen_fds(&fds, false);
4231 if (r < 0) {
4232 log_error_errno(r, "Failed to collect file descriptors: %m");
4233 goto finish;
4234 }
4235 }
4236
4237 /* The "default" umask. This is appropriate for most file and directory
4238 * operations performed by nspawn, and is the umask that will be used for
4239 * the child. Functions like copy_devnodes() change the umask temporarily. */
4240 umask(0022);
4241
4242 if (arg_directory) {
4243 assert(!arg_image);
4244
4245 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4246 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4247 r = -EINVAL;
4248 goto finish;
4249 }
4250
4251 if (arg_ephemeral) {
4252 _cleanup_free_ char *np = NULL;
4253
4254 r = chase_symlinks_and_update(&arg_directory, 0);
4255 if (r < 0)
4256 goto finish;
4257
4258 /* If the specified path is a mount point we
4259 * generate the new snapshot immediately
4260 * inside it under a random name. However if
4261 * the specified is not a mount point we
4262 * create the new snapshot in the parent
4263 * directory, just next to it. */
4264 r = path_is_mount_point(arg_directory, NULL, 0);
4265 if (r < 0) {
4266 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4267 goto finish;
4268 }
4269 if (r > 0)
4270 r = tempfn_random_child(arg_directory, "machine.", &np);
4271 else
4272 r = tempfn_random(arg_directory, "machine.", &np);
4273 if (r < 0) {
4274 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
4275 goto finish;
4276 }
4277
4278 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4279 if (r < 0) {
4280 log_error_errno(r, "Failed to lock %s: %m", np);
4281 goto finish;
4282 }
4283
4284 r = btrfs_subvol_snapshot(arg_directory, np,
4285 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4286 BTRFS_SNAPSHOT_FALLBACK_COPY |
4287 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4288 BTRFS_SNAPSHOT_RECURSIVE |
4289 BTRFS_SNAPSHOT_QUOTA);
4290 if (r < 0) {
4291 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4292 goto finish;
4293 }
4294
4295 free_and_replace(arg_directory, np);
4296
4297 remove_directory = true;
4298
4299 } else {
4300 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
4301 if (r < 0)
4302 goto finish;
4303
4304 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4305 if (r == -EBUSY) {
4306 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4307 goto finish;
4308 }
4309 if (r < 0) {
4310 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4311 goto finish;
4312 }
4313
4314 if (arg_template) {
4315 r = chase_symlinks_and_update(&arg_template, 0);
4316 if (r < 0)
4317 goto finish;
4318
4319 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4320 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4321 BTRFS_SNAPSHOT_FALLBACK_COPY |
4322 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4323 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4324 BTRFS_SNAPSHOT_RECURSIVE |
4325 BTRFS_SNAPSHOT_QUOTA);
4326 if (r == -EEXIST)
4327 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4328 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4329 else if (r < 0) {
4330 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4331 goto finish;
4332 } else
4333 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4334 "Populated %s from template %s.", arg_directory, arg_template);
4335 }
4336 }
4337
4338 if (arg_start_mode == START_BOOT) {
4339 const char *p;
4340
4341 if (arg_pivot_root_new)
4342 p = prefix_roota(arg_directory, arg_pivot_root_new);
4343 else
4344 p = arg_directory;
4345
4346 if (path_is_os_tree(p) <= 0) {
4347 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
4348 r = -EINVAL;
4349 goto finish;
4350 }
4351 } else {
4352 const char *p, *q;
4353
4354 if (arg_pivot_root_new)
4355 p = prefix_roota(arg_directory, arg_pivot_root_new);
4356 else
4357 p = arg_directory;
4358
4359 q = strjoina(p, "/usr/");
4360
4361 if (laccess(q, F_OK) < 0) {
4362 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
4363 r = -EINVAL;
4364 goto finish;
4365 }
4366 }
4367
4368 } else {
4369 assert(arg_image);
4370 assert(!arg_template);
4371
4372 r = chase_symlinks_and_update(&arg_image, 0);
4373 if (r < 0)
4374 goto finish;
4375
4376 if (arg_ephemeral) {
4377 _cleanup_free_ char *np = NULL;
4378
4379 r = tempfn_random(arg_image, "machine.", &np);
4380 if (r < 0) {
4381 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4382 goto finish;
4383 }
4384
4385 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4386 if (r < 0) {
4387 r = log_error_errno(r, "Failed to create image lock: %m");
4388 goto finish;
4389 }
4390
4391 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
4392 if (r < 0) {
4393 r = log_error_errno(r, "Failed to copy image file: %m");
4394 goto finish;
4395 }
4396
4397 free_and_replace(arg_image, np);
4398
4399 remove_image = true;
4400 } else {
4401 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4402 if (r == -EBUSY) {
4403 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4404 goto finish;
4405 }
4406 if (r < 0) {
4407 r = log_error_errno(r, "Failed to create image lock: %m");
4408 goto finish;
4409 }
4410
4411 if (!arg_root_hash) {
4412 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4413 if (r < 0) {
4414 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4415 goto finish;
4416 }
4417 }
4418 }
4419
4420 if (!mkdtemp(tmprootdir)) {
4421 r = log_error_errno(errno, "Failed to create temporary directory: %m");
4422 goto finish;
4423 }
4424
4425 remove_tmprootdir = true;
4426
4427 arg_directory = strdup(tmprootdir);
4428 if (!arg_directory) {
4429 r = log_oom();
4430 goto finish;
4431 }
4432
4433 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4434 if (r < 0) {
4435 log_error_errno(r, "Failed to set up loopback block device: %m");
4436 goto finish;
4437 }
4438
4439 r = dissect_image_and_warn(
4440 loop->fd,
4441 arg_image,
4442 arg_root_hash, arg_root_hash_size,
4443 DISSECT_IMAGE_REQUIRE_ROOT,
4444 &dissected_image);
4445 if (r == -ENOPKG) {
4446 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
4447 log_notice("Note that the disk image needs to\n"
4448 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4449 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4450 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4451 " d) or contain a file system without a partition table\n"
4452 "in order to be bootable with systemd-nspawn.");
4453 goto finish;
4454 }
4455 if (r < 0)
4456 goto finish;
4457
4458 if (!arg_root_hash && dissected_image->can_verity)
4459 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4460
4461 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
4462 if (r < 0)
4463 goto finish;
4464
4465 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4466 if (remove_image && unlink(arg_image) >= 0)
4467 remove_image = false;
4468 }
4469
4470 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
4471 if (r < 0)
4472 goto finish;
4473
4474 interactive =
4475 isatty(STDIN_FILENO) > 0 &&
4476 isatty(STDOUT_FILENO) > 0;
4477
4478 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
4479 if (master < 0) {
4480 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4481 goto finish;
4482 }
4483
4484 r = ptsname_malloc(master, &console);
4485 if (r < 0) {
4486 r = log_error_errno(r, "Failed to determine tty name: %m");
4487 goto finish;
4488 }
4489
4490 if (arg_selinux_apifs_context) {
4491 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4492 if (r < 0)
4493 goto finish;
4494 }
4495
4496 if (unlockpt(master) < 0) {
4497 r = log_error_errno(errno, "Failed to unlock tty: %m");
4498 goto finish;
4499 }
4500
4501 if (!arg_quiet)
4502 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4503 arg_machine, arg_image ?: arg_directory);
4504
4505 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4506
4507 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
4508 r = log_error_errno(errno, "Failed to become subreaper: %m");
4509 goto finish;
4510 }
4511
4512 for (;;) {
4513 r = run(master,
4514 console,
4515 dissected_image,
4516 interactive, secondary,
4517 fds,
4518 veth_name, &veth_created,
4519 &exposed,
4520 &pid, &ret);
4521 if (r <= 0)
4522 break;
4523 }
4524
4525 finish:
4526 sd_notify(false,
4527 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4528 "STOPPING=1\nSTATUS=Terminating...");
4529
4530 if (pid > 0)
4531 (void) kill(pid, SIGKILL);
4532
4533 /* Try to flush whatever is still queued in the pty */
4534 if (master >= 0) {
4535 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
4536 master = safe_close(master);
4537 }
4538
4539 if (pid > 0)
4540 (void) wait_for_terminate(pid, NULL);
4541
4542 pager_close();
4543
4544 if (remove_directory && arg_directory) {
4545 int k;
4546
4547 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
4548 if (k < 0)
4549 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
4550 }
4551
4552 if (remove_image && arg_image) {
4553 if (unlink(arg_image) < 0)
4554 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4555 }
4556
4557 if (remove_tmprootdir) {
4558 if (rmdir(tmprootdir) < 0)
4559 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4560 }
4561
4562 if (arg_machine) {
4563 const char *p;
4564
4565 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4566 (void) rm_rf(p, REMOVE_ROOT);
4567 }
4568
4569 expose_port_flush(arg_expose_ports, &exposed);
4570
4571 if (veth_created)
4572 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4573 (void) remove_bridge(arg_network_zone);
4574
4575 free(arg_directory);
4576 free(arg_template);
4577 free(arg_image);
4578 free(arg_machine);
4579 free(arg_hostname);
4580 free(arg_user);
4581 free(arg_pivot_root_new);
4582 free(arg_pivot_root_old);
4583 free(arg_chdir);
4584 strv_free(arg_setenv);
4585 free(arg_network_bridge);
4586 strv_free(arg_network_interfaces);
4587 strv_free(arg_network_macvlan);
4588 strv_free(arg_network_ipvlan);
4589 strv_free(arg_network_veth_extra);
4590 strv_free(arg_parameters);
4591 free(arg_network_zone);
4592 free(arg_network_namespace_path);
4593 strv_free(arg_property);
4594 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4595 expose_port_free_all(arg_expose_ports);
4596 free(arg_root_hash);
4597 rlimit_free_all(arg_rlimit);
4598 strv_free(arg_syscall_whitelist);
4599 strv_free(arg_syscall_blacklist);
4600 arg_cpuset = cpu_set_mfree(arg_cpuset);
4601
4602 return r < 0 ? EXIT_FAILURE : ret;
4603 }