]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: complain if people still use --share-system
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
88213476
LP
6***/
7
349cc4a5 8#if HAVE_BLKID
6b5cf3ea 9#include <blkid.h>
8fe0087e 10#endif
88213476 11#include <errno.h>
88213476 12#include <getopt.h>
0e7ac751 13#include <grp.h>
1b9e5b12 14#include <linux/loop.h>
0e7ac751 15#include <pwd.h>
8fe0087e 16#include <sched.h>
349cc4a5 17#if HAVE_SELINUX
8fe0087e 18#include <selinux/selinux.h>
1b9e5b12 19#endif
8fe0087e
LP
20#include <signal.h>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <sys/file.h>
25#include <sys/mount.h>
26#include <sys/personality.h>
27#include <sys/prctl.h>
28#include <sys/types.h>
6916b164 29#include <sys/wait.h>
8fe0087e 30#include <unistd.h>
1b9e5b12 31
b053cd5f 32#include "sd-bus.h"
1f0cd86b 33#include "sd-daemon.h"
1f0cd86b 34#include "sd-id128.h"
8fe0087e 35
b5efdb8a 36#include "alloc-util.h"
8fe0087e
LP
37#include "barrier.h"
38#include "base-filesystem.h"
39#include "blkid-util.h"
40#include "btrfs-util.h"
b8ea7a6e 41#include "bus-error.h"
b053cd5f 42#include "bus-util.h"
8fe0087e 43#include "cap-list.h"
430f0182 44#include "capability-util.h"
04d391da 45#include "cgroup-util.h"
8fe0087e 46#include "copy.h"
d107bb7d 47#include "cpu-set-util.h"
4fc9982c 48#include "dev-setup.h"
2d845785 49#include "dissect-image.h"
8fe0087e 50#include "env-util.h"
3ffd4af2 51#include "fd-util.h"
842f3b0f 52#include "fdset.h"
a5c32cff 53#include "fileio.h"
f97b34a6 54#include "format-util.h"
f4f15635 55#include "fs-util.h"
1b9e5b12 56#include "gpt.h"
4623e8e6 57#include "hexdecoct.h"
8fe0087e 58#include "hostname-util.h"
910fd145 59#include "id128-util.h"
8fe0087e 60#include "log.h"
2d845785 61#include "loop-util.h"
8fe0087e 62#include "loopback-setup.h"
1b9cebf6 63#include "machine-image.h"
8fe0087e
LP
64#include "macro.h"
65#include "missing.h"
66#include "mkdir.h"
4349cd7c 67#include "mount-util.h"
8fe0087e 68#include "netlink-util.h"
07630cea 69#include "nspawn-cgroup.h"
3603efde 70#include "nspawn-def.h"
07630cea
LP
71#include "nspawn-expose-ports.h"
72#include "nspawn-mount.h"
73#include "nspawn-network.h"
7336138e 74#include "nspawn-patch-uid.h"
07630cea 75#include "nspawn-register.h"
910fd145 76#include "nspawn-seccomp.h"
07630cea
LP
77#include "nspawn-settings.h"
78#include "nspawn-setuid.h"
7732f92b 79#include "nspawn-stub-pid1.h"
50ebcf6c 80#include "pager.h"
6bedfcbb 81#include "parse-util.h"
8fe0087e 82#include "path-util.h"
0b452006 83#include "process-util.h"
8fe0087e
LP
84#include "ptyfwd.h"
85#include "random-util.h"
8869a0b4 86#include "raw-clone.h"
bf428efb 87#include "rlimit-util.h"
8fe0087e 88#include "rm-rf.h"
68b02049 89#include "selinux-util.h"
8fe0087e 90#include "signal-util.h"
2583fbea 91#include "socket-util.h"
8fcde012 92#include "stat-util.h"
15a5e950 93#include "stdio-util.h"
5c828e66 94#include "string-table.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
62b1e758
YW
103#if HAVE_SPLIT_USR
104#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
105#else
106#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
107#endif
108
9c1e04d0
AP
/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to
 * the container. The init process in the container can send messages to nspawn over this socket, following
 * the sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
2a49b612
ZJS
114#define EXIT_FORCE_RESTART 133
115
113cea80
DH
116typedef enum ContainerStatus {
117 CONTAINER_TERMINATED,
118 CONTAINER_REBOOTED
119} ContainerStatus;
120
88213476 121static char *arg_directory = NULL;
ec16945e 122static char *arg_template = NULL;
5f932eb9 123static char *arg_chdir = NULL;
b53ede69
PW
124static char *arg_pivot_root_new = NULL;
125static char *arg_pivot_root_old = NULL;
687d0825 126static char *arg_user = NULL;
9444b1f2 127static sd_id128_t arg_uuid = {};
3a9530e5
LP
128static char *arg_machine = NULL; /* The name used by the host to refer to this */
129static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
130static const char *arg_selinux_context = NULL;
131static const char *arg_selinux_apifs_context = NULL;
9444b1f2 132static const char *arg_slice = NULL;
ff01d048 133static bool arg_private_network = false;
bc2f673e 134static bool arg_read_only = false;
7732f92b 135static StartMode arg_start_mode = START_PID1;
ec16945e 136static bool arg_ephemeral = false;
57fb9fb5 137static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 138static bool arg_link_journal_try = false;
520e0d54 139static uint64_t arg_caps_retain =
50b52222
LP
140 (1ULL << CAP_AUDIT_CONTROL) |
141 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
142 (1ULL << CAP_CHOWN) |
143 (1ULL << CAP_DAC_OVERRIDE) |
144 (1ULL << CAP_DAC_READ_SEARCH) |
145 (1ULL << CAP_FOWNER) |
146 (1ULL << CAP_FSETID) |
147 (1ULL << CAP_IPC_OWNER) |
148 (1ULL << CAP_KILL) |
149 (1ULL << CAP_LEASE) |
150 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 151 (1ULL << CAP_MKNOD) |
5076f0cc
LP
152 (1ULL << CAP_NET_BIND_SERVICE) |
153 (1ULL << CAP_NET_BROADCAST) |
154 (1ULL << CAP_NET_RAW) |
5076f0cc 155 (1ULL << CAP_SETFCAP) |
50b52222 156 (1ULL << CAP_SETGID) |
5076f0cc
LP
157 (1ULL << CAP_SETPCAP) |
158 (1ULL << CAP_SETUID) |
159 (1ULL << CAP_SYS_ADMIN) |
50b52222 160 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
161 (1ULL << CAP_SYS_CHROOT) |
162 (1ULL << CAP_SYS_NICE) |
163 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 164 (1ULL << CAP_SYS_RESOURCE) |
50b52222 165 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538 166static CustomMount *arg_custom_mounts = NULL;
88614c8a 167static size_t arg_n_custom_mounts = 0;
f4889f65 168static char **arg_setenv = NULL;
284c0b91 169static bool arg_quiet = false;
eb91eb18 170static bool arg_register = true;
89f7c846 171static bool arg_keep_unit = false;
aa28aefe 172static char **arg_network_interfaces = NULL;
c74e630d 173static char **arg_network_macvlan = NULL;
4bbfe7ad 174static char **arg_network_ipvlan = NULL;
69c79d3c 175static bool arg_network_veth = false;
f6d6bad1 176static char **arg_network_veth_extra = NULL;
f757855e 177static char *arg_network_bridge = NULL;
22b28dfd 178static char *arg_network_zone = NULL;
d7bea6b6 179static char *arg_network_namespace_path = NULL;
050f7277 180static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 181static char *arg_image = NULL;
f757855e 182static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 183static ExposePort *arg_expose_ports = NULL;
f36933fe 184static char **arg_property = NULL;
0de7acce 185static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 186static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 187static bool arg_userns_chown = false;
c6c8f6e2 188static int arg_kill_signal = 0;
5da38d07 189static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
190static SettingsMask arg_settings_mask = 0;
191static int arg_settings_trusted = -1;
192static char **arg_parameters = NULL;
6aadfa4c 193static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 194static bool arg_notify_ready = false;
5a8ff0e6 195static bool arg_use_cgns = true;
0c582db0 196static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 197static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
198static void *arg_root_hash = NULL;
199static size_t arg_root_hash_size = 0;
960e4569
LP
200static char **arg_syscall_whitelist = NULL;
201static char **arg_syscall_blacklist = NULL;
bf428efb 202static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 203static bool arg_no_new_privileges = false;
81f345df
LP
204static int arg_oom_score_adjust = 0;
205static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
206static cpu_set_t *arg_cpuset = NULL;
207static unsigned arg_cpuset_ncpus = 0;
09d423e9 208static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
88213476 209
601185b4 210static void help(void) {
50ebcf6c
LP
211
212 (void) pager_open(false, false);
213
88213476
LP
214 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
215 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
216 " -h --help Show this help\n"
217 " --version Print version string\n"
69c79d3c 218 " -q --quiet Do not show status information\n"
1b9e5b12 219 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
220 " --template=PATH Initialize root directory from template directory,\n"
221 " if missing\n"
222 " -x --ephemeral Run container with snapshot of root directory, and\n"
223 " remove it after exit\n"
224 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 225 " --root-hash=HASH Specify verity root hash\n"
7732f92b 226 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 227 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 228 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
229 " --pivot-root=PATH[:PATH]\n"
230 " Pivot root to given directory in the container\n"
a8828ed9 231 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 232 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 233 " --hostname=NAME Override the hostname for the container\n"
69c79d3c 234 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 235 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 236 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 237 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 238 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 239 " Similar, but with user configured UID/GID range\n"
24597ee0 240 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
241 " --private-network Disable network in container\n"
242 " --network-interface=INTERFACE\n"
243 " Assign an existing network interface to the\n"
244 " container\n"
c74e630d
LP
245 " --network-macvlan=INTERFACE\n"
246 " Create a macvlan network interface based on an\n"
247 " existing network interface to the container\n"
4bbfe7ad
TG
248 " --network-ipvlan=INTERFACE\n"
249 " Create a ipvlan network interface based on an\n"
250 " existing network interface to the container\n"
a8eaaee7 251 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 252 " and container\n"
f6d6bad1
LP
253 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
254 " Add an additional virtual Ethernet link between\n"
255 " host and container\n"
ab046dde 256 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
257 " Add a virtual Ethernet connection to the container\n"
258 " and attach it to an existing bridge on the host\n"
259 " --network-zone=NAME Similar, but attach the new interface to an\n"
260 " an automatically managed bridge interface\n"
d7bea6b6
DP
261 " --network-namespace-path=PATH\n"
262 " Set network namespace to the one represented by\n"
263 " the specified kernel namespace file node\n"
6d0b55c2 264 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 265 " Expose a container IP port on the host\n"
82adf6af
LP
266 " -Z --selinux-context=SECLABEL\n"
267 " Set the SELinux security context to be used by\n"
268 " processes in the container\n"
269 " -L --selinux-apifs-context=SECLABEL\n"
270 " Set the SELinux security context to be used by\n"
271 " API/tmpfs file systems in the container\n"
a8828ed9
DW
272 " --capability=CAP In addition to the default, retain specified\n"
273 " capability\n"
274 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
275 " --system-call-filter=LIST|~LIST\n"
276 " Permit/prohibit specific system calls\n"
bf428efb 277 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
278 " --oom-score-adjust=VALUE\n"
279 " Adjust the OOM score value for the payload\n"
d107bb7d 280 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
c6c8f6e2 281 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
282 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
283 " host, try-guest, try-host\n"
574edc90 284 " -j Equivalent to --link-journal=try-guest\n"
09d423e9 285 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
69c79d3c 286 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
287 " --bind=PATH[:PATH[:OPTIONS]]\n"
288 " Bind mount a file or directory from the host into\n"
a8828ed9 289 " the container\n"
5e5bfa6e
EY
290 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
291 " Similar, but creates a read-only bind mount\n"
06c17c39 292 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
293 " --overlay=PATH[:PATH...]:PATH\n"
294 " Create an overlay mount from the host to \n"
295 " the container\n"
296 " --overlay-ro=PATH[:PATH...]:PATH\n"
297 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 298 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 299 " --register=BOOLEAN Register container as machine\n"
89f7c846 300 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 301 " the service unit nspawn is running in\n"
6d0b55c2 302 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 303 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 304 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 305 , program_invocation_short_name);
88213476
LP
306}
307
86c0dd4a 308static int custom_mount_check_all(void) {
88614c8a 309 size_t i;
5a8af538 310
5a8af538
LP
311 for (i = 0; i < arg_n_custom_mounts; i++) {
312 CustomMount *m = &arg_custom_mounts[i];
313
0de7acce 314 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
315
316 if (arg_userns_chown) {
317 log_error("--private-users-chown may not be combined with custom root mounts.");
318 return -EINVAL;
319 } else if (arg_uid_shift == UID_INVALID) {
320 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
321 return -EINVAL;
322 }
825d5287 323 }
5a8af538
LP
324 }
325
326 return 0;
327}
328
8199d554 329static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 330 const char *e;
415fc41c 331 int r;
5da38d07 332
efdb0237
LP
333 /* Allow the user to control whether the unified hierarchy is used */
334 e = getenv("UNIFIED_CGROUP_HIERARCHY");
335 if (e) {
336 r = parse_boolean(e);
337 if (r < 0)
338 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
339 if (r > 0)
340 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
341 else
342 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
343 }
344
8199d554
LP
345 return 0;
346}
347
348static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
349 int r;
350
351 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
352 * image actually supports. */
b4cccbc1
LP
353 r = cg_all_unified();
354 if (r < 0)
355 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
356 if (r > 0) {
a8725a06
ZJS
357 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
358 * routine only detects 231, so we'll have a false negative here for 230. */
359 r = systemd_installation_has_version(directory, 230);
360 if (r < 0)
361 return log_error_errno(r, "Failed to determine systemd version in container: %m");
362 if (r > 0)
363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
364 else
365 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 366 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
367 /* Mixed cgroup hierarchy support was added in 233 */
368 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
369 if (r < 0)
370 return log_error_errno(r, "Failed to determine systemd version in container: %m");
371 if (r > 0)
372 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
373 else
374 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
375 } else
5da38d07 376 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 377
8199d554
LP
378 log_debug("Using %s hierarchy for container.",
379 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
380 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
381
efdb0237
LP
382 return 0;
383}
384
0c582db0
LB
385static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
386 int r;
387
388 r = getenv_bool(name);
389 if (r == -ENXIO)
390 return;
391 if (r < 0)
392 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
393 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
394}
395
4f086aab
SU
396static void parse_mount_settings_env(void) {
397 int r;
398 const char *e;
399
400 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
401 if (!e)
402 return;
403
404 if (streq(e, "network")) {
405 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
406 return;
407 }
408
409 r = parse_boolean(e);
410 if (r < 0) {
411 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
412 return;
ab8ee0f2 413 }
4f086aab 414
ab8ee0f2
ZJS
415 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
416 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
417}
418
88213476
LP
419static int parse_argv(int argc, char *argv[]) {
420
a41fe3a2 421 enum {
acbeb427
ZJS
422 ARG_VERSION = 0x100,
423 ARG_PRIVATE_NETWORK,
bc2f673e 424 ARG_UUID,
5076f0cc 425 ARG_READ_ONLY,
57fb9fb5 426 ARG_CAPABILITY,
420c7379 427 ARG_DROP_CAPABILITY,
17fe0523
LP
428 ARG_LINK_JOURNAL,
429 ARG_BIND,
f4889f65 430 ARG_BIND_RO,
06c17c39 431 ARG_TMPFS,
5a8af538
LP
432 ARG_OVERLAY,
433 ARG_OVERLAY_RO,
eb91eb18 434 ARG_SHARE_SYSTEM,
89f7c846 435 ARG_REGISTER,
aa28aefe 436 ARG_KEEP_UNIT,
69c79d3c 437 ARG_NETWORK_INTERFACE,
c74e630d 438 ARG_NETWORK_MACVLAN,
4bbfe7ad 439 ARG_NETWORK_IPVLAN,
ab046dde 440 ARG_NETWORK_BRIDGE,
22b28dfd 441 ARG_NETWORK_ZONE,
f6d6bad1 442 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 443 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 444 ARG_PERSONALITY,
4d9f07b4 445 ARG_VOLATILE,
ec16945e 446 ARG_TEMPLATE,
f36933fe 447 ARG_PROPERTY,
6dac160c 448 ARG_PRIVATE_USERS,
c6c8f6e2 449 ARG_KILL_SIGNAL,
f757855e 450 ARG_SETTINGS,
5f932eb9 451 ARG_CHDIR,
b53ede69 452 ARG_PIVOT_ROOT,
7336138e 453 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 454 ARG_NOTIFY_READY,
4623e8e6 455 ARG_ROOT_HASH,
960e4569 456 ARG_SYSTEM_CALL_FILTER,
bf428efb 457 ARG_RLIMIT,
3a9530e5 458 ARG_HOSTNAME,
66edd963 459 ARG_NO_NEW_PRIVILEGES,
81f345df 460 ARG_OOM_SCORE_ADJUST,
d107bb7d 461 ARG_CPU_AFFINITY,
09d423e9 462 ARG_RESOLV_CONF,
a41fe3a2
LP
463 };
464
88213476 465 static const struct option options[] = {
d7bea6b6
DP
466 { "help", no_argument, NULL, 'h' },
467 { "version", no_argument, NULL, ARG_VERSION },
468 { "directory", required_argument, NULL, 'D' },
469 { "template", required_argument, NULL, ARG_TEMPLATE },
470 { "ephemeral", no_argument, NULL, 'x' },
471 { "user", required_argument, NULL, 'u' },
472 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
473 { "as-pid2", no_argument, NULL, 'a' },
474 { "boot", no_argument, NULL, 'b' },
475 { "uuid", required_argument, NULL, ARG_UUID },
476 { "read-only", no_argument, NULL, ARG_READ_ONLY },
477 { "capability", required_argument, NULL, ARG_CAPABILITY },
478 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 479 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
480 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
481 { "bind", required_argument, NULL, ARG_BIND },
482 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
483 { "tmpfs", required_argument, NULL, ARG_TMPFS },
484 { "overlay", required_argument, NULL, ARG_OVERLAY },
485 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
486 { "machine", required_argument, NULL, 'M' },
3a9530e5 487 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
488 { "slice", required_argument, NULL, 'S' },
489 { "setenv", required_argument, NULL, 'E' },
490 { "selinux-context", required_argument, NULL, 'Z' },
491 { "selinux-apifs-context", required_argument, NULL, 'L' },
492 { "quiet", no_argument, NULL, 'q' },
493 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
494 { "register", required_argument, NULL, ARG_REGISTER },
495 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
496 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
497 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
498 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
499 { "network-veth", no_argument, NULL, 'n' },
500 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
501 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
502 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
503 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
504 { "personality", required_argument, NULL, ARG_PERSONALITY },
505 { "image", required_argument, NULL, 'i' },
506 { "volatile", optional_argument, NULL, ARG_VOLATILE },
507 { "port", required_argument, NULL, 'p' },
508 { "property", required_argument, NULL, ARG_PROPERTY },
509 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
510 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
511 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
512 { "settings", required_argument, NULL, ARG_SETTINGS },
513 { "chdir", required_argument, NULL, ARG_CHDIR },
514 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
515 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
516 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
517 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 518 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 519 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 520 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 521 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
eb9da376 522 {}
88213476
LP
523 };
524
9444b1f2 525 int c, r;
6aadfa4c 526 const char *p, *e;
a42c8b54 527 uint64_t plus = 0, minus = 0;
f757855e 528 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
529
530 assert(argc >= 0);
531 assert(argv);
532
2e1f244e 533 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
534
535 switch (c) {
536
537 case 'h':
601185b4
ZJS
538 help();
539 return 0;
88213476 540
acbeb427 541 case ARG_VERSION:
3f6fd1ba 542 return version();
acbeb427 543
88213476 544 case 'D':
0f03c2a4 545 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 546 if (r < 0)
0f03c2a4 547 return r;
ec16945e
LP
548 break;
549
550 case ARG_TEMPLATE:
0f03c2a4 551 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 552 if (r < 0)
0f03c2a4 553 return r;
88213476
LP
554 break;
555
1b9e5b12 556 case 'i':
0f03c2a4 557 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 558 if (r < 0)
0f03c2a4 559 return r;
ec16945e
LP
560 break;
561
562 case 'x':
563 arg_ephemeral = true;
1b9e5b12
LP
564 break;
565
687d0825 566 case 'u':
2fc09a9c
DM
567 r = free_and_strdup(&arg_user, optarg);
568 if (r < 0)
7027ff61 569 return log_oom();
687d0825 570
f757855e 571 arg_settings_mask |= SETTING_USER;
687d0825
MV
572 break;
573
22b28dfd
LP
574 case ARG_NETWORK_ZONE: {
575 char *j;
576
577 j = strappend("vz-", optarg);
578 if (!j)
579 return log_oom();
580
581 if (!ifname_valid(j)) {
582 log_error("Network zone name not valid: %s", j);
583 free(j);
584 return -EINVAL;
585 }
586
587 free(arg_network_zone);
588 arg_network_zone = j;
589
590 arg_network_veth = true;
591 arg_private_network = true;
592 arg_settings_mask |= SETTING_NETWORK;
593 break;
594 }
595
ab046dde 596 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
597
598 if (!ifname_valid(optarg)) {
599 log_error("Bridge interface name not valid: %s", optarg);
600 return -EINVAL;
601 }
602
f757855e
LP
603 r = free_and_strdup(&arg_network_bridge, optarg);
604 if (r < 0)
605 return log_oom();
ab046dde 606
4831981d 607 _fallthrough_;
0dfaa006 608 case 'n':
69c79d3c
LP
609 arg_network_veth = true;
610 arg_private_network = true;
f757855e 611 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
612 break;
613
f6d6bad1
LP
614 case ARG_NETWORK_VETH_EXTRA:
615 r = veth_extra_parse(&arg_network_veth_extra, optarg);
616 if (r < 0)
617 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
618
619 arg_private_network = true;
620 arg_settings_mask |= SETTING_NETWORK;
621 break;
622
aa28aefe 623 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
624
625 if (!ifname_valid(optarg)) {
626 log_error("Network interface name not valid: %s", optarg);
627 return -EINVAL;
628 }
629
c74e630d
LP
630 if (strv_extend(&arg_network_interfaces, optarg) < 0)
631 return log_oom();
632
633 arg_private_network = true;
f757855e 634 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
635 break;
636
637 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
638
639 if (!ifname_valid(optarg)) {
640 log_error("MACVLAN network interface name not valid: %s", optarg);
641 return -EINVAL;
642 }
643
c74e630d 644 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
645 return log_oom();
646
4bbfe7ad 647 arg_private_network = true;
f757855e 648 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
649 break;
650
651 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
652
653 if (!ifname_valid(optarg)) {
654 log_error("IPVLAN network interface name not valid: %s", optarg);
655 return -EINVAL;
656 }
657
4bbfe7ad
TG
658 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
659 return log_oom();
660
4831981d 661 _fallthrough_;
ff01d048
LP
662 case ARG_PRIVATE_NETWORK:
663 arg_private_network = true;
f757855e 664 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
665 break;
666
d7bea6b6
DP
667 case ARG_NETWORK_NAMESPACE_PATH:
668 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
669 if (r < 0)
670 return r;
671
672 break;
673
0f0dbc46 674 case 'b':
7732f92b
LP
675 if (arg_start_mode == START_PID2) {
676 log_error("--boot and --as-pid2 may not be combined.");
677 return -EINVAL;
678 }
679
680 arg_start_mode = START_BOOT;
681 arg_settings_mask |= SETTING_START_MODE;
682 break;
683
684 case 'a':
685 if (arg_start_mode == START_BOOT) {
686 log_error("--boot and --as-pid2 may not be combined.");
687 return -EINVAL;
688 }
689
690 arg_start_mode = START_PID2;
691 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
692 break;
693
144f0fc0 694 case ARG_UUID:
9444b1f2 695 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
696 if (r < 0)
697 return log_error_errno(r, "Invalid UUID: %s", optarg);
698
699 if (sd_id128_is_null(arg_uuid)) {
700 log_error("Machine UUID may not be all zeroes.");
701 return -EINVAL;
aa96c6cb 702 }
f757855e
LP
703
704 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 705 break;
aa96c6cb 706
9444b1f2 707 case 'S':
c74e630d 708 arg_slice = optarg;
144f0fc0
LP
709 break;
710
7027ff61 711 case 'M':
c1521918 712 if (isempty(optarg))
97b11eed 713 arg_machine = mfree(arg_machine);
c1521918 714 else {
0c3c4284 715 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
716 log_error("Invalid machine name: %s", optarg);
717 return -EINVAL;
718 }
7027ff61 719
0c3c4284
LP
720 r = free_and_strdup(&arg_machine, optarg);
721 if (r < 0)
eb91eb18 722 return log_oom();
eb91eb18 723 }
9ce6d1b3 724 break;
7027ff61 725
3a9530e5
LP
726 case ARG_HOSTNAME:
727 if (isempty(optarg))
728 arg_hostname = mfree(arg_hostname);
729 else {
730 if (!hostname_is_valid(optarg, false)) {
731 log_error("Invalid hostname: %s", optarg);
732 return -EINVAL;
733 }
734
735 r = free_and_strdup(&arg_hostname, optarg);
736 if (r < 0)
737 return log_oom();
738 }
739
740 arg_settings_mask |= SETTING_HOSTNAME;
741 break;
742
82adf6af
LP
743 case 'Z':
744 arg_selinux_context = optarg;
a8828ed9
DW
745 break;
746
82adf6af
LP
747 case 'L':
748 arg_selinux_apifs_context = optarg;
a8828ed9
DW
749 break;
750
bc2f673e
LP
751 case ARG_READ_ONLY:
752 arg_read_only = true;
f757855e 753 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
754 break;
755
420c7379
LP
756 case ARG_CAPABILITY:
757 case ARG_DROP_CAPABILITY: {
6cbe4ed1 758 p = optarg;
9ed794a3 759 for (;;) {
6cbe4ed1 760 _cleanup_free_ char *t = NULL;
5076f0cc 761
6cbe4ed1
SS
762 r = extract_first_word(&p, &t, ",", 0);
763 if (r < 0)
764 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 765
6cbe4ed1
SS
766 if (r == 0)
767 break;
5076f0cc 768
39ed67d1
LP
769 if (streq(t, "all")) {
770 if (c == ARG_CAPABILITY)
a42c8b54 771 plus = (uint64_t) -1;
39ed67d1 772 else
a42c8b54 773 minus = (uint64_t) -1;
39ed67d1 774 } else {
2822da4f
LP
775 int cap;
776
777 cap = capability_from_name(t);
778 if (cap < 0) {
39ed67d1
LP
779 log_error("Failed to parse capability %s.", t);
780 return -EINVAL;
781 }
782
783 if (c == ARG_CAPABILITY)
a42c8b54 784 plus |= 1ULL << (uint64_t) cap;
39ed67d1 785 else
a42c8b54 786 minus |= 1ULL << (uint64_t) cap;
5076f0cc 787 }
5076f0cc
LP
788 }
789
f757855e 790 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
791 break;
792 }
793
66edd963
LP
794 case ARG_NO_NEW_PRIVILEGES:
795 r = parse_boolean(optarg);
796 if (r < 0)
797 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
798
799 arg_no_new_privileges = r;
800 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
801 break;
802
57fb9fb5
LP
803 case 'j':
804 arg_link_journal = LINK_GUEST;
574edc90 805 arg_link_journal_try = true;
4e1d6aa9 806 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
807 break;
808
809 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
810 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
811 if (r < 0) {
812 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
813 return -EINVAL;
814 }
815
4e1d6aa9 816 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
817 break;
818
17fe0523 819 case ARG_BIND:
f757855e
LP
820 case ARG_BIND_RO:
821 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
822 if (r < 0)
823 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 824
f757855e 825 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 826 break;
06c17c39 827
f757855e
LP
828 case ARG_TMPFS:
829 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
830 if (r < 0)
831 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 832
f757855e 833 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 834 break;
5a8af538
LP
835
836 case ARG_OVERLAY:
ad85779a
LP
837 case ARG_OVERLAY_RO:
838 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
839 if (r == -EADDRNOTAVAIL)
840 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
841 if (r < 0)
842 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 843
f757855e 844 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 845 break;
06c17c39 846
a5f1cb3b 847 case 'E': {
f4889f65
LP
848 char **n;
849
850 if (!env_assignment_is_valid(optarg)) {
851 log_error("Environment variable assignment '%s' is not valid.", optarg);
852 return -EINVAL;
853 }
854
855 n = strv_env_set(arg_setenv, optarg);
856 if (!n)
857 return log_oom();
858
130d3d22 859 strv_free_and_replace(arg_setenv, n);
f757855e 860 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
861 break;
862 }
863
284c0b91
LP
864 case 'q':
865 arg_quiet = true;
866 break;
867
8a96d94e 868 case ARG_SHARE_SYSTEM:
a6b5216c 869 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 870 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 871 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 872 arg_clone_ns_flags = 0;
8a96d94e
LP
873 break;
874
eb91eb18
LP
875 case ARG_REGISTER:
876 r = parse_boolean(optarg);
877 if (r < 0) {
878 log_error("Failed to parse --register= argument: %s", optarg);
879 return r;
880 }
881
882 arg_register = r;
883 break;
884
89f7c846
LP
885 case ARG_KEEP_UNIT:
886 arg_keep_unit = true;
887 break;
888
6afc95b7
LP
889 case ARG_PERSONALITY:
890
ac45f971 891 arg_personality = personality_from_string(optarg);
050f7277 892 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
893 log_error("Unknown or unsupported personality '%s'.", optarg);
894 return -EINVAL;
895 }
896
f757855e 897 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
898 break;
899
4d9f07b4
LP
900 case ARG_VOLATILE:
901
902 if (!optarg)
f757855e 903 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
904 else if (streq(optarg, "help")) {
905 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
906 return 0;
907 } else {
f757855e 908 VolatileMode m;
4d9f07b4 909
f757855e
LP
910 m = volatile_mode_from_string(optarg);
911 if (m < 0) {
912 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 913 return -EINVAL;
f757855e
LP
914 } else
915 arg_volatile_mode = m;
6d0b55c2
LP
916 }
917
f757855e
LP
918 arg_settings_mask |= SETTING_VOLATILE_MODE;
919 break;
6d0b55c2 920
f757855e
LP
921 case 'p':
922 r = expose_port_parse(&arg_expose_ports, optarg);
923 if (r == -EEXIST)
924 return log_error_errno(r, "Duplicate port specification: %s", optarg);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 927
f757855e 928 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 929 break;
6d0b55c2 930
f36933fe
LP
931 case ARG_PROPERTY:
932 if (strv_extend(&arg_property, optarg) < 0)
933 return log_oom();
934
935 break;
936
ae209204
ZJS
937 case ARG_PRIVATE_USERS: {
938 int boolean = -1;
0de7acce 939
ae209204
ZJS
940 if (!optarg)
941 boolean = true;
942 else if (!in_charset(optarg, DIGITS))
943 /* do *not* parse numbers as booleans */
944 boolean = parse_boolean(optarg);
945
946 if (boolean == false) {
0de7acce
LP
947 /* no: User namespacing off */
948 arg_userns_mode = USER_NAMESPACE_NO;
949 arg_uid_shift = UID_INVALID;
950 arg_uid_range = UINT32_C(0x10000);
ae209204 951 } else if (boolean == true) {
0de7acce
LP
952 /* yes: User namespacing on, UID range is read from root dir */
953 arg_userns_mode = USER_NAMESPACE_FIXED;
954 arg_uid_shift = UID_INVALID;
955 arg_uid_range = UINT32_C(0x10000);
956 } else if (streq(optarg, "pick")) {
957 /* pick: User namespacing on, UID range is picked randomly */
958 arg_userns_mode = USER_NAMESPACE_PICK;
959 arg_uid_shift = UID_INVALID;
960 arg_uid_range = UINT32_C(0x10000);
961 } else {
6c2058b3 962 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
963 const char *range, *shift;
964
0de7acce
LP
965 /* anything else: User namespacing on, UID range is explicitly configured */
966
6dac160c
LP
967 range = strchr(optarg, ':');
968 if (range) {
6c2058b3
ZJS
969 buffer = strndup(optarg, range - optarg);
970 if (!buffer)
971 return log_oom();
972 shift = buffer;
6dac160c
LP
973
974 range++;
bfd292ec
ZJS
975 r = safe_atou32(range, &arg_uid_range);
976 if (r < 0)
be715731 977 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
978 } else
979 shift = optarg;
980
be715731
ZJS
981 r = parse_uid(shift, &arg_uid_shift);
982 if (r < 0)
983 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
984
985 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
986 }
987
be715731
ZJS
988 if (arg_uid_range <= 0) {
989 log_error("UID range cannot be 0.");
990 return -EINVAL;
991 }
992
0de7acce 993 arg_settings_mask |= SETTING_USERNS;
6dac160c 994 break;
ae209204 995 }
6dac160c 996
0de7acce 997 case 'U':
ccabee0d
LP
998 if (userns_supported()) {
999 arg_userns_mode = USER_NAMESPACE_PICK;
1000 arg_uid_shift = UID_INVALID;
1001 arg_uid_range = UINT32_C(0x10000);
1002
1003 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1004 }
1005
7336138e
LP
1006 break;
1007
0de7acce 1008 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1009 arg_userns_chown = true;
0de7acce
LP
1010
1011 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1012 break;
1013
c6c8f6e2 1014 case ARG_KILL_SIGNAL:
5c828e66
LP
1015 if (streq(optarg, "help")) {
1016 DUMP_STRING_TABLE(signal, int, _NSIG);
1017 return 0;
1018 }
1019
29a3db75 1020 arg_kill_signal = signal_from_string(optarg);
c6c8f6e2
LP
1021 if (arg_kill_signal < 0) {
1022 log_error("Cannot parse signal: %s", optarg);
1023 return -EINVAL;
1024 }
1025
f757855e
LP
1026 arg_settings_mask |= SETTING_KILL_SIGNAL;
1027 break;
1028
1029 case ARG_SETTINGS:
1030
1031 /* no → do not read files
1032 * yes → read files, do not override cmdline, trust only subset
1033 * override → read files, override cmdline, trust only subset
1034 * trusted → read files, do not override cmdline, trust all
1035 */
1036
1037 r = parse_boolean(optarg);
1038 if (r < 0) {
1039 if (streq(optarg, "trusted")) {
1040 mask_all_settings = false;
1041 mask_no_settings = false;
1042 arg_settings_trusted = true;
1043
1044 } else if (streq(optarg, "override")) {
1045 mask_all_settings = false;
1046 mask_no_settings = true;
1047 arg_settings_trusted = -1;
1048 } else
1049 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1050 } else if (r > 0) {
1051 /* yes */
1052 mask_all_settings = false;
1053 mask_no_settings = false;
1054 arg_settings_trusted = -1;
1055 } else {
1056 /* no */
1057 mask_all_settings = true;
1058 mask_no_settings = false;
1059 arg_settings_trusted = false;
1060 }
1061
c6c8f6e2
LP
1062 break;
1063
5f932eb9
LP
1064 case ARG_CHDIR:
1065 if (!path_is_absolute(optarg)) {
1066 log_error("Working directory %s is not an absolute path.", optarg);
1067 return -EINVAL;
1068 }
1069
1070 r = free_and_strdup(&arg_chdir, optarg);
1071 if (r < 0)
1072 return log_oom();
1073
1074 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1075 break;
1076
b53ede69
PW
1077 case ARG_PIVOT_ROOT:
1078 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1079 if (r < 0)
1080 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1081
1082 arg_settings_mask |= SETTING_PIVOT_ROOT;
1083 break;
1084
9c1e04d0
AP
1085 case ARG_NOTIFY_READY:
1086 r = parse_boolean(optarg);
1087 if (r < 0) {
1088 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1089 return -EINVAL;
1090 }
1091 arg_notify_ready = r;
1092 arg_settings_mask |= SETTING_NOTIFY_READY;
1093 break;
1094
4623e8e6
LP
1095 case ARG_ROOT_HASH: {
1096 void *k;
1097 size_t l;
1098
1099 r = unhexmem(optarg, strlen(optarg), &k, &l);
1100 if (r < 0)
1101 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1102 if (l < sizeof(sd_id128_t)) {
1103 log_error("Root hash must be at least 128bit long: %s", optarg);
1104 free(k);
1105 return -EINVAL;
1106 }
1107
1108 free(arg_root_hash);
1109 arg_root_hash = k;
1110 arg_root_hash_size = l;
1111 break;
1112 }
1113
960e4569
LP
1114 case ARG_SYSTEM_CALL_FILTER: {
1115 bool negative;
1116 const char *items;
1117
1118 negative = optarg[0] == '~';
1119 items = negative ? optarg + 1 : optarg;
1120
1121 for (;;) {
1122 _cleanup_free_ char *word = NULL;
1123
1124 r = extract_first_word(&items, &word, NULL, 0);
1125 if (r == 0)
1126 break;
1127 if (r == -ENOMEM)
1128 return log_oom();
1129 if (r < 0)
1130 return log_error_errno(r, "Failed to parse system call filter: %m");
1131
1132 if (negative)
1133 r = strv_extend(&arg_syscall_blacklist, word);
1134 else
1135 r = strv_extend(&arg_syscall_whitelist, word);
1136 if (r < 0)
1137 return log_oom();
1138 }
1139
1140 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1141 break;
1142 }
1143
bf428efb
LP
1144 case ARG_RLIMIT: {
1145 const char *eq;
1146 char *name;
1147 int rl;
1148
5c828e66
LP
1149 if (streq(optarg, "help")) {
1150 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1151 return 0;
1152 }
1153
bf428efb
LP
1154 eq = strchr(optarg, '=');
1155 if (!eq) {
1156 log_error("--rlimit= expects an '=' assignment.");
1157 return -EINVAL;
1158 }
1159
1160 name = strndup(optarg, eq - optarg);
1161 if (!name)
1162 return log_oom();
1163
1164 rl = rlimit_from_string_harder(name);
1165 if (rl < 0) {
1166 log_error("Unknown resource limit: %s", name);
1167 return -EINVAL;
1168 }
1169
1170 if (!arg_rlimit[rl]) {
1171 arg_rlimit[rl] = new0(struct rlimit, 1);
1172 if (!arg_rlimit[rl])
1173 return log_oom();
1174 }
1175
1176 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1177 if (r < 0)
1178 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1179
1180 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1181 break;
1182 }
1183
81f345df
LP
1184 case ARG_OOM_SCORE_ADJUST:
1185 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1186 if (r < 0)
1187 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1188
1189 arg_oom_score_adjust_set = true;
1190 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1191 break;
1192
d107bb7d
LP
1193 case ARG_CPU_AFFINITY: {
1194 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1195
1196 r = parse_cpu_set(optarg, &cpuset);
1197 if (r < 0)
1198 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1199
1200 if (arg_cpuset)
1201 CPU_FREE(arg_cpuset);
1202
1203 arg_cpuset = TAKE_PTR(cpuset);
1204 arg_cpuset_ncpus = r;
1205 arg_settings_mask |= SETTING_CPU_AFFINITY;
1206 break;
1207 }
1208
09d423e9
LP
1209 case ARG_RESOLV_CONF:
1210 if (streq(optarg, "help")) {
1211 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1212 return 0;
1213 }
1214
1215 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1216 if (arg_resolv_conf < 0) {
1217 log_error("Failed to parse /etc/resolv.conf mode: %s", optarg);
1218 return -EINVAL;
1219 }
1220
1221 arg_settings_mask |= SETTING_RESOLV_CONF;
1222 break;
1223
88213476
LP
1224 case '?':
1225 return -EINVAL;
1226
1227 default:
eb9da376 1228 assert_not_reached("Unhandled option");
88213476 1229 }
88213476 1230
d7bea6b6
DP
1231 /* If --network-namespace-path is given with any other network-related option,
1232 * we need to error out, to avoid conflicts between different network options. */
1233 if (arg_network_namespace_path &&
1234 (arg_network_interfaces || arg_network_macvlan ||
1235 arg_network_ipvlan || arg_network_veth_extra ||
1236 arg_network_bridge || arg_network_zone ||
1237 arg_network_veth || arg_private_network)) {
1238 log_error("--network-namespace-path cannot be combined with other network options.");
1239 return -EINVAL;
1240 }
1241
0c582db0
LB
1242 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1243 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1244 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1245 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1246
4f086aab
SU
1247 if (arg_userns_mode != USER_NAMESPACE_NO)
1248 arg_mount_settings |= MOUNT_USE_USERNS;
1249
1250 if (arg_private_network)
1251 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1252
1253 parse_mount_settings_env();
1254
48a8d337
LB
1255 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1256 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1257 arg_register = false;
0c582db0
LB
1258 if (arg_start_mode != START_PID1) {
1259 log_error("--boot cannot be used without namespacing.");
1260 return -EINVAL;
1261 }
1262 }
eb91eb18 1263
0de7acce 1264 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1265 arg_userns_chown = true;
1266
cd2dfc6f 1267 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
8d9c2bca
AJ
1268 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1269 * The latter is not technically a user session, but we don't need to labour the point. */
cd2dfc6f 1270 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846
LP
1271 return -EINVAL;
1272 }
1273
1b9e5b12
LP
1274 if (arg_directory && arg_image) {
1275 log_error("--directory= and --image= may not be combined.");
1276 return -EINVAL;
1277 }
1278
ec16945e
LP
1279 if (arg_template && arg_image) {
1280 log_error("--template= and --image= may not be combined.");
1281 return -EINVAL;
1282 }
1283
8cd328d8
LP
1284 if (arg_ephemeral && arg_template && !arg_directory) {
1285 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1286 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1287 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1288 * --directory=". */
1289
ae2a15bc 1290 arg_directory = TAKE_PTR(arg_template);
8cd328d8
LP
1291 }
1292
ec16945e
LP
1293 if (arg_template && !(arg_directory || arg_machine)) {
1294 log_error("--template= needs --directory= or --machine=.");
1295 return -EINVAL;
1296 }
1297
1298 if (arg_ephemeral && arg_template) {
1299 log_error("--ephemeral and --template= may not be combined.");
1300 return -EINVAL;
1301 }
1302
df9a75e4
LP
1303 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1304 log_error("--ephemeral and --link-journal= may not be combined.");
1305 return -EINVAL;
1306 }
1307
ccabee0d 1308 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1309 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1310 return -EOPNOTSUPP;
1311 }
1312
1313 if (arg_userns_chown && arg_read_only) {
1314 log_error("--read-only and --private-users-chown may not be combined.");
1315 return -EINVAL;
1316 }
f757855e 1317
22b28dfd
LP
1318 if (arg_network_bridge && arg_network_zone) {
1319 log_error("--network-bridge= and --network-zone= may not be combined.");
1320 return -EINVAL;
1321 }
1322
f757855e
LP
1323 if (argc > optind) {
1324 arg_parameters = strv_copy(argv + optind);
1325 if (!arg_parameters)
1326 return log_oom();
1327
7732f92b 1328 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1329 }
1330
1331 /* Load all settings from .nspawn files */
1332 if (mask_no_settings)
1333 arg_settings_mask = 0;
1334
1335 /* Don't load any settings from .nspawn files */
1336 if (mask_all_settings)
1337 arg_settings_mask = _SETTINGS_MASK_ALL;
1338
520e0d54 1339 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1340
399e391f
ZJS
1341 r = cg_unified_flush();
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1344
6aadfa4c
ILG
1345 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1346 if (e)
1347 arg_container_service_name = e;
1348
5a8ff0e6
CB
1349 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1350 if (r < 0)
1351 arg_use_cgns = cg_ns_supported();
1352 else
1353 arg_use_cgns = r;
1354
86c0dd4a
LP
1355 r = custom_mount_check_all();
1356 if (r < 0)
1357 return r;
1358
f757855e
LP
1359 return 1;
1360}
1361
1362static int verify_arguments(void) {
4f086aab
SU
1363 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1364 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1365 return -EINVAL;
1366 }
1367
1368 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1369 log_error("Cannot combine --private-users with read-write mounts.");
1370 return -EINVAL;
1371 }
f757855e
LP
1372
1373 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1374 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1375 return -EINVAL;
1376 }
1377
6d0b55c2
LP
1378 if (arg_expose_ports && !arg_private_network) {
1379 log_error("Cannot use --port= without private networking.");
1380 return -EINVAL;
1381 }
1382
349cc4a5 1383#if ! HAVE_LIBIPTC
1c1ea217
EV
1384 if (arg_expose_ports) {
1385 log_error("--port= is not supported, compiled without libiptc support.");
1386 return -EOPNOTSUPP;
1387 }
1388#endif
1389
7732f92b 1390 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1391 arg_kill_signal = SIGRTMIN+3;
1392
f757855e 1393 return 0;
88213476
LP
1394}
1395
03cfe0d5
LP
1396static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1397 assert(p);
1398
0de7acce 1399 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1400 return 0;
1401
1402 if (uid == UID_INVALID && gid == GID_INVALID)
1403 return 0;
1404
1405 if (uid != UID_INVALID) {
1406 uid += arg_uid_shift;
1407
1408 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1409 return -EOVERFLOW;
1410 }
1411
1412 if (gid != GID_INVALID) {
1413 gid += (gid_t) arg_uid_shift;
1414
1415 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1416 return -EOVERFLOW;
1417 }
1418
1419 if (lchown(p, uid, gid) < 0)
1420 return -errno;
b12afc8c
LP
1421
1422 return 0;
1423}
1424
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the specified access mode and, if it was newly created,
 * hands it to the (shifted) uid/gid via userns_lchown(). A directory that already exists is left alone and
 * treated as success. Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        /* Already there? Then don't touch ownership, and don't complain. */
        return r == -EEXIST ? 0 : r;
}
1438
e58a1277 1439static int setup_timezone(const char *dest) {
03cfe0d5
LP
1440 _cleanup_free_ char *p = NULL, *q = NULL;
1441 const char *where, *check, *what;
d4036145
LP
1442 char *z, *y;
1443 int r;
f8440af5 1444
e58a1277
LP
1445 assert(dest);
1446
1447 /* Fix the timezone, if possible */
d4036145
LP
1448 r = readlink_malloc("/etc/localtime", &p);
1449 if (r < 0) {
0b493a02
MP
1450 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1451 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1452 * with a symbolic link to a time zone data file.
0b493a02
MP
1453 *
1454 * Example:
21dc0227 1455 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1456 */
d4036145
LP
1457 return 0;
1458 }
1459
1460 z = path_startswith(p, "../usr/share/zoneinfo/");
1461 if (!z)
1462 z = path_startswith(p, "/usr/share/zoneinfo/");
1463 if (!z) {
1464 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1465 return 0;
1466 }
1467
03cfe0d5 1468 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1469 r = readlink_malloc(where, &q);
1470 if (r >= 0) {
1471 y = path_startswith(q, "../usr/share/zoneinfo/");
1472 if (!y)
1473 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1474
d4036145
LP
1475 /* Already pointing to the right place? Then do nothing .. */
1476 if (y && streq(y, z))
1477 return 0;
1478 }
1479
03cfe0d5 1480 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1481 check = prefix_roota(dest, check);
03cfe0d5 1482 if (laccess(check, F_OK) < 0) {
d4036145
LP
1483 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1484 return 0;
1485 }
68fb0892 1486
8ccf7e9e
LP
1487 if (unlink(where) < 0 && errno != ENOENT) {
1488 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1489 errno,
1490 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1491 return 0;
1492 }
4d9f07b4 1493
03cfe0d5 1494 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1495 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1496 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1497 errno,
1498 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1499 return 0;
1500 }
e58a1277 1501
03cfe0d5
LP
1502 r = userns_lchown(where, 0, 0);
1503 if (r < 0)
1504 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1505
e58a1277 1506 return 0;
88213476
LP
1507}
1508
09d423e9
LP
/* Checks whether a file exists at 'path'. Returns 1 if it does, 0 if access() reports ENOENT, and a negative
 * errno-style value (logged at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1521
7357272e 1522static int resolved_listening(void) {
b8ea7a6e 1523 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1524 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1525 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1526 int r;
1527
7357272e 1528 /* Check if resolved is listening */
b053cd5f
LP
1529
1530 r = sd_bus_open_system(&bus);
1531 if (r < 0)
b8ea7a6e 1532 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1533
7357272e 1534 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1535 if (r < 0)
1536 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1537 if (r == 0)
1538 return 0;
7357272e
DM
1539
1540 r = sd_bus_get_property_string(bus,
1541 "org.freedesktop.resolve1",
1542 "/org/freedesktop/resolve1",
1543 "org.freedesktop.resolve1.Manager",
1544 "DNSStubListener",
b8ea7a6e 1545 &error,
7357272e
DM
1546 &dns_stub_listener_mode);
1547 if (r < 0)
b8ea7a6e 1548 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1549
1550 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1551}
1552
2547bb41 1553static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1554 _cleanup_free_ char *etc = NULL;
1555 const char *where, *what;
1556 ResolvConfMode m;
1557 int r;
2547bb41
LP
1558
1559 assert(dest);
1560
09d423e9
LP
1561 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1562 if (arg_private_network)
1563 m = RESOLV_CONF_OFF;
1564 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1565 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1566 * container, so that the container can use the host's resolver. Given that network namespacing is
1567 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1568 * advantage that the container will be able to follow the host's DNS server configuration changes
1569 * transparently. */
1570 m = RESOLV_CONF_BIND_STATIC;
1571 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1572 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
1573 else
1574 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
1575 } else
1576 m = arg_resolv_conf;
1577
1578 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1579 return 0;
1580
87447ae4
LP
1581 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1582 if (r < 0) {
1583 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1584 return 0;
1585 }
1586
1587 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1588
1589 if (m == RESOLV_CONF_DELETE) {
1590 if (unlink(where) < 0)
1591 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1592
87447ae4
LP
1593 return 0;
1594 }
79d80fc1 1595
09d423e9
LP
1596 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1597 what = STATIC_RESOLV_CONF;
1598 else
1599 what = "/etc/resolv.conf";
87447ae4 1600
09d423e9
LP
1601 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1602 _cleanup_free_ char *resolved = NULL;
1603 int found;
1604
1605 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1606 if (found < 0) {
1607 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1608 return 0;
1609 }
3539724c 1610
87447ae4
LP
1611 if (found == 0) /* missing? */
1612 (void) touch(resolved);
5367354d 1613
09d423e9 1614 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1615 if (r >= 0)
87447ae4 1616 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1617 }
1618
1619 /* If that didn't work, let's copy the file */
09d423e9 1620 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1621 if (r < 0) {
3539724c
LP
1622 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1623 * resolved or something similar runs inside and the symlink points there.
68a313c5 1624 *
3539724c 1625 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1626 */
09d423e9 1627 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1628 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1629 return 0;
1630 }
2547bb41 1631
03cfe0d5
LP
1632 r = userns_lchown(where, 0, 0);
1633 if (r < 0)
3539724c 1634 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1635
2547bb41
LP
1636 return 0;
1637}
1638
1e4f1671 1639static int setup_boot_id(void) {
cdde6ba6
LP
1640 _cleanup_(unlink_and_freep) char *from = NULL;
1641 _cleanup_free_ char *path = NULL;
3bbaff3e 1642 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1643 const char *to;
04bc4a3f
LP
1644 int r;
1645
04bc4a3f
LP
1646 /* Generate a new randomized boot ID, so that each boot-up of
1647 * the container gets a new one */
1648
cdde6ba6
LP
1649 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1650 if (r < 0)
1651 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1652
1653 r = sd_id128_randomize(&rnd);
f647962d
MS
1654 if (r < 0)
1655 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1656
cdde6ba6 1657 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1658 if (r < 0)
1659 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1660
cdde6ba6
LP
1661 from = TAKE_PTR(path);
1662 to = "/proc/sys/kernel/random/boot_id";
1663
60e76d48 1664 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1665 if (r < 0)
1666 return r;
04bc4a3f 1667
cdde6ba6 1668 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1669}
1670
e58a1277 1671static int copy_devnodes(const char *dest) {
88213476
LP
1672
1673 static const char devnodes[] =
1674 "null\0"
1675 "zero\0"
1676 "full\0"
1677 "random\0"
1678 "urandom\0"
85614d66
TG
1679 "tty\0"
1680 "net/tun\0";
88213476
LP
1681
1682 const char *d;
e58a1277 1683 int r = 0;
7fd1b19b 1684 _cleanup_umask_ mode_t u;
a258bf26
LP
1685
1686 assert(dest);
124640f1
LP
1687
1688 u = umask(0000);
88213476 1689
03cfe0d5
LP
1690 /* Create /dev/net, so that we can create /dev/net/tun in it */
1691 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1692 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1693
88213476 1694 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1695 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1696 struct stat st;
88213476 1697
7f112f50 1698 from = strappend("/dev/", d);
03cfe0d5 1699 to = prefix_root(dest, from);
88213476
LP
1700
1701 if (stat(from, &st) < 0) {
1702
4a62c710
MS
1703 if (errno != ENOENT)
1704 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1705
a258bf26 1706 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1707
03cfe0d5 1708 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1709 return -EIO;
a258bf26 1710
85614d66 1711 } else {
81f5049b 1712 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1713 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1714 if (errno == EEXIST)
8dbf71ec 1715 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1716 if (errno != EPERM)
1717 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1718
1719 /* Some systems abusively restrict mknod but
1720 * allow bind mounts. */
1721 r = touch(to);
1722 if (r < 0)
1723 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1724 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1725 if (r < 0)
1726 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1727 }
6278cf60 1728
03cfe0d5
LP
1729 r = userns_lchown(to, 0, 0);
1730 if (r < 0)
1731 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1732 }
88213476
LP
1733 }
1734
e58a1277
LP
1735 return r;
1736}
88213476 1737
03cfe0d5
LP
1738static int setup_pts(const char *dest) {
1739 _cleanup_free_ char *options = NULL;
1740 const char *p;
709f6e46 1741 int r;
03cfe0d5 1742
349cc4a5 1743#if HAVE_SELINUX
03cfe0d5
LP
1744 if (arg_selinux_apifs_context)
1745 (void) asprintf(&options,
3dce8915 1746 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1747 arg_uid_shift + TTY_GID,
1748 arg_selinux_apifs_context);
1749 else
1750#endif
1751 (void) asprintf(&options,
3dce8915 1752 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1753 arg_uid_shift + TTY_GID);
f2d88580 1754
03cfe0d5 1755 if (!options)
f2d88580
LP
1756 return log_oom();
1757
03cfe0d5 1758 /* Mount /dev/pts itself */
cc9fce65 1759 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1760 r = mkdir_errno_wrapper(p, 0755);
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to create /dev/pts: %m");
1763
60e76d48
ZJS
1764 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1765 if (r < 0)
1766 return r;
709f6e46
MS
1767 r = userns_lchown(p, 0, 0);
1768 if (r < 0)
1769 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1770
1771 /* Create /dev/ptmx symlink */
1772 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1773 if (symlink("pts/ptmx", p) < 0)
1774 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1775 r = userns_lchown(p, 0, 0);
1776 if (r < 0)
1777 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1778
03cfe0d5
LP
1779 /* And fix /dev/pts/ptmx ownership */
1780 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1781 r = userns_lchown(p, 0, 0);
1782 if (r < 0)
1783 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1784
f2d88580
LP
1785 return 0;
1786}
1787
e58a1277 1788static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1789 _cleanup_umask_ mode_t u;
1790 const char *to;
e58a1277 1791 int r;
e58a1277
LP
1792
1793 assert(dest);
1794 assert(console);
1795
1796 u = umask(0000);
1797
03cfe0d5 1798 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1799 if (r < 0)
1800 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1801
a258bf26
LP
1802 /* We need to bind mount the right tty to /dev/console since
1803 * ptys can only exist on pts file systems. To have something
81f5049b 1804 * to bind mount things on we create a empty regular file. */
a258bf26 1805
03cfe0d5 1806 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1807 r = touch(to);
1808 if (r < 0)
1809 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1810
60e76d48 1811 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1812}
1813
8e5430c4
LP
1814static int setup_keyring(void) {
1815 key_serial_t keyring;
1816
1817 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1818 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1819 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1820 * these system calls let's make sure we don't leak anything into the container. */
1821
1822 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1823 if (keyring == -1) {
1824 if (errno == ENOSYS)
1825 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1826 else if (IN_SET(errno, EACCES, EPERM))
1827 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1828 else
1829 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1830 }
1831
1832 return 0;
1833}
1834
1e4f1671 1835static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
1836 _cleanup_(unlink_and_freep) char *from = NULL;
1837 _cleanup_free_ char *fifo = NULL;
1838 _cleanup_close_ int fd = -1;
7fd1b19b 1839 _cleanup_umask_ mode_t u;
9ec5a93c
LP
1840 const char *to;
1841 int r;
e58a1277 1842
e58a1277 1843 assert(kmsg_socket >= 0);
a258bf26 1844
e58a1277 1845 u = umask(0000);
a258bf26 1846
9ec5a93c
LP
1847 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1848 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1849 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1850 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1851
1852 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1853 if (r < 0)
1854 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 1855
9ec5a93c 1856 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 1857 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
1858
1859 from = TAKE_PTR(fifo);
1860 to = "/proc/kmsg";
1861
60e76d48
ZJS
1862 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1863 if (r < 0)
1864 return r;
e58a1277
LP
1865
1866 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1867 if (fd < 0)
1868 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1869
9ec5a93c 1870 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 1871 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
1872 if (r < 0)
1873 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1874
25ea79fe 1875 return 0;
88213476
LP
1876}
1877
1c4baffc 1878static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1879 union in_addr_union *exposed = userdata;
1880
1881 assert(rtnl);
1882 assert(m);
1883 assert(exposed);
1884
7a8f6325 1885 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1886 return 0;
1887}
1888
3a74cea5 1889static int setup_hostname(void) {
c818eef1 1890 int r;
3a74cea5 1891
0c582db0 1892 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1893 return 0;
1894
c818eef1
LP
1895 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1896 if (r < 0)
1897 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 1898
7027ff61 1899 return 0;
3a74cea5
LP
1900}
1901
57fb9fb5 1902static int setup_journal(const char *directory) {
e01ff70a 1903 sd_id128_t this_id;
0f5e1382 1904 _cleanup_free_ char *d = NULL;
e01ff70a 1905 const char *p, *q;
8054d749 1906 bool try;
e01ff70a 1907 char id[33];
57fb9fb5
LP
1908 int r;
1909
df9a75e4
LP
1910 /* Don't link journals in ephemeral mode */
1911 if (arg_ephemeral)
1912 return 0;
1913
8054d749
LP
1914 if (arg_link_journal == LINK_NO)
1915 return 0;
1916
1917 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1918
4d680aee 1919 r = sd_id128_get_machine(&this_id);
f647962d
MS
1920 if (r < 0)
1921 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1922
e01ff70a 1923 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1924 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1925 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1926 if (try)
4d680aee 1927 return 0;
df9a75e4 1928 return -EEXIST;
4d680aee
ZJS
1929 }
1930
03cfe0d5
LP
1931 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1932 if (r < 0)
1933 return log_error_errno(r, "Failed to create /var: %m");
1934
1935 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1936 if (r < 0)
1937 return log_error_errno(r, "Failed to create /var/log: %m");
1938
1939 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1940 if (r < 0)
1941 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1942
e01ff70a
MS
1943 (void) sd_id128_to_string(arg_uuid, id);
1944
03cfe0d5
LP
1945 p = strjoina("/var/log/journal/", id);
1946 q = prefix_roota(directory, p);
27407a01 1947
e1873695 1948 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1949 if (try)
1950 return 0;
27407a01 1951
8054d749
LP
1952 log_error("%s: already a mount point, refusing to use for journal", p);
1953 return -EEXIST;
57fb9fb5
LP
1954 }
1955
e1873695 1956 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1957 if (try)
1958 return 0;
57fb9fb5 1959
8054d749
LP
1960 log_error("%s: already a mount point, refusing to use for journal", q);
1961 return -EEXIST;
57fb9fb5
LP
1962 }
1963
1964 r = readlink_and_make_absolute(p, &d);
1965 if (r >= 0) {
3742095b 1966 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
1967 path_equal(d, q)) {
1968
03cfe0d5 1969 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1970 if (r < 0)
709f6e46 1971 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1972 return 0;
57fb9fb5
LP
1973 }
1974
4a62c710
MS
1975 if (unlink(p) < 0)
1976 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1977 } else if (r == -EINVAL) {
1978
1979 if (arg_link_journal == LINK_GUEST &&
1980 rmdir(p) < 0) {
1981
27407a01
ZJS
1982 if (errno == ENOTDIR) {
1983 log_error("%s already exists and is neither a symlink nor a directory", p);
1984 return r;
4314d33f
MS
1985 } else
1986 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1987 }
4314d33f
MS
1988 } else if (r != -ENOENT)
1989 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1990
1991 if (arg_link_journal == LINK_GUEST) {
1992
1993 if (symlink(q, p) < 0) {
8054d749 1994 if (try) {
56f64d95 1995 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1996 return 0;
4314d33f
MS
1997 } else
1998 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1999 }
2000
03cfe0d5 2001 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2002 if (r < 0)
709f6e46 2003 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2004 return 0;
57fb9fb5
LP
2005 }
2006
2007 if (arg_link_journal == LINK_HOST) {
ccddd104 2008 /* don't create parents here — if the host doesn't have
574edc90 2009 * permanent journal set up, don't force it here */
ba8e6c4d 2010
dae8b82e
ZJS
2011 r = mkdir_errno_wrapper(p, 0755);
2012 if (r < 0 && r != -EEXIST) {
8054d749 2013 if (try) {
dae8b82e 2014 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2015 return 0;
4314d33f 2016 } else
dae8b82e 2017 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2018 }
2019
27407a01
ZJS
2020 } else if (access(p, F_OK) < 0)
2021 return 0;
57fb9fb5 2022
cdb2b9d0
LP
2023 if (dir_is_empty(q) == 0)
2024 log_warning("%s is not empty, proceeding anyway.", q);
2025
03cfe0d5 2026 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2027 if (r < 0)
2028 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2029
60e76d48
ZJS
2030 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2031 if (r < 0)
4a62c710 2032 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2033
27407a01 2034 return 0;
57fb9fb5
LP
2035}
2036
88213476 2037static int drop_capabilities(void) {
520e0d54 2038 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
2039}
2040
db999e0f
LP
2041static int reset_audit_loginuid(void) {
2042 _cleanup_free_ char *p = NULL;
2043 int r;
2044
0c582db0 2045 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2046 return 0;
2047
2048 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2049 if (r == -ENOENT)
db999e0f 2050 return 0;
f647962d
MS
2051 if (r < 0)
2052 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2053
2054 /* Already reset? */
2055 if (streq(p, "4294967295"))
2056 return 0;
2057
ad118bda 2058 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2059 if (r < 0) {
10a87006
LP
2060 log_error_errno(r,
2061 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2062 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2063 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2064 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2065 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2066
db999e0f 2067 sleep(5);
77b6e194 2068 }
db999e0f
LP
2069
2070 return 0;
77b6e194
LP
2071}
2072
785890ac
LP
2073static int setup_propagate(const char *root) {
2074 const char *p, *q;
709f6e46 2075 int r;
785890ac
LP
2076
2077 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2078 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2079 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2080 (void) mkdir_p(p, 0600);
2081
709f6e46
MS
2082 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2083 if (r < 0)
2084 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2085
709f6e46
MS
2086 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2087 if (r < 0)
2088 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2089
709f6e46
MS
2090 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2091 if (r < 0)
2092 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2093
03cfe0d5 2094 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2095 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2096 if (r < 0)
2097 return r;
785890ac 2098
60e76d48
ZJS
2099 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2100 if (r < 0)
2101 return r;
785890ac 2102
19caffac
AC
2103 /* machined will MS_MOVE into that directory, and that's only
2104 * supported for non-shared mounts. */
60e76d48 2105 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2106}
2107
317feb4d 2108static int setup_machine_id(const char *directory) {
691675ba
LP
2109 const char *etc_machine_id;
2110 sd_id128_t id;
3bbaff3e 2111 int r;
e01ff70a 2112
317feb4d
LP
2113 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2114 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2115 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2116 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2117 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2118 * container behaves nicely). */
2119
e01ff70a
MS
2120 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2121
691675ba 2122 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2123 if (r < 0) {
2124 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2125 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2126
317feb4d
LP
2127 if (sd_id128_is_null(arg_uuid)) {
2128 r = sd_id128_randomize(&arg_uuid);
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2131 }
2132 } else {
2133 if (sd_id128_is_null(id)) {
2134 log_error("Machine ID in container image is zero, refusing.");
2135 return -EINVAL;
2136 }
e01ff70a 2137
317feb4d
LP
2138 arg_uuid = id;
2139 }
691675ba 2140
e01ff70a
MS
2141 return 0;
2142}
2143
7336138e
LP
2144static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2145 int r;
2146
2147 assert(directory);
2148
0de7acce 2149 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2150 return 0;
2151
2152 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2153 if (r == -EOPNOTSUPP)
2154 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2155 if (r == -EBADE)
2156 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2157 if (r < 0)
2158 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2159 if (r == 0)
2160 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2161 else
2162 log_debug("Patched directory tree to match UID/GID range.");
2163
2164 return r;
2165}
2166
113cea80 2167/*
6d416b9c
LS
2168 * Return values:
2169 * < 0 : wait_for_terminate() failed to get the state of the
2170 * container, the container was terminated by a signal, or
2171 * failed for an unknown reason. No change is made to the
2172 * container argument.
2173 * > 0 : The program executed in the container terminated with an
2174 * error. The exit code of the program executed in the
919699ec
LP
2175 * container is returned. The container argument has been set
2176 * to CONTAINER_TERMINATED.
6d416b9c
LS
2177 * 0 : The container is being rebooted, has been shut down or exited
2178 * successfully. The container argument has been set to either
2179 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2180 *
6d416b9c
LS
2181 * That is, success is indicated by a return value of zero, and an
2182 * error is indicated by a non-zero value.
113cea80
DH
2183 */
2184static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2185 siginfo_t status;
919699ec 2186 int r;
113cea80
DH
2187
2188 r = wait_for_terminate(pid, &status);
f647962d
MS
2189 if (r < 0)
2190 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2191
2192 switch (status.si_code) {
fddbb89c 2193
113cea80 2194 case CLD_EXITED:
b5a2179b 2195 if (status.si_status == 0)
919699ec 2196 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2197 else
919699ec 2198 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2199
919699ec
LP
2200 *container = CONTAINER_TERMINATED;
2201 return status.si_status;
113cea80
DH
2202
2203 case CLD_KILLED:
2204 if (status.si_status == SIGINT) {
919699ec 2205 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2206 *container = CONTAINER_TERMINATED;
919699ec
LP
2207 return 0;
2208
113cea80 2209 } else if (status.si_status == SIGHUP) {
919699ec 2210 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2211 *container = CONTAINER_REBOOTED;
919699ec 2212 return 0;
113cea80 2213 }
919699ec 2214
4831981d 2215 _fallthrough_;
113cea80 2216 case CLD_DUMPED:
fddbb89c 2217 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2218 return -EIO;
113cea80
DH
2219
2220 default:
fddbb89c 2221 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2222 return -EIO;
113cea80 2223 }
113cea80
DH
2224}
2225
023fb90b
LP
2226static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2227 pid_t pid;
2228
4a0b58c4 2229 pid = PTR_TO_PID(userdata);
023fb90b 2230 if (pid > 0) {
c6c8f6e2 2231 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2232 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2233 sd_event_source_set_userdata(s, NULL);
2234 return 0;
2235 }
2236 }
2237
2238 sd_event_exit(sd_event_source_get_event(s), 0);
2239 return 0;
2240}
2241
6916b164 2242static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2243 pid_t pid;
2244
2245 assert(s);
2246 assert(ssi);
2247
2248 pid = PTR_TO_PID(userdata);
2249
6916b164
AU
2250 for (;;) {
2251 siginfo_t si = {};
abdb9b08 2252
6916b164
AU
2253 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2254 return log_error_errno(errno, "Failed to waitid(): %m");
2255 if (si.si_pid == 0) /* No pending children. */
2256 break;
abdb9b08 2257 if (si.si_pid == pid) {
6916b164
AU
2258 /* The main process we care for has exited. Return from
2259 * signal handler but leave the zombie. */
2260 sd_event_exit(sd_event_source_get_event(s), 0);
2261 break;
2262 }
abdb9b08 2263
6916b164
AU
2264 /* Reap all other children. */
2265 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2266 }
2267
2268 return 0;
2269}
2270
abdb9b08
LP
2271static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2272 pid_t pid;
2273
2274 assert(m);
2275
2276 pid = PTR_TO_PID(userdata);
2277
2278 if (arg_kill_signal > 0) {
2279 log_info("Container termination requested. Attempting to halt container.");
2280 (void) kill(pid, arg_kill_signal);
2281 } else {
2282 log_info("Container termination requested. Exiting.");
2283 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2284 }
2285
2286 return 0;
2287}
2288
ec16945e 2289static int determine_names(void) {
1b9cebf6 2290 int r;
ec16945e 2291
c1521918
LP
2292 if (arg_template && !arg_directory && arg_machine) {
2293
2294 /* If --template= was specified then we should not
2295 * search for a machine, but instead create a new one
2296 * in /var/lib/machine. */
2297
605405c6 2298 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2299 if (!arg_directory)
2300 return log_oom();
2301 }
2302
ec16945e 2303 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2304 if (arg_machine) {
2305 _cleanup_(image_unrefp) Image *i = NULL;
2306
2307 r = image_find(arg_machine, &i);
2308 if (r < 0)
2309 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2310 if (r == 0) {
35bca925 2311 log_error("No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2312 return -ENOENT;
2313 }
2314
eb38edce 2315 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2316 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2317 else
0f03c2a4 2318 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2319 if (r < 0)
0f3be6ca 2320 return log_oom();
1b9cebf6 2321
aee327b8
LP
2322 if (!arg_ephemeral)
2323 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2324 } else {
2325 r = safe_getcwd(&arg_directory);
2326 if (r < 0)
2327 return log_error_errno(r, "Failed to determine current directory: %m");
2328 }
ec16945e 2329
0f3be6ca 2330 if (!arg_directory && !arg_image) {
1b9cebf6 2331 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2332 return -EINVAL;
2333 }
2334 }
2335
2336 if (!arg_machine) {
4827ab48 2337
b9ba4dab
LP
2338 if (arg_directory && path_equal(arg_directory, "/"))
2339 arg_machine = gethostname_malloc();
4827ab48
LP
2340 else {
2341 if (arg_image) {
2342 char *e;
2343
2344 arg_machine = strdup(basename(arg_image));
2345
2346 /* Truncate suffix if there is one */
2347 e = endswith(arg_machine, ".raw");
2348 if (e)
2349 *e = 0;
2350 } else
2351 arg_machine = strdup(basename(arg_directory));
2352 }
ec16945e
LP
2353 if (!arg_machine)
2354 return log_oom();
2355
ae691c1d 2356 hostname_cleanup(arg_machine);
ec16945e
LP
2357 if (!machine_name_is_valid(arg_machine)) {
2358 log_error("Failed to determine machine name automatically, please use -M.");
2359 return -EINVAL;
2360 }
b9ba4dab
LP
2361
2362 if (arg_ephemeral) {
2363 char *b;
2364
2365 /* Add a random suffix when this is an
2366 * ephemeral machine, so that we can run many
2367 * instances at once without manually having
2368 * to specify -M each time. */
2369
2370 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2371 return log_oom();
2372
2373 free(arg_machine);
2374 arg_machine = b;
2375 }
ec16945e
LP
2376 }
2377
2378 return 0;
2379}
2380
/* Resolves the path stored in *p with chase_symlinks() and replaces *p by the result.
 *
 * p:     address of a heap allocated path; *p may be NULL, in which case nothing is done.
 * flags: passed on to chase_symlinks().
 *
 * Returns a negative errno-style value on failure, otherwise whatever chase_symlinks() returned (0 if
 * *p was NULL). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free_and_replace(*p, resolved);

        /* r might be an fd here in case we ever use CHASE_OPEN in flags */
        return r;
}
2397
03cfe0d5 2398static int determine_uid_shift(const char *directory) {
6dac160c
LP
2399 int r;
2400
0de7acce 2401 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2402 arg_uid_shift = 0;
6dac160c 2403 return 0;
03cfe0d5 2404 }
6dac160c
LP
2405
2406 if (arg_uid_shift == UID_INVALID) {
2407 struct stat st;
2408
03cfe0d5 2409 r = stat(directory, &st);
6dac160c 2410 if (r < 0)
03cfe0d5 2411 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2412
2413 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2414
2415 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2416 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2417 return -EINVAL;
2418 }
2419
2420 arg_uid_range = UINT32_C(0x10000);
2421 }
2422
2423 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2424 log_error("UID base too high for UID range.");
2425 return -EINVAL;
2426 }
2427
6dac160c
LP
2428 return 0;
2429}
2430
03cfe0d5
LP
/* Final setup stage before the container payload is executed; ends with an exec.
 *
 * Roughly, in order: synchronize with the parent through 'barrier' (sync points #1 to #5, see the
 * comments below), call mount_all()/mount_sysfs() and the cgroup mount helpers, set up the boot ID and
 * /proc/kmsg, apply per-process settings (OOM score, CPU affinity, capability bounding set, hostname,
 * personality, SELinux exec context, user/group, no_new_privs), assemble the environment block, and
 * finally exec either an init system, the configured command line, or a shell.
 *
 * barrier:     barrier shared with the parent process.
 * directory:   only asserted to be non-NULL here.
 * secondary:   if true and no personality was configured, PER_LINUX32 is selected.
 * kmsg_socket: socket handed to setup_kmsg(); closed afterwards. Must be >= 0.
 * rtnl_socket: socket handed to expose_port_send_rtnl() if ports are exposed; closed afterwards.
 * fds:         file descriptors to pass on to the payload (announced via $LISTEN_FDS).
 *
 * Returns a negative errno-style value on failure. There is no return path for success: the function
 * only returns if a setup step or all exec attempts failed. */
static int inner_child(
                Barrier *barrier,
                const char *directory,
                bool secondary,
                int kmsg_socket,
                int rtnl_socket,
                FDSet *fds) {

        _cleanup_free_ char *home = NULL;
        char as_uuid[37];
        size_t n_env = 1; /* slot 0 (PATH) is pre-filled, the next free slot is 1 */
        const char *envp[] = {
                "PATH=" DEFAULT_PATH_COMPAT,
                NULL, /* container */
                NULL, /* TERM */
                NULL, /* HOME */
                NULL, /* USER */
                NULL, /* LOGNAME */
                NULL, /* container_uuid */
                NULL, /* LISTEN_FDS */
                NULL, /* LISTEN_PID */
                NULL, /* NOTIFY_SOCKET */
                NULL
        };
        const char *exec_target;
        _cleanup_strv_free_ char **env_use = NULL;
        int r;

        assert(barrier);
        assert(directory);
        assert(kmsg_socket >= 0);

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Tell the parent, that it now can write the UID map. */
                (void) barrier_place(barrier); /* #1 */

                /* Wait until the parent wrote the UID map */
                if (!barrier_place_and_sync(barrier)) { /* #2 */
                        log_error("Parent died too early");
                        return -ESRCH;
                }
        }

        r = reset_uid_gid();
        if (r < 0)
                return log_error_errno(r, "Couldn't become new root: %m");

        r = mount_all(NULL,
                      arg_mount_settings | MOUNT_IN_USERNS,
                      arg_uid_shift,
                      arg_uid_range,
                      arg_selinux_apifs_context);
        if (r < 0)
                return r;

        /* Only unshare a network namespace of our own if none was passed in by path */
        if (!arg_network_namespace_path && arg_private_network) {
                r = unshare(CLONE_NEWNET);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare network namespace: %m");

                /* Tell the parent that it can setup network interfaces. */
                (void) barrier_place(barrier); /* #3 */
        }

        r = mount_sysfs(NULL, arg_mount_settings);
        if (r < 0)
                return r;

        /* Wait until we are cgroup-ified, so that we
         * can mount the right cgroup path writable */
        if (!barrier_place_and_sync(barrier)) { /* #4 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        if (arg_use_cgns && cg_ns_supported()) {
                r = unshare(CLONE_NEWCGROUP);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
                r = mount_cgroups(
                                "",
                                arg_unified_cgroup_hierarchy,
                                arg_userns_mode != USER_NAMESPACE_NO,
                                arg_uid_shift,
                                arg_uid_range,
                                arg_selinux_apifs_context,
                                true);
                if (r < 0)
                        return r;
        } else {
                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
                if (r < 0)
                        return r;
        }

        r = setup_boot_id();
        if (r < 0)
                return r;

        r = setup_kmsg(kmsg_socket);
        if (r < 0)
                return r;
        /* The socket is not used again in this function */
        kmsg_socket = safe_close(kmsg_socket);

        umask(0022);

        if (setsid() < 0)
                return log_error_errno(errno, "setsid() failed: %m");

        if (arg_private_network)
                loopback_setup();

        if (arg_expose_ports) {
                r = expose_port_send_rtnl(rtnl_socket);
                if (r < 0)
                        return r;
                rtnl_socket = safe_close(rtnl_socket);
        }

        if (arg_oom_score_adjust_set) {
                r = set_oom_score_adjust(arg_oom_score_adjust);
                if (r < 0)
                        return log_error_errno(r, "Failed to adjust OOM score: %m");
        }

        if (arg_cpuset)
                if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
                        return log_error_errno(errno, "Failed to set CPU affinity: %m");

        r = drop_capabilities();
        if (r < 0)
                return log_error_errno(r, "drop_capabilities() failed: %m");

        /* A failure to set the hostname is logged by setup_hostname() but deliberately not fatal */
        (void) setup_hostname();

        if (arg_personality != PERSONALITY_INVALID) {
                r = safe_personality(arg_personality);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        } else if (secondary) {
                r = safe_personality(PER_LINUX32);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        }

#if HAVE_SELINUX
        if (arg_selinux_context)
                if (setexeccon(arg_selinux_context) < 0)
                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif

        /* May also store a home directory in 'home', which is used for $HOME and the shell fallback below */
        r = change_uid_gid(arg_user, &home);
        if (r < 0)
                return r;

        if (arg_no_new_privileges)
                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                        return log_error_errno(errno, "Failed to disable new privileges: %m");

        /* From here on the environment block is filled in, slot by slot. The order in which n_env is
         * advanced must stay within the number of slots reserved in envp[] above. */

        /* LXC sets container=lxc, so follow the scheme here */
        envp[n_env++] = strjoina("container=", arg_container_service_name);

        /* $TERM is inherited from our own environment, if set; otherwise the slot is reused */
        envp[n_env] = strv_find_prefix(environ, "TERM=");
        if (envp[n_env])
                n_env++;

        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
                return log_oom();

        assert(!sd_id128_is_null(arg_uuid));

        if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
                return log_oom();

        if (fdset_size(fds) > 0) {
                /* The fds are supposed to survive the exec below, hence drop O_CLOEXEC */
                r = fdset_cloexec(fds, false);
                if (r < 0)
                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");

                if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
                    (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
                        return log_oom();
        }
        if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                return log_oom();

        /* Combine our own variables with those configured in arg_setenv */
        env_use = strv_env_merge(2, envp, arg_setenv);
        if (!env_use)
                return log_oom();

        /* Let the parent know that we are ready and
         * wait until the parent is ready with the
         * setup, too... */
        if (!barrier_place_and_sync(barrier)) { /* #5 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        if (arg_chdir)
                if (chdir(arg_chdir) < 0)
                        return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);

        if (arg_start_mode == START_PID2) {
                r = stub_pid1(arg_uuid);
                if (r < 0)
                        return r;
        }

        /* Now, explicitly close the log, so that we
         * then can close all remaining fds. Closing
         * the log explicitly first has the benefit
         * that the logging subsystem knows about it,
         * and is thus ready to be reopened should we
         * need it again. Note that the other fds
         * closed here are at least the locking and
         * barrier fds. */
        log_close();
        (void) fdset_close_others(fds);

        if (arg_start_mode == START_BOOT) {
                char **a;
                size_t m;

                /* Automatically search for the init system */

                /* Build argv: slot 0 is the binary tried below, followed by arg_parameters and a
                 * terminating NULL. */
                m = strv_length(arg_parameters);
                a = newa(char*, m + 2);
                memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
                a[1 + m] = NULL;

                /* Each execve() only returns on failure, in which case the next candidate is tried */
                a[0] = (char*) "/usr/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/sbin/init";
                execve(a[0], a, env_use);

                exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
        } else if (!strv_isempty(arg_parameters)) {
                exec_target = arg_parameters[0];
                execvpe(arg_parameters[0], arg_parameters, env_use);
        } else {
                if (!arg_chdir)
                        /* If we cannot change the directory, we'll end up in /, that is expected. */
                        (void) chdir(home ?: "/root");

                execle("/bin/bash", "-bash", NULL, env_use);
                execle("/bin/sh", "-sh", NULL, env_use);

                exec_target = "/bin/bash, /bin/sh";
        }

        /* Only reached if every exec attempt failed. Save errno before reopening the log (which was
         * closed above), as that may overwrite it. */
        r = -errno;
        (void) log_open();
        return log_error_errno(r, "execv(%s) failed: %m", exec_target);
}
2691
9c1e04d0
AP
2692static int setup_sd_notify_child(void) {
2693 static const int one = 1;
2694 int fd = -1;
2695 union sockaddr_union sa = {
2696 .sa.sa_family = AF_UNIX,
2697 };
2698 int r;
2699
2700 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2701 if (fd < 0)
2702 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2703
2704 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2705 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2706
2707 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2708 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2709 if (r < 0) {
2710 safe_close(fd);
2711 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2712 }
2713
adc7d9f0
EV
2714 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2715 if (r < 0) {
2716 safe_close(fd);
2717 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2718 }
2719
9c1e04d0
AP
2720 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2721 if (r < 0) {
2722 safe_close(fd);
2723 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2724 }
2725
2726 return fd;
2727}
2728
03cfe0d5
LP
2729static int outer_child(
2730 Barrier *barrier,
2731 const char *directory,
2732 const char *console,
2d845785 2733 DissectedImage *dissected_image,
03cfe0d5
LP
2734 bool interactive,
2735 bool secondary,
2736 int pid_socket,
e01ff70a 2737 int uuid_socket,
9c1e04d0 2738 int notify_socket,
03cfe0d5
LP
2739 int kmsg_socket,
2740 int rtnl_socket,
825d5287 2741 int uid_shift_socket,
8199d554 2742 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
2743 FDSet *fds,
2744 int netns_fd) {
03cfe0d5 2745
bf428efb
LP
2746 _cleanup_close_ int fd = -1;
2747 int r, which_failed;
03cfe0d5
LP
2748 pid_t pid;
2749 ssize_t l;
03cfe0d5
LP
2750
2751 assert(barrier);
2752 assert(directory);
2753 assert(console);
2754 assert(pid_socket >= 0);
e01ff70a 2755 assert(uuid_socket >= 0);
9c1e04d0 2756 assert(notify_socket >= 0);
03cfe0d5
LP
2757 assert(kmsg_socket >= 0);
2758
2759 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2760 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2761
2762 if (interactive) {
2b33ab09 2763 int terminal;
03cfe0d5 2764
2b33ab09
LP
2765 terminal = open_terminal(console, O_RDWR);
2766 if (terminal < 0)
2767 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 2768
2b33ab09
LP
2769 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
2772 }
2773
2774 r = reset_audit_loginuid();
2775 if (r < 0)
2776 return r;
2777
2778 /* Mark everything as slave, so that we still
2779 * receive mounts from the real root, but don't
2780 * propagate mounts to the real root. */
60e76d48
ZJS
2781 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2782 if (r < 0)
2783 return r;
03cfe0d5 2784
2d845785 2785 if (dissected_image) {
2d3a5a73
LP
2786 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2787 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2788 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2789 * makes sure ESP partitions and userns are compatible. */
2790
2791 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2792 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2793 if (r < 0)
2794 return r;
2795 }
03cfe0d5 2796
391567f4
LP
2797 r = determine_uid_shift(directory);
2798 if (r < 0)
2799 return r;
2800
0de7acce 2801 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2802 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2803 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2804 if (l < 0)
2805 return log_error_errno(errno, "Failed to send UID shift: %m");
2806 if (l != sizeof(arg_uid_shift)) {
2807 log_error("Short write while sending UID shift.");
2808 return -EIO;
2809 }
0e7ac751 2810
0de7acce 2811 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2812 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2813 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2814 * not it will pick a different one, and send it back to us. */
2815
2816 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2817 if (l < 0)
2818 return log_error_errno(errno, "Failed to recv UID shift: %m");
2819 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2820 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2821 return -EIO;
2822 }
2823 }
2824
2825 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2826 }
2827
2d3a5a73
LP
2828 if (dissected_image) {
2829 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2830 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2831 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2832 if (r < 0)
2833 return r;
2834 }
2835
8199d554
LP
2836 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2837 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2838
2839 r = detect_unified_cgroup_hierarchy_from_image(directory);
2840 if (r < 0)
2841 return r;
2842
2843 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2844 if (l < 0)
2845 return log_error_errno(errno, "Failed to send cgroup mode: %m");
2846 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
2847 log_error("Short write while sending cgroup mode: %m");
2848 return -EIO;
2849 }
2850
2851 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2852 }
2853
03cfe0d5 2854 /* Turn directory into bind mount */
60e76d48
ZJS
2855 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2856 if (r < 0)
2857 return r;
03cfe0d5 2858
b53ede69
PW
2859 r = setup_pivot_root(
2860 directory,
2861 arg_pivot_root_new,
2862 arg_pivot_root_old);
2863 if (r < 0)
2864 return r;
2865
0de7acce
LP
2866 r = setup_volatile(
2867 directory,
2868 arg_volatile_mode,
2869 arg_userns_mode != USER_NAMESPACE_NO,
2870 arg_uid_shift,
2871 arg_uid_range,
2872 arg_selinux_context);
03cfe0d5
LP
2873 if (r < 0)
2874 return r;
2875
0de7acce
LP
2876 r = setup_volatile_state(
2877 directory,
2878 arg_volatile_mode,
2879 arg_userns_mode != USER_NAMESPACE_NO,
2880 arg_uid_shift,
2881 arg_uid_range,
2882 arg_selinux_context);
03cfe0d5
LP
2883 if (r < 0)
2884 return r;
2885
4ad14eff
LP
2886 /* Mark everything as shared so our mounts get propagated down. This is
2887 * required to make new bind mounts available in systemd services
2888 * inside the containter that create a new mount namespace.
2889 * See https://github.com/systemd/systemd/issues/3860
2890 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2891 * shared propagation mode. */
4ad14eff
LP
2892 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2893 if (r < 0)
2894 return r;
2895
2896 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2897 if (r < 0)
2898 return r;
2899
03cfe0d5
LP
2900 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2901 if (r < 0)
2902 return r;
2903
03cfe0d5 2904 if (arg_read_only) {
6b7c9f8b 2905 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2906 if (r < 0)
2907 return log_error_errno(r, "Failed to make tree read-only: %m");
2908 }
2909
0de7acce 2910 r = mount_all(directory,
4f086aab 2911 arg_mount_settings,
0de7acce
LP
2912 arg_uid_shift,
2913 arg_uid_range,
2914 arg_selinux_apifs_context);
03cfe0d5
LP
2915 if (r < 0)
2916 return r;
2917
07fa00f9
LP
2918 r = copy_devnodes(directory);
2919 if (r < 0)
03cfe0d5
LP
2920 return r;
2921
2922 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2923
07fa00f9
LP
2924 r = setup_pts(directory);
2925 if (r < 0)
03cfe0d5
LP
2926 return r;
2927
2928 r = setup_propagate(directory);
2929 if (r < 0)
2930 return r;
2931
2932 r = setup_dev_console(directory, console);
2933 if (r < 0)
2934 return r;
2935
8e5430c4
LP
2936 r = setup_keyring();
2937 if (r < 0)
2938 return r;
2939
960e4569 2940 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
2941 if (r < 0)
2942 return r;
2943
2944 r = setup_timezone(directory);
2945 if (r < 0)
2946 return r;
2947
2948 r = setup_resolv_conf(directory);
2949 if (r < 0)
2950 return r;
2951
e01ff70a
MS
2952 r = setup_machine_id(directory);
2953 if (r < 0)
2954 return r;
2955
03cfe0d5
LP
2956 r = setup_journal(directory);
2957 if (r < 0)
2958 return r;
2959
0de7acce
LP
2960 r = mount_custom(
2961 directory,
2962 arg_custom_mounts,
2963 arg_n_custom_mounts,
2964 arg_userns_mode != USER_NAMESPACE_NO,
2965 arg_uid_shift,
2966 arg_uid_range,
2967 arg_selinux_apifs_context);
03cfe0d5
LP
2968 if (r < 0)
2969 return r;
2970
5a8ff0e6 2971 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2972 r = mount_cgroups(
2973 directory,
2974 arg_unified_cgroup_hierarchy,
2975 arg_userns_mode != USER_NAMESPACE_NO,
2976 arg_uid_shift,
2977 arg_uid_range,
5a8ff0e6 2978 arg_selinux_apifs_context,
ada54120 2979 false);
0996ef00
CB
2980 if (r < 0)
2981 return r;
2982 }
03cfe0d5
LP
2983
2984 r = mount_move_root(directory);
2985 if (r < 0)
2986 return log_error_errno(r, "Failed to move root directory: %m");
2987
9c1e04d0
AP
2988 fd = setup_sd_notify_child();
2989 if (fd < 0)
2990 return fd;
2991
bf428efb
LP
2992 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2993 if (r < 0)
2994 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2995
03cfe0d5 2996 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2997 arg_clone_ns_flags |
8869a0b4 2998 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2999 if (pid < 0)
3000 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3001 if (pid == 0) {
3002 pid_socket = safe_close(pid_socket);
e01ff70a 3003 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3004 notify_socket = safe_close(notify_socket);
825d5287 3005 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3006
3007 /* The inner child has all namespaces that are
3008 * requested, so that we all are owned by the user if
3009 * user namespaces are turned on. */
3010
d7bea6b6
DP
3011 if (arg_network_namespace_path) {
3012 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3013 if (r < 0)
3014 return r;
3015 }
3016
f757855e 3017 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3018 if (r < 0)
3019 _exit(EXIT_FAILURE);
3020
3021 _exit(EXIT_SUCCESS);
3022 }
3023
3024 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3025 if (l < 0)
3026 return log_error_errno(errno, "Failed to send PID: %m");
3027 if (l != sizeof(pid)) {
3028 log_error("Short write while sending PID.");
3029 return -EIO;
3030 }
3031
e01ff70a
MS
3032 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3033 if (l < 0)
3034 return log_error_errno(errno, "Failed to send machine ID: %m");
3035 if (l != sizeof(arg_uuid)) {
3036 log_error("Short write while sending machine ID.");
3037 return -EIO;
3038 }
3039
9c1e04d0
AP
3040 l = send_one_fd(notify_socket, fd, 0);
3041 if (l < 0)
3042 return log_error_errno(errno, "Failed to send notify fd: %m");
3043
03cfe0d5 3044 pid_socket = safe_close(pid_socket);
e01ff70a 3045 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3046 notify_socket = safe_close(notify_socket);
327e26d6
KN
3047 kmsg_socket = safe_close(kmsg_socket);
3048 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3049 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3050
3051 return 0;
3052}
3053
0e7ac751 3054static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3055 bool tried_hashed = false;
0e7ac751
LP
3056 unsigned n_tries = 100;
3057 uid_t candidate;
3058 int r;
3059
3060 assert(shift);
3061 assert(ret_lock_file);
0de7acce 3062 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3063 assert(arg_uid_range == 0x10000U);
3064
3065 candidate = *shift;
3066
3067 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3068
3069 for (;;) {
fbd0b64f 3070 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3071 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3072
3073 if (--n_tries <= 0)
3074 return -EBUSY;
3075
87d5e4f2 3076 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3077 goto next;
3078 if ((candidate & UINT32_C(0xFFFF)) != 0)
3079 goto next;
3080
3081 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3082 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3083 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3084 goto next;
3085 if (r < 0)
3086 return r;
3087
3088 /* Make some superficial checks whether the range is currently known in the user database */
3089 if (getpwuid(candidate))
3090 goto next;
3091 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3092 goto next;
3093 if (getgrgid(candidate))
3094 goto next;
3095 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3096 goto next;
3097
3098 *ret_lock_file = lf;
3099 lf = (struct LockFile) LOCK_FILE_INIT;
3100 *shift = candidate;
3101 return 0;
3102
3103 next:
d381c8a6
LP
3104 if (arg_machine && !tried_hashed) {
3105 /* Try to hash the base from the container name */
3106
3107 static const uint8_t hash_key[] = {
3108 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3109 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3110 };
3111
3112 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3113
3114 tried_hashed = true;
3115 } else
3116 random_bytes(&candidate, sizeof(candidate));
3117
87d5e4f2 3118 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3119 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3120 }
3121}
3122
03cfe0d5 3123static int setup_uid_map(pid_t pid) {
fbd0b64f 3124 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3125 int r;
3126
3127 assert(pid > 1);
3128
3129 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3130 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3131 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3132 if (r < 0)
3133 return log_error_errno(r, "Failed to write UID map: %m");
3134
3135 /* We always assign the same UID and GID ranges */
3136 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3137 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3138 if (r < 0)
3139 return log_error_errno(r, "Failed to write GID map: %m");
3140
3141 return 0;
3142}
3143
9c1e04d0 3144static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3145 char buf[NOTIFY_BUFFER_MAX+1];
3146 char *p = NULL;
3147 struct iovec iovec = {
3148 .iov_base = buf,
3149 .iov_len = sizeof(buf)-1,
3150 };
3151 union {
3152 struct cmsghdr cmsghdr;
3153 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3154 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3155 } control = {};
3156 struct msghdr msghdr = {
3157 .msg_iov = &iovec,
3158 .msg_iovlen = 1,
3159 .msg_control = &control,
3160 .msg_controllen = sizeof(control),
3161 };
3162 struct cmsghdr *cmsg;
3163 struct ucred *ucred = NULL;
3164 ssize_t n;
3165 pid_t inner_child_pid;
3166 _cleanup_strv_free_ char **tags = NULL;
3167
3168 assert(userdata);
3169
3170 inner_child_pid = PTR_TO_PID(userdata);
3171
3172 if (revents != EPOLLIN) {
3173 log_warning("Got unexpected poll event for notify fd.");
3174 return 0;
3175 }
3176
3177 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3178 if (n < 0) {
3742095b 3179 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3180 return 0;
3181
3182 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3183 }
3184 cmsg_close_all(&msghdr);
3185
3186 CMSG_FOREACH(cmsg, &msghdr) {
3187 if (cmsg->cmsg_level == SOL_SOCKET &&
3188 cmsg->cmsg_type == SCM_CREDENTIALS &&
3189 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3190
3191 ucred = (struct ucred*) CMSG_DATA(cmsg);
3192 }
3193 }
3194
3195 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3196 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3197 return 0;
3198 }
3199
3200 if ((size_t) n >= sizeof(buf)) {
3201 log_warning("Received notify message exceeded maximum size. Ignoring.");
3202 return 0;
3203 }
3204
3205 buf[n] = 0;
3206 tags = strv_split(buf, "\n\r");
3207 if (!tags)
3208 return log_oom();
3209
3210 if (strv_find(tags, "READY=1"))
3211 sd_notifyf(false, "READY=1\n");
3212
3213 p = strv_find_startswith(tags, "STATUS=");
3214 if (p)
3215 sd_notifyf(false, "STATUS=Container running: %s", p);
3216
3217 return 0;
3218}
3219
5773024d 3220static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3221 int r;
9c1e04d0 3222
5773024d 3223 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3224 if (r < 0)
3225 return log_error_errno(r, "Failed to allocate notify event source: %m");
3226
5773024d 3227 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3228
3229 return 0;
3230}
3231
5d961407
LP
3232static int merge_settings(Settings *settings, const char *path) {
3233 int rl;
f757855e 3234
5d961407
LP
3235 assert(settings);
3236 assert(path);
f757855e 3237
5d961407
LP
3238 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3239 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3240
7732f92b
LP
3241 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3242 settings->start_mode >= 0) {
3243 arg_start_mode = settings->start_mode;
130d3d22 3244 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3245 }
3246
b53ede69
PW
3247 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3248 settings->pivot_root_new) {
3249 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3250 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3251 }
3252
5f932eb9 3253 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3254 settings->working_directory)
3255 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3256
f757855e 3257 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3258 settings->environment)
3259 strv_free_and_replace(arg_setenv, settings->environment);
f757855e
LP
3260
3261 if ((arg_settings_mask & SETTING_USER) == 0 &&
1cc6c93a
YW
3262 settings->user)
3263 free_and_replace(arg_user, settings->user);
f757855e
LP
3264
3265 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3266 uint64_t plus;
f757855e 3267
0e265674
LP
3268 plus = settings->capability;
3269 if (settings_private_network(settings))
3270 plus |= (1ULL << CAP_NET_ADMIN);
3271
3272 if (!arg_settings_trusted && plus != 0) {
3273 if (settings->capability != 0)
5d961407 3274 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3275 } else
520e0d54 3276 arg_caps_retain |= plus;
f757855e 3277
520e0d54 3278 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3279 }
3280
3281 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3282 settings->kill_signal > 0)
3283 arg_kill_signal = settings->kill_signal;
3284
3285 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3286 settings->personality != PERSONALITY_INVALID)
3287 arg_personality = settings->personality;
3288
3289 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3290 !sd_id128_is_null(settings->machine_id)) {
3291
3292 if (!arg_settings_trusted)
5d961407 3293 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3294 else
3295 arg_uuid = settings->machine_id;
3296 }
3297
3298 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3299 settings->read_only >= 0)
3300 arg_read_only = settings->read_only;
3301
3302 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3303 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3304 arg_volatile_mode = settings->volatile_mode;
3305
3306 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3307 settings->n_custom_mounts > 0) {
3308
3309 if (!arg_settings_trusted)
5d961407 3310 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3311 else {
3312 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3313 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3314 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3315 settings->n_custom_mounts = 0;
3316 }
3317 }
3318
3319 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3320 (settings->private_network >= 0 ||
3321 settings->network_veth >= 0 ||
3322 settings->network_bridge ||
22b28dfd 3323 settings->network_zone ||
f757855e
LP
3324 settings->network_interfaces ||
3325 settings->network_macvlan ||
f6d6bad1
LP
3326 settings->network_ipvlan ||
3327 settings->network_veth_extra)) {
f757855e
LP
3328
3329 if (!arg_settings_trusted)
5d961407 3330 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3331 else {
f6d6bad1 3332 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3333 arg_private_network = settings_private_network(settings);
3334
130d3d22
YW
3335 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3336 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3337 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3338 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3339
1cc6c93a
YW
3340 free_and_replace(arg_network_bridge, settings->network_bridge);
3341 free_and_replace(arg_network_zone, settings->network_zone);
f757855e
LP
3342 }
3343 }
3344
3345 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3346 settings->expose_ports) {
3347
3348 if (!arg_settings_trusted)
5d961407 3349 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3350 else {
3351 expose_port_free_all(arg_expose_ports);
1cc6c93a 3352 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3353 }
3354 }
3355
0de7acce
LP
3356 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3357 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3358
3359 if (!arg_settings_trusted)
5d961407 3360 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3361 else {
3362 arg_userns_mode = settings->userns_mode;
3363 arg_uid_shift = settings->uid_shift;
3364 arg_uid_range = settings->uid_range;
3365 arg_userns_chown = settings->userns_chown;
3366 }
3367 }
3368
9c1e04d0
AP
3369 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3370 arg_notify_ready = settings->notify_ready;
3371
960e4569
LP
3372 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3373
3374 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
5d961407 3375 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3376 else {
130d3d22
YW
3377 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3378 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569
LP
3379 }
3380 }
3381
bf428efb
LP
3382 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3383 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3384 continue;
3385
3386 if (!settings->rlimit[rl])
3387 continue;
3388
3389 if (!arg_settings_trusted) {
5d961407 3390 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3391 continue;
3392 }
3393
3394 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3395 }
3396
3a9530e5
LP
3397 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3398 settings->hostname)
3399 free_and_replace(arg_hostname, settings->hostname);
3400
66edd963
LP
3401 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3402 settings->no_new_privileges >= 0)
3403 arg_no_new_privileges = settings->no_new_privileges;
3404
81f345df
LP
3405 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3406 settings->oom_score_adjust_set) {
3407
3408 if (!arg_settings_trusted)
5d961407 3409 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3410 else {
3411 arg_oom_score_adjust = settings->oom_score_adjust;
3412 arg_oom_score_adjust_set = true;
3413 }
3414 }
3415
d107bb7d
LP
3416 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3417 settings->cpuset) {
3418
3419 if (!arg_settings_trusted)
5d961407 3420 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3421 else {
3422 if (arg_cpuset)
3423 CPU_FREE(arg_cpuset);
3424 arg_cpuset = TAKE_PTR(settings->cpuset);
3425 arg_cpuset_ncpus = settings->cpuset_ncpus;
3426 }
3427 }
3428
09d423e9
LP
3429 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3430 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3431 arg_resolv_conf = settings->resolv_conf;
3432
4e1d6aa9
LP
3433 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3434 settings->link_journal != _LINK_JOURNAL_INVALID) {
3435
3436 if (!arg_settings_trusted)
3437 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3438 else {
3439 arg_link_journal = settings->link_journal;
3440 arg_link_journal_try = settings->link_journal_try;
3441 }
3442 }
3443
f757855e
LP
3444 return 0;
3445}
3446
5d961407
LP
3447static int load_settings(void) {
3448 _cleanup_(settings_freep) Settings *settings = NULL;
3449 _cleanup_fclose_ FILE *f = NULL;
3450 _cleanup_free_ char *p = NULL;
3451 const char *fn, *i;
3452 int r;
3453
3454 /* If all settings are masked, there's no point in looking for
3455 * the settings file */
3456 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3457 return 0;
3458
3459 fn = strjoina(arg_machine, ".nspawn");
3460
3461 /* We first look in the admin's directories in /etc and /run */
3462 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3463 _cleanup_free_ char *j = NULL;
3464
3465 j = strjoin(i, "/", fn);
3466 if (!j)
3467 return log_oom();
3468
3469 f = fopen(j, "re");
3470 if (f) {
3471 p = TAKE_PTR(j);
3472
3473 /* By default, we trust configuration from /etc and /run */
3474 if (arg_settings_trusted < 0)
3475 arg_settings_trusted = true;
3476
3477 break;
3478 }
3479
3480 if (errno != ENOENT)
3481 return log_error_errno(errno, "Failed to open %s: %m", j);
3482 }
3483
3484 if (!f) {
3485 /* After that, let's look for a file next to the
3486 * actual image we shall boot. */
3487
3488 if (arg_image) {
3489 p = file_in_same_dir(arg_image, fn);
3490 if (!p)
3491 return log_oom();
3492 } else if (arg_directory) {
3493 p = file_in_same_dir(arg_directory, fn);
3494 if (!p)
3495 return log_oom();
3496 }
3497
3498 if (p) {
3499 f = fopen(p, "re");
3500 if (!f && errno != ENOENT)
3501 return log_error_errno(errno, "Failed to open %s: %m", p);
3502
3503 /* By default, we do not trust configuration from /var/lib/machines */
3504 if (arg_settings_trusted < 0)
3505 arg_settings_trusted = false;
3506 }
3507 }
3508
3509 if (!f)
3510 return 0;
3511
3512 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3513
3514 r = settings_load(f, p, &settings);
3515 if (r < 0)
3516 return r;
3517
3518 return merge_settings(settings, p);
3519}
3520
b0067625
ZJS
3521static int run(int master,
3522 const char* console,
2d845785 3523 DissectedImage *dissected_image,
b0067625
ZJS
3524 bool interactive,
3525 bool secondary,
3526 FDSet *fds,
3527 char veth_name[IFNAMSIZ], bool *veth_created,
3528 union in_addr_union *exposed,
3529 pid_t *pid, int *ret) {
3530
3531 static const struct sigaction sa = {
3532 .sa_handler = nop_signal_handler,
e28c7cd0 3533 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3534 };
3535
8e766630 3536 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
3537 _cleanup_close_ int etc_passwd_lock = -1;
3538 _cleanup_close_pair_ int
3539 kmsg_socket_pair[2] = { -1, -1 },
3540 rtnl_socket_pair[2] = { -1, -1 },
3541 pid_socket_pair[2] = { -1, -1 },
3542 uuid_socket_pair[2] = { -1, -1 },
3543 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
3544 uid_shift_socket_pair[2] = { -1, -1 },
3545 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3546
b0067625
ZJS
3547 _cleanup_close_ int notify_socket= -1;
3548 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3549 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3550 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3551 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3552 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 3553 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625
ZJS
3554 ContainerStatus container_status = 0;
3555 char last_char = 0;
3556 int ifi = 0, r;
3557 ssize_t l;
3558 sigset_t mask_chld;
d7bea6b6 3559 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
3560
3561 assert_se(sigemptyset(&mask_chld) == 0);
3562 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3563
3564 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3565 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3566 * check with getpwuid() if the specific user already exists. Note that /etc might be
3567 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3568 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3569 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3570 * really ours. */
3571
3572 etc_passwd_lock = take_etc_passwd_lock(NULL);
3573 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3574 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3575 }
3576
3577 r = barrier_create(&barrier);
3578 if (r < 0)
3579 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3580
3581 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3582 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3583
3584 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3585 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3586
3587 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3588 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3589
3590 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3591 return log_error_errno(errno, "Failed to create id socket pair: %m");
3592
3593 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3594 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3595
3596 if (arg_userns_mode != USER_NAMESPACE_NO)
3597 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3598 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3599
8199d554
LP
3600 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3601 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3602 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3603
b0067625
ZJS
3604 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3605 * parent's blocking calls and give it a chance to call wait() and terminate. */
3606 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3607 if (r < 0)
3608 return log_error_errno(errno, "Failed to change the signal mask: %m");
3609
3610 r = sigaction(SIGCHLD, &sa, NULL);
3611 if (r < 0)
3612 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3613
d7bea6b6
DP
3614 if (arg_network_namespace_path) {
3615 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3616 if (netns_fd < 0)
3617 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3618
3619 r = fd_is_network_ns(netns_fd);
3620 if (r < 0 && r != -ENOTTY)
3621 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
3622 if (r == 0) {
3623 log_error("Path %s doesn't refer to a network namespace", arg_network_namespace_path);
3624 return -EINVAL;
3625 }
3626 }
3627
b0067625
ZJS
3628 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3629 if (*pid < 0)
3630 return log_error_errno(errno, "clone() failed%s: %m",
3631 errno == EINVAL ?
3632 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3633
3634 if (*pid == 0) {
3635 /* The outer child only has a file system namespace. */
3636 barrier_set_role(&barrier, BARRIER_CHILD);
3637
3638 master = safe_close(master);
3639
3640 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3641 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3642 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3643 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3644 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3645 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 3646 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
3647
3648 (void) reset_all_signal_handlers();
3649 (void) reset_signal_mask();
3650
3651 r = outer_child(&barrier,
3652 arg_directory,
3653 console,
2d845785 3654 dissected_image,
b0067625
ZJS
3655 interactive,
3656 secondary,
3657 pid_socket_pair[1],
3658 uuid_socket_pair[1],
3659 notify_socket_pair[1],
3660 kmsg_socket_pair[1],
3661 rtnl_socket_pair[1],
3662 uid_shift_socket_pair[1],
8199d554 3663 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
3664 fds,
3665 netns_fd);
b0067625
ZJS
3666 if (r < 0)
3667 _exit(EXIT_FAILURE);
3668
3669 _exit(EXIT_SUCCESS);
3670 }
3671
3672 barrier_set_role(&barrier, BARRIER_PARENT);
3673
3674 fds = fdset_free(fds);
3675
3676 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3677 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3678 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3679 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3680 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3681 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 3682 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
3683
3684 if (arg_userns_mode != USER_NAMESPACE_NO) {
3685 /* The child just let us know the UID shift it might have read from the image. */
3686 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3687 if (l < 0)
3688 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3689 if (l != sizeof arg_uid_shift) {
3690 log_error("Short read while reading UID shift.");
3691 return -EIO;
3692 }
3693
3694 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3695 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3696 * image, but if that's already in use, pick a new one, and report back to the child,
3697 * which one we now picked. */
3698
3699 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3700 if (r < 0)
3701 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3702
3703 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3704 if (l < 0)
3705 return log_error_errno(errno, "Failed to send UID shift: %m");
3706 if (l != sizeof arg_uid_shift) {
3707 log_error("Short write while writing UID shift.");
3708 return -EIO;
3709 }
3710 }
3711 }
3712
8199d554
LP
3713 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3714 /* The child let us know the support cgroup mode it might have read from the image. */
3715 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3716 if (l < 0)
3717 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3718 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
3719 log_error("Short read while reading cgroup mode.");
3720 return -EIO;
3721 }
3722 }
3723
b0067625 3724 /* Wait for the outer child. */
d2e0ac3d
LP
3725 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3726 if (r < 0)
3727 return r;
3728 if (r != EXIT_SUCCESS)
3729 return -EIO;
b0067625
ZJS
3730
3731 /* And now retrieve the PID of the inner child. */
3732 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3733 if (l < 0)
3734 return log_error_errno(errno, "Failed to read inner child PID: %m");
3735 if (l != sizeof *pid) {
3736 log_error("Short read while reading inner child PID.");
3737 return -EIO;
3738 }
3739
3740 /* We also retrieve container UUID in case it was generated by outer child */
3741 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3742 if (l < 0)
3743 return log_error_errno(errno, "Failed to read container machine ID: %m");
3744 if (l != sizeof(arg_uuid)) {
3745 log_error("Short read while reading container machined ID.");
3746 return -EIO;
3747 }
3748
3749 /* We also retrieve the socket used for notifications generated by outer child */
3750 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3751 if (notify_socket < 0)
3752 return log_error_errno(notify_socket,
3753 "Failed to receive notification socket from the outer child: %m");
3754
3755 log_debug("Init process invoked as PID "PID_FMT, *pid);
3756
3757 if (arg_userns_mode != USER_NAMESPACE_NO) {
3758 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3759 log_error("Child died too early.");
3760 return -ESRCH;
3761 }
3762
3763 r = setup_uid_map(*pid);
3764 if (r < 0)
3765 return r;
3766
3767 (void) barrier_place(&barrier); /* #2 */
3768 }
3769
3770 if (arg_private_network) {
3771
75116558
PS
3772 if (!arg_network_namespace_path) {
3773 /* Wait until the child has unshared its network namespace. */
3774 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3775 log_error("Child died too early");
3776 return -ESRCH;
3777 }
3778 }
3779
b0067625
ZJS
3780 r = move_network_interfaces(*pid, arg_network_interfaces);
3781 if (r < 0)
3782 return r;
3783
3784 if (arg_network_veth) {
3785 r = setup_veth(arg_machine, *pid, veth_name,
3786 arg_network_bridge || arg_network_zone);
3787 if (r < 0)
3788 return r;
3789 else if (r > 0)
3790 ifi = r;
3791
3792 if (arg_network_bridge) {
3793 /* Add the interface to a bridge */
3794 r = setup_bridge(veth_name, arg_network_bridge, false);
3795 if (r < 0)
3796 return r;
3797 if (r > 0)
3798 ifi = r;
3799 } else if (arg_network_zone) {
3800 /* Add the interface to a bridge, possibly creating it */
3801 r = setup_bridge(veth_name, arg_network_zone, true);
3802 if (r < 0)
3803 return r;
3804 if (r > 0)
3805 ifi = r;
3806 }
3807 }
3808
3809 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3810 if (r < 0)
3811 return r;
3812
3813 /* We created the primary and extra veth links now; let's remember this, so that we know to
3814 remove them later on. Note that we don't bother with removing veth links that were created
3815 here when their setup failed half-way, because in that case the kernel should be able to
3816 remove them on its own, since they cannot be referenced by anything yet. */
3817 *veth_created = true;
3818
3819 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3820 if (r < 0)
3821 return r;
3822
3823 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3824 if (r < 0)
3825 return r;
3826 }
3827
abdb9b08
LP
3828 if (arg_register || !arg_keep_unit) {
3829 r = sd_bus_default_system(&bus);
3830 if (r < 0)
3831 return log_error_errno(r, "Failed to open system bus: %m");
3832 }
3833
3834 if (!arg_keep_unit) {
3835 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3836 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3837 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3838
75152a4d
LP
3839 r = sd_bus_match_signal_async(
3840 bus,
3841 NULL,
3842 "org.freedesktop.systemd1",
3843 NULL,
3844 "org.freedesktop.systemd1.Scope",
3845 "RequestStop",
3846 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 3847 if (r < 0)
75152a4d 3848 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
3849 }
3850
b0067625 3851 if (arg_register) {
abdb9b08 3852
b0067625 3853 r = register_machine(
abdb9b08 3854 bus,
b0067625
ZJS
3855 arg_machine,
3856 *pid,
3857 arg_directory,
3858 arg_uuid,
3859 ifi,
3860 arg_slice,
3861 arg_custom_mounts, arg_n_custom_mounts,
3862 arg_kill_signal,
3863 arg_property,
3864 arg_keep_unit,
3865 arg_container_service_name);
3866 if (r < 0)
3867 return r;
abdb9b08 3868
cd2dfc6f 3869 } else if (!arg_keep_unit) {
abdb9b08 3870
cd2dfc6f 3871 r = allocate_scope(
abdb9b08 3872 bus,
cd2dfc6f
LP
3873 arg_machine,
3874 *pid,
3875 arg_slice,
3876 arg_custom_mounts, arg_n_custom_mounts,
3877 arg_kill_signal,
3878 arg_property);
3879 if (r < 0)
3880 return r;
3881
3882 } else if (arg_slice || arg_property)
3883 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3884
f0bef277 3885 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3886 if (r < 0)
3887 return r;
3888
720f0a2f
LP
3889 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
3890 if (r < 0)
3891 return r;
b0067625 3892
de54e02d 3893 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3894 if (r < 0)
3895 return r;
3896
3897 /* Notify the child that the parent is ready with all
3898 * its setup (including cgroup-ification), and that
3899 * the child can now hand over control to the code to
3900 * run inside the container. */
75116558 3901 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
3902
3903 /* Block SIGCHLD here, before notifying child.
3904 * process_pty() will handle it with the other signals. */
3905 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3906
3907 /* Reset signal to default */
3908 r = default_signals(SIGCHLD, -1);
3909 if (r < 0)
3910 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3911
3912 r = sd_event_new(&event);
3913 if (r < 0)
3914 return log_error_errno(r, "Failed to get default event source: %m");
3915
8fd010bb
LP
3916 (void) sd_event_set_watchdog(event, true);
3917
abdb9b08
LP
3918 if (bus) {
3919 r = sd_bus_attach_event(bus, event, 0);
3920 if (r < 0)
3921 return log_error_errno(r, "Failed to attach bus to event loop: %m");
3922 }
3923
5773024d 3924 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3925 if (r < 0)
3926 return r;
3927
3928 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 3929 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
3930 log_error("Child died too early.");
3931 return -ESRCH;
3932 }
3933
3934 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3935 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3936 etc_passwd_lock = safe_close(etc_passwd_lock);
3937
3938 sd_notifyf(false,
3939 "STATUS=Container running.\n"
3940 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3941 if (!arg_notify_ready)
919f5ae0 3942 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
3943
3944 if (arg_kill_signal > 0) {
3945 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
3946 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3947 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
3948 } else {
3949 /* Immediately exit */
919f5ae0
LP
3950 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3951 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
3952 }
3953
6916b164 3954 /* Exit when the child exits */
919f5ae0 3955 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3956
3957 if (arg_expose_ports) {
3958 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3959 if (r < 0)
3960 return r;
3961
3962 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3963 }
3964
3965 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3966
3967 r = pty_forward_new(event, master,
3968 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3969 &forward);
3970 if (r < 0)
3971 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3972
3973 r = sd_event_loop(event);
3974 if (r < 0)
3975 return log_error_errno(r, "Failed to run event loop: %m");
3976
3977 pty_forward_get_last_char(forward, &last_char);
3978
3979 forward = pty_forward_free(forward);
3980
3981 if (!arg_quiet && last_char != '\n')
3982 putc('\n', stdout);
3983
3984 /* Kill if it is not dead yet anyway */
abdb9b08
LP
3985 if (arg_register && !arg_keep_unit && bus)
3986 terminate_machine(bus, *pid);
b0067625
ZJS
3987
3988 /* Normally redundant, but better safe than sorry */
c67b0082 3989 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3990
3991 r = wait_for_container(*pid, &container_status);
3992 *pid = 0;
3993
3994 if (r < 0)
3995 /* We failed to wait for the container, or the container exited abnormally. */
3996 return r;
3997 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3998 /* r > 0 → The container exited with a non-zero status.
3999 * As a special case, we need to replace 133 with a different value,
4000 * because 133 is special-cased in the service file to reboot the container.
4001 * otherwise → The container exited with zero status and a reboot was not requested.
4002 */
2a49b612 4003 if (r == EXIT_FORCE_RESTART)
27e29a1e 4004 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4005 *ret = r;
b0067625
ZJS
4006 return 0; /* finito */
4007 }
4008
4009 /* CONTAINER_REBOOTED, loop again */
4010
4011 if (arg_keep_unit) {
4012 /* Special handling if we are running as a service: instead of simply
4013 * restarting the machine we want to restart the entire service, so let's
4014 * inform systemd about this with the special exit code 133. The service
4015 * file uses RestartForceExitStatus=133 so that this results in a full
4016 * nspawn restart. This is necessary since we might have cgroup parameters
4017 * set we want to have flushed out. */
2a49b612
ZJS
4018 *ret = EXIT_FORCE_RESTART;
4019 return 0; /* finito */
b0067625
ZJS
4020 }
4021
4022 expose_port_flush(arg_expose_ports, exposed);
4023
4024 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4025 *veth_created = false;
4026 return 1; /* loop again */
4027}
4028
bf428efb
LP
4029static int initialize_rlimits(void) {
4030
4031 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4032 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4033 * container execution environments. */
4034
4035 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4036 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4037 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4038 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4039 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4040 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4041 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4042 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4043 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4044 [RLIMIT_NICE] = { 0, 0 },
4045 [RLIMIT_NOFILE] = { 1024, 4096 },
4046 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4047 [RLIMIT_RTPRIO] = { 0, 0 },
4048 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4049 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4050
4051 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4052 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4053 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4054 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4055 * that PID 1 changes a number of other resource limits during early initialization which is why we
4056 * don't read the other limits from PID 1 but prefer the static table above. */
4057 };
4058
4059 int rl;
4060
4061 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4062
4063 /* Let's only fill in what the user hasn't explicitly configured anyway */
4064 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4065 const struct rlimit *v;
4066 struct rlimit buffer;
4067
4068 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4069 /* For these two let's read the limits off PID 1. See above for an explanation. */
4070
4071 if (prlimit(1, rl, NULL, &buffer) < 0)
4072 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4073
4074 v = &buffer;
4075 } else
4076 v = kernel_defaults + rl;
4077
4078 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4079 if (!arg_rlimit[rl])
4080 return log_oom();
4081 }
4082
4083 if (DEBUG_LOGGING) {
4084 _cleanup_free_ char *k = NULL;
4085
4086 (void) rlimit_format(arg_rlimit[rl], &k);
4087 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4088 }
4089 }
4090
4091 return 0;
4092}
4093
03cfe0d5
LP
4094int main(int argc, char *argv[]) {
4095
2d845785
LP
4096 _cleanup_free_ char *console = NULL;
4097 _cleanup_close_ int master = -1;
03cfe0d5 4098 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4099 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4100 char veth_name[IFNAMSIZ] = "";
17cbb288 4101 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4102 pid_t pid = 0;
03cfe0d5 4103 union in_addr_union exposed = {};
8e766630 4104 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4105 bool interactive, veth_created = false, remove_tmprootdir = false;
4106 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4107 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4108 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4109 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4110
4111 log_parse_environment();
4112 log_open();
415fc41c 4113
7732f92b
LP
4114 /* Make sure rename_process() in the stub init process can work */
4115 saved_argv = argv;
4116 saved_argc = argc;
4117
03cfe0d5
LP
4118 r = parse_argv(argc, argv);
4119 if (r <= 0)
4120 goto finish;
4121
fba868fa
LP
4122 r = must_be_root();
4123 if (r < 0)
03cfe0d5 4124 goto finish;
fba868fa 4125
bf428efb
LP
4126 r = initialize_rlimits();
4127 if (r < 0)
4128 goto finish;
4129
f757855e
LP
4130 r = determine_names();
4131 if (r < 0)
4132 goto finish;
4133
4134 r = load_settings();
4135 if (r < 0)
4136 goto finish;
4137
4138 r = verify_arguments();
4139 if (r < 0)
4140 goto finish;
03cfe0d5 4141
8199d554
LP
4142 r = detect_unified_cgroup_hierarchy_from_environment();
4143 if (r < 0)
4144 goto finish;
4145
03cfe0d5
LP
4146 n_fd_passed = sd_listen_fds(false);
4147 if (n_fd_passed > 0) {
4148 r = fdset_new_listen_fds(&fds, false);
4149 if (r < 0) {
4150 log_error_errno(r, "Failed to collect file descriptors: %m");
4151 goto finish;
4152 }
4153 }
4154
4155 if (arg_directory) {
4156 assert(!arg_image);
4157
4158 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4159 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4160 r = -EINVAL;
4161 goto finish;
4162 }
4163
4164 if (arg_ephemeral) {
4165 _cleanup_free_ char *np = NULL;
4166
8d4aa2bb 4167 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4168 if (r < 0)
4169 goto finish;
4170
03cfe0d5
LP
4171 /* If the specified path is a mount point we
4172 * generate the new snapshot immediately
4173 * inside it under a random name. However if
4174 * the specified is not a mount point we
4175 * create the new snapshot in the parent
4176 * directory, just next to it. */
e1873695 4177 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4178 if (r < 0) {
4179 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4180 goto finish;
4181 }
4182 if (r > 0)
770b5ce4 4183 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4184 else
770b5ce4 4185 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4186 if (r < 0) {
0f3be6ca 4187 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4188 goto finish;
4189 }
4190
4191 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4192 if (r < 0) {
4193 log_error_errno(r, "Failed to lock %s: %m", np);
4194 goto finish;
4195 }
4196
17cbb288
LP
4197 r = btrfs_subvol_snapshot(arg_directory, np,
4198 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4199 BTRFS_SNAPSHOT_FALLBACK_COPY |
4200 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4201 BTRFS_SNAPSHOT_RECURSIVE |
4202 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4203 if (r < 0) {
4204 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4205 goto finish;
ec16945e
LP
4206 }
4207
1cc6c93a 4208 free_and_replace(arg_directory, np);
ec16945e 4209
17cbb288 4210 remove_directory = true;
30535c16
LP
4211
4212 } else {
cb638b5e 4213 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4214 if (r < 0)
4215 goto finish;
4216
30535c16
LP
4217 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4218 if (r == -EBUSY) {
4219 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4220 goto finish;
4221 }
4222 if (r < 0) {
4223 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4224 goto finish;
30535c16
LP
4225 }
4226
4227 if (arg_template) {
8d4aa2bb 4228 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4229 if (r < 0)
4230 goto finish;
4231
17cbb288
LP
4232 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4233 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4234 BTRFS_SNAPSHOT_FALLBACK_COPY |
4235 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4236 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4237 BTRFS_SNAPSHOT_RECURSIVE |
4238 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4239 if (r == -EEXIST) {
4240 if (!arg_quiet)
4241 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4242 } else if (r < 0) {
83521414 4243 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4244 goto finish;
4245 } else {
4246 if (!arg_quiet)
4247 log_info("Populated %s from template %s.", arg_directory, arg_template);
4248 }
4249 }
ec16945e
LP
4250 }
4251
7732f92b 4252 if (arg_start_mode == START_BOOT) {
c9fe05e0
AR
4253 const char *p;
4254
4255 if (arg_pivot_root_new)
4256 p = prefix_roota(arg_directory, arg_pivot_root_new);
4257 else
4258 p = arg_directory;
4259
4260 if (path_is_os_tree(p) <= 0) {
4261 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4262 r = -EINVAL;
1b9e5b12
LP
4263 goto finish;
4264 }
4265 } else {
c9fe05e0
AR
4266 const char *p, *q;
4267
4268 if (arg_pivot_root_new)
4269 p = prefix_roota(arg_directory, arg_pivot_root_new);
4270 else
4271 p = arg_directory;
4272
4273 q = strjoina(p, "/usr/");
1b9e5b12 4274
c9fe05e0
AR
4275 if (laccess(q, F_OK) < 0) {
4276 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4277 r = -EINVAL;
1b9e5b12 4278 goto finish;
1b9e5b12
LP
4279 }
4280 }
ec16945e 4281
6b9132a9 4282 } else {
ec16945e
LP
4283 assert(arg_image);
4284 assert(!arg_template);
4285
8d4aa2bb 4286 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4287 if (r < 0)
4288 goto finish;
4289
0f3be6ca
LP
4290 if (arg_ephemeral) {
4291 _cleanup_free_ char *np = NULL;
4292
4293 r = tempfn_random(arg_image, "machine.", &np);
4294 if (r < 0) {
4295 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4296 goto finish;
4297 }
4298
4299 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4300 if (r < 0) {
4301 r = log_error_errno(r, "Failed to create image lock: %m");
4302 goto finish;
4303 }
4304
1c876927 4305 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
4306 if (r < 0) {
4307 r = log_error_errno(r, "Failed to copy image file: %m");
4308 goto finish;
4309 }
4310
1cc6c93a 4311 free_and_replace(arg_image, np);
0f3be6ca
LP
4312
4313 remove_image = true;
4314 } else {
4315 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4316 if (r == -EBUSY) {
4317 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4318 goto finish;
4319 }
4320 if (r < 0) {
4321 r = log_error_errno(r, "Failed to create image lock: %m");
4322 goto finish;
4323 }
4623e8e6 4324
78ebe980
LP
4325 if (!arg_root_hash) {
4326 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4327 if (r < 0) {
4328 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4329 goto finish;
4330 }
4331 }
30535c16
LP
4332 }
4333
c67b0082 4334 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4335 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4336 goto finish;
1b9e5b12 4337 }
6b9132a9 4338
c67b0082
LP
4339 remove_tmprootdir = true;
4340
4341 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4342 if (!arg_directory) {
4343 r = log_oom();
4344 goto finish;
6b9132a9 4345 }
88213476 4346
2d845785
LP
4347 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4348 if (r < 0) {
4349 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4350 goto finish;
4351 }
1b9e5b12 4352
4526113f 4353 r = dissect_image_and_warn(
e0f9e7bd 4354 loop->fd,
4526113f 4355 arg_image,
e0f9e7bd
LP
4356 arg_root_hash, arg_root_hash_size,
4357 DISSECT_IMAGE_REQUIRE_ROOT,
4358 &dissected_image);
2d845785 4359 if (r == -ENOPKG) {
4526113f 4360 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
4361 log_notice("Note that the disk image needs to\n"
4362 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4363 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4364 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4365 " d) or contain a file system without a partition table\n"
4366 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4367 goto finish;
2d845785 4368 }
4526113f 4369 if (r < 0)
842f3b0f 4370 goto finish;
1b9e5b12 4371
4623e8e6
LP
4372 if (!arg_root_hash && dissected_image->can_verity)
4373 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4374
4375 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4376 if (r < 0)
4377 goto finish;
0f3be6ca
LP
4378
4379 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4380 if (remove_image && unlink(arg_image) >= 0)
4381 remove_image = false;
842f3b0f 4382 }
842f3b0f 4383
86c0dd4a 4384 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4385 if (r < 0)
4386 goto finish;
4387
03cfe0d5
LP
4388 interactive =
4389 isatty(STDIN_FILENO) > 0 &&
4390 isatty(STDOUT_FILENO) > 0;
9c857b9d 4391
db7feb7e
LP
4392 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4393 if (master < 0) {
ec16945e 4394 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4395 goto finish;
4396 }
4397
611b312b
LP
4398 r = ptsname_malloc(master, &console);
4399 if (r < 0) {
4400 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4401 goto finish;
68b02049
DW
4402 }
4403
4404 if (arg_selinux_apifs_context) {
4405 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4406 if (r < 0)
4407 goto finish;
a258bf26
LP
4408 }
4409
a258bf26 4410 if (unlockpt(master) < 0) {
ec16945e 4411 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4412 goto finish;
4413 }
4414
9c857b9d
LP
4415 if (!arg_quiet)
4416 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4417 arg_machine, arg_image ?: arg_directory);
4418
72c0a2c2 4419 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4420
66edd963 4421 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4422 r = log_error_errno(errno, "Failed to become subreaper: %m");
4423 goto finish;
4424 }
4425
d87be9b0 4426 for (;;) {
b0067625
ZJS
4427 r = run(master,
4428 console,
2d845785 4429 dissected_image,
b0067625
ZJS
4430 interactive, secondary,
4431 fds,
4432 veth_name, &veth_created,
4433 &exposed,
4434 &pid, &ret);
4435 if (r <= 0)
d87be9b0 4436 break;
d87be9b0 4437 }
88213476
LP
4438
4439finish:
af4ec430 4440 sd_notify(false,
2a49b612
ZJS
4441 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4442 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4443
9444b1f2 4444 if (pid > 0)
c67b0082 4445 (void) kill(pid, SIGKILL);
88213476 4446
503546da 4447 /* Try to flush whatever is still queued in the pty */
6a0f896b 4448 if (master >= 0) {
1c876927 4449 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4450 master = safe_close(master);
4451 }
4452
4453 if (pid > 0)
4454 (void) wait_for_terminate(pid, NULL);
503546da 4455
50ebcf6c
LP
4456 pager_close();
4457
17cbb288 4458 if (remove_directory && arg_directory) {
ec16945e
LP
4459 int k;
4460
17cbb288 4461 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4462 if (k < 0)
17cbb288 4463 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4464 }
4465
0f3be6ca
LP
4466 if (remove_image && arg_image) {
4467 if (unlink(arg_image) < 0)
4468 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4469 }
4470
c67b0082
LP
4471 if (remove_tmprootdir) {
4472 if (rmdir(tmprootdir) < 0)
4473 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4474 }
4475
785890ac
LP
4476 if (arg_machine) {
4477 const char *p;
4478
63c372cb 4479 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4480 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4481 }
4482
7a8f6325 4483 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4484
4485 if (veth_created)
4486 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4487 (void) remove_bridge(arg_network_zone);
f757855e 4488
04d391da 4489 free(arg_directory);
ec16945e
LP
4490 free(arg_template);
4491 free(arg_image);
7027ff61 4492 free(arg_machine);
3a9530e5 4493 free(arg_hostname);
c74e630d 4494 free(arg_user);
b53ede69
PW
4495 free(arg_pivot_root_new);
4496 free(arg_pivot_root_old);
5f932eb9 4497 free(arg_chdir);
c74e630d 4498 strv_free(arg_setenv);
f757855e 4499 free(arg_network_bridge);
c74e630d
LP
4500 strv_free(arg_network_interfaces);
4501 strv_free(arg_network_macvlan);
4bbfe7ad 4502 strv_free(arg_network_ipvlan);
f6d6bad1 4503 strv_free(arg_network_veth_extra);
f757855e
LP
4504 strv_free(arg_parameters);
4505 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4506 expose_port_free_all(arg_expose_ports);
4623e8e6 4507 free(arg_root_hash);
bf428efb 4508 rlimit_free_all(arg_rlimit);
d107bb7d 4509 arg_cpuset = cpu_set_mfree(arg_cpuset);
6d0b55c2 4510
ec16945e 4511 return r < 0 ? EXIT_FAILURE : ret;
88213476 4512}