]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
sleep: fix one more printf format of a fiemap field
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
6b5cf3ea 4#include <blkid.h>
8fe0087e 5#endif
88213476 6#include <errno.h>
88213476 7#include <getopt.h>
0e7ac751 8#include <grp.h>
1b9e5b12 9#include <linux/loop.h>
0e7ac751 10#include <pwd.h>
8fe0087e 11#include <sched.h>
349cc4a5 12#if HAVE_SELINUX
8fe0087e 13#include <selinux/selinux.h>
1b9e5b12 14#endif
8fe0087e
LP
15#include <signal.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19#include <sys/file.h>
20#include <sys/mount.h>
21#include <sys/personality.h>
22#include <sys/prctl.h>
23#include <sys/types.h>
6916b164 24#include <sys/wait.h>
8fe0087e 25#include <unistd.h>
1b9e5b12 26
b053cd5f 27#include "sd-bus.h"
1f0cd86b 28#include "sd-daemon.h"
1f0cd86b 29#include "sd-id128.h"
8fe0087e 30
b5efdb8a 31#include "alloc-util.h"
8fe0087e
LP
32#include "barrier.h"
33#include "base-filesystem.h"
34#include "blkid-util.h"
35#include "btrfs-util.h"
b8ea7a6e 36#include "bus-error.h"
b053cd5f 37#include "bus-util.h"
8fe0087e 38#include "cap-list.h"
430f0182 39#include "capability-util.h"
04d391da 40#include "cgroup-util.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
4fc9982c 43#include "dev-setup.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3ffd4af2 46#include "fd-util.h"
842f3b0f 47#include "fdset.h"
a5c32cff 48#include "fileio.h"
f97b34a6 49#include "format-util.h"
f4f15635 50#include "fs-util.h"
1b9e5b12 51#include "gpt.h"
4623e8e6 52#include "hexdecoct.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
1b9cebf6 58#include "machine-image.h"
8fe0087e
LP
59#include "macro.h"
60#include "missing.h"
61#include "mkdir.h"
4349cd7c 62#include "mount-util.h"
8fe0087e 63#include "netlink-util.h"
07630cea 64#include "nspawn-cgroup.h"
3603efde 65#include "nspawn-def.h"
07630cea
LP
66#include "nspawn-expose-ports.h"
67#include "nspawn-mount.h"
68#include "nspawn-network.h"
7336138e 69#include "nspawn-patch-uid.h"
07630cea 70#include "nspawn-register.h"
910fd145 71#include "nspawn-seccomp.h"
07630cea
LP
72#include "nspawn-settings.h"
73#include "nspawn-setuid.h"
7732f92b 74#include "nspawn-stub-pid1.h"
d58ad743 75#include "os-util.h"
50ebcf6c 76#include "pager.h"
6bedfcbb 77#include "parse-util.h"
8fe0087e 78#include "path-util.h"
0b452006 79#include "process-util.h"
8fe0087e
LP
80#include "ptyfwd.h"
81#include "random-util.h"
8869a0b4 82#include "raw-clone.h"
bf428efb 83#include "rlimit-util.h"
8fe0087e 84#include "rm-rf.h"
68b02049 85#include "selinux-util.h"
8fe0087e 86#include "signal-util.h"
2583fbea 87#include "socket-util.h"
8fcde012 88#include "stat-util.h"
15a5e950 89#include "stdio-util.h"
5c828e66 90#include "string-table.h"
07630cea 91#include "string-util.h"
8fe0087e
LP
92#include "strv.h"
93#include "terminal-util.h"
94#include "udev-util.h"
affb60b1 95#include "umask-util.h"
b1d4f8e1 96#include "user-util.h"
8fe0087e 97#include "util.h"
e9642be2 98
62b1e758
YW
99#if HAVE_SPLIT_USR
100#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
101#else
102#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
103#endif
104
9c1e04d0
AP
105/* nspawn listens on a notification socket whose path is given by NSPAWN_NOTIFY_SOCKET_PATH.
106 * That path is relative to the container.
107 * The init process of the container can send messages to nspawn over it, following the sd_notify(3) protocol. */
108#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 109
2a49b612
ZJS
110#define EXIT_FORCE_RESTART 133
111
113cea80
DH
112typedef enum ContainerStatus {
113 CONTAINER_TERMINATED,
114 CONTAINER_REBOOTED
115} ContainerStatus;
116
88213476 117static char *arg_directory = NULL;
ec16945e 118static char *arg_template = NULL;
5f932eb9 119static char *arg_chdir = NULL;
b53ede69
PW
120static char *arg_pivot_root_new = NULL;
121static char *arg_pivot_root_old = NULL;
687d0825 122static char *arg_user = NULL;
9444b1f2 123static sd_id128_t arg_uuid = {};
3a9530e5
LP
124static char *arg_machine = NULL; /* The name used by the host to refer to this */
125static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
126static const char *arg_selinux_context = NULL;
127static const char *arg_selinux_apifs_context = NULL;
9444b1f2 128static const char *arg_slice = NULL;
ff01d048 129static bool arg_private_network = false;
bc2f673e 130static bool arg_read_only = false;
7732f92b 131static StartMode arg_start_mode = START_PID1;
ec16945e 132static bool arg_ephemeral = false;
57fb9fb5 133static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 134static bool arg_link_journal_try = false;
520e0d54 135static uint64_t arg_caps_retain =
50b52222
LP
136 (1ULL << CAP_AUDIT_CONTROL) |
137 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
138 (1ULL << CAP_CHOWN) |
139 (1ULL << CAP_DAC_OVERRIDE) |
140 (1ULL << CAP_DAC_READ_SEARCH) |
141 (1ULL << CAP_FOWNER) |
142 (1ULL << CAP_FSETID) |
143 (1ULL << CAP_IPC_OWNER) |
144 (1ULL << CAP_KILL) |
145 (1ULL << CAP_LEASE) |
146 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 147 (1ULL << CAP_MKNOD) |
5076f0cc
LP
148 (1ULL << CAP_NET_BIND_SERVICE) |
149 (1ULL << CAP_NET_BROADCAST) |
150 (1ULL << CAP_NET_RAW) |
5076f0cc 151 (1ULL << CAP_SETFCAP) |
50b52222 152 (1ULL << CAP_SETGID) |
5076f0cc
LP
153 (1ULL << CAP_SETPCAP) |
154 (1ULL << CAP_SETUID) |
155 (1ULL << CAP_SYS_ADMIN) |
50b52222 156 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
157 (1ULL << CAP_SYS_CHROOT) |
158 (1ULL << CAP_SYS_NICE) |
159 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 160 (1ULL << CAP_SYS_RESOURCE) |
50b52222 161 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538 162static CustomMount *arg_custom_mounts = NULL;
88614c8a 163static size_t arg_n_custom_mounts = 0;
f4889f65 164static char **arg_setenv = NULL;
284c0b91 165static bool arg_quiet = false;
eb91eb18 166static bool arg_register = true;
89f7c846 167static bool arg_keep_unit = false;
aa28aefe 168static char **arg_network_interfaces = NULL;
c74e630d 169static char **arg_network_macvlan = NULL;
4bbfe7ad 170static char **arg_network_ipvlan = NULL;
69c79d3c 171static bool arg_network_veth = false;
f6d6bad1 172static char **arg_network_veth_extra = NULL;
f757855e 173static char *arg_network_bridge = NULL;
22b28dfd 174static char *arg_network_zone = NULL;
d7bea6b6 175static char *arg_network_namespace_path = NULL;
050f7277 176static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 177static char *arg_image = NULL;
f757855e 178static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 179static ExposePort *arg_expose_ports = NULL;
f36933fe 180static char **arg_property = NULL;
0de7acce 181static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 182static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 183static bool arg_userns_chown = false;
c6c8f6e2 184static int arg_kill_signal = 0;
5da38d07 185static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
186static SettingsMask arg_settings_mask = 0;
187static int arg_settings_trusted = -1;
188static char **arg_parameters = NULL;
6aadfa4c 189static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 190static bool arg_notify_ready = false;
5a8ff0e6 191static bool arg_use_cgns = true;
0c582db0 192static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 193static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
194static void *arg_root_hash = NULL;
195static size_t arg_root_hash_size = 0;
960e4569
LP
196static char **arg_syscall_whitelist = NULL;
197static char **arg_syscall_blacklist = NULL;
bf428efb 198static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 199static bool arg_no_new_privileges = false;
81f345df
LP
200static int arg_oom_score_adjust = 0;
201static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
202static cpu_set_t *arg_cpuset = NULL;
203static unsigned arg_cpuset_ncpus = 0;
09d423e9 204static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 205static TimezoneMode arg_timezone = TIMEZONE_AUTO;
88213476 206
601185b4 207static void help(void) {
50ebcf6c
LP
208 (void) pager_open(false, false);
209
88213476
LP
210 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
211 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
212 " -h --help Show this help\n"
213 " --version Print version string\n"
69c79d3c 214 " -q --quiet Do not show status information\n"
1b9e5b12 215 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
216 " --template=PATH Initialize root directory from template directory,\n"
217 " if missing\n"
218 " -x --ephemeral Run container with snapshot of root directory, and\n"
219 " remove it after exit\n"
220 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 221 " --root-hash=HASH Specify verity root hash\n"
7732f92b 222 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 223 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 224 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
225 " --pivot-root=PATH[:PATH]\n"
226 " Pivot root to given directory in the container\n"
a8828ed9 227 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 228 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 229 " --hostname=NAME Override the hostname for the container\n"
69c79d3c 230 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 231 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 232 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 233 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 234 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 235 " Similar, but with user configured UID/GID range\n"
24597ee0 236 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
237 " --private-network Disable network in container\n"
238 " --network-interface=INTERFACE\n"
239 " Assign an existing network interface to the\n"
240 " container\n"
c74e630d
LP
241 " --network-macvlan=INTERFACE\n"
242 " Create a macvlan network interface based on an\n"
243 " existing network interface to the container\n"
4bbfe7ad
TG
244 " --network-ipvlan=INTERFACE\n"
245 " Create a ipvlan network interface based on an\n"
246 " existing network interface to the container\n"
a8eaaee7 247 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 248 " and container\n"
f6d6bad1
LP
249 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
250 " Add an additional virtual Ethernet link between\n"
251 " host and container\n"
ab046dde 252 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
253 " Add a virtual Ethernet connection to the container\n"
254 " and attach it to an existing bridge on the host\n"
255 " --network-zone=NAME Similar, but attach the new interface to an\n"
256 " an automatically managed bridge interface\n"
d7bea6b6
DP
257 " --network-namespace-path=PATH\n"
258 " Set network namespace to the one represented by\n"
259 " the specified kernel namespace file node\n"
6d0b55c2 260 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 261 " Expose a container IP port on the host\n"
82adf6af
LP
262 " -Z --selinux-context=SECLABEL\n"
263 " Set the SELinux security context to be used by\n"
264 " processes in the container\n"
265 " -L --selinux-apifs-context=SECLABEL\n"
266 " Set the SELinux security context to be used by\n"
267 " API/tmpfs file systems in the container\n"
a8828ed9
DW
268 " --capability=CAP In addition to the default, retain specified\n"
269 " capability\n"
270 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
271 " --system-call-filter=LIST|~LIST\n"
272 " Permit/prohibit specific system calls\n"
bf428efb 273 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
274 " --oom-score-adjust=VALUE\n"
275 " Adjust the OOM score value for the payload\n"
d107bb7d 276 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
c6c8f6e2 277 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
278 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
279 " host, try-guest, try-host\n"
574edc90 280 " -j Equivalent to --link-journal=try-guest\n"
09d423e9 281 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 282 " --timezone=MODE Select mode of /etc/localtime initialization\n"
69c79d3c 283 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
284 " --bind=PATH[:PATH[:OPTIONS]]\n"
285 " Bind mount a file or directory from the host into\n"
a8828ed9 286 " the container\n"
5e5bfa6e
EY
287 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
288 " Similar, but creates a read-only bind mount\n"
06c17c39 289 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
290 " --overlay=PATH[:PATH...]:PATH\n"
291 " Create an overlay mount from the host to \n"
292 " the container\n"
293 " --overlay-ro=PATH[:PATH...]:PATH\n"
294 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 295 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 296 " --register=BOOLEAN Register container as machine\n"
89f7c846 297 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 298 " the service unit nspawn is running in\n"
6d0b55c2 299 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 300 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 301 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 302 , program_invocation_short_name);
88213476
LP
303}
304
86c0dd4a 305static int custom_mount_check_all(void) {
88614c8a 306 size_t i;
5a8af538 307
5a8af538
LP
308 for (i = 0; i < arg_n_custom_mounts; i++) {
309 CustomMount *m = &arg_custom_mounts[i];
310
0de7acce 311 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
312
313 if (arg_userns_chown) {
314 log_error("--private-users-chown may not be combined with custom root mounts.");
315 return -EINVAL;
316 } else if (arg_uid_shift == UID_INVALID) {
317 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
318 return -EINVAL;
319 }
825d5287 320 }
5a8af538
LP
321 }
322
323 return 0;
324}
325
8199d554 326static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 327 const char *e;
415fc41c 328 int r;
5da38d07 329
efdb0237
LP
330 /* Allow the user to control whether the unified hierarchy is used */
331 e = getenv("UNIFIED_CGROUP_HIERARCHY");
332 if (e) {
333 r = parse_boolean(e);
334 if (r < 0)
335 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
336 if (r > 0)
337 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
338 else
339 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
340 }
341
8199d554
LP
342 return 0;
343}
344
345static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
346 int r;
347
348 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
349 * image actually supports. */
b4cccbc1
LP
350 r = cg_all_unified();
351 if (r < 0)
352 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
353 if (r > 0) {
a8725a06
ZJS
354 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
355 * routine only detects 231, so we'll have a false negative here for 230. */
356 r = systemd_installation_has_version(directory, 230);
357 if (r < 0)
358 return log_error_errno(r, "Failed to determine systemd version in container: %m");
359 if (r > 0)
360 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
361 else
362 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 363 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
364 /* Mixed cgroup hierarchy support was added in 233 */
365 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
366 if (r < 0)
367 return log_error_errno(r, "Failed to determine systemd version in container: %m");
368 if (r > 0)
369 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
370 else
371 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
372 } else
5da38d07 373 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 374
8199d554
LP
375 log_debug("Using %s hierarchy for container.",
376 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
377 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
378
efdb0237
LP
379 return 0;
380}
381
0c582db0
LB
382static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
383 int r;
384
385 r = getenv_bool(name);
386 if (r == -ENXIO)
387 return;
388 if (r < 0)
389 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
390 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
391}
392
4f086aab
SU
393static void parse_mount_settings_env(void) {
394 int r;
395 const char *e;
396
397 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
398 if (!e)
399 return;
400
401 if (streq(e, "network")) {
402 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
403 return;
404 }
405
406 r = parse_boolean(e);
407 if (r < 0) {
408 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
409 return;
ab8ee0f2 410 }
4f086aab 411
ab8ee0f2
ZJS
412 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
413 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
414}
415
88213476 416static int parse_argv(int argc, char *argv[]) {
a41fe3a2 417 enum {
acbeb427
ZJS
418 ARG_VERSION = 0x100,
419 ARG_PRIVATE_NETWORK,
bc2f673e 420 ARG_UUID,
5076f0cc 421 ARG_READ_ONLY,
57fb9fb5 422 ARG_CAPABILITY,
420c7379 423 ARG_DROP_CAPABILITY,
17fe0523
LP
424 ARG_LINK_JOURNAL,
425 ARG_BIND,
f4889f65 426 ARG_BIND_RO,
06c17c39 427 ARG_TMPFS,
5a8af538
LP
428 ARG_OVERLAY,
429 ARG_OVERLAY_RO,
eb91eb18 430 ARG_SHARE_SYSTEM,
89f7c846 431 ARG_REGISTER,
aa28aefe 432 ARG_KEEP_UNIT,
69c79d3c 433 ARG_NETWORK_INTERFACE,
c74e630d 434 ARG_NETWORK_MACVLAN,
4bbfe7ad 435 ARG_NETWORK_IPVLAN,
ab046dde 436 ARG_NETWORK_BRIDGE,
22b28dfd 437 ARG_NETWORK_ZONE,
f6d6bad1 438 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 439 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 440 ARG_PERSONALITY,
4d9f07b4 441 ARG_VOLATILE,
ec16945e 442 ARG_TEMPLATE,
f36933fe 443 ARG_PROPERTY,
6dac160c 444 ARG_PRIVATE_USERS,
c6c8f6e2 445 ARG_KILL_SIGNAL,
f757855e 446 ARG_SETTINGS,
5f932eb9 447 ARG_CHDIR,
b53ede69 448 ARG_PIVOT_ROOT,
7336138e 449 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 450 ARG_NOTIFY_READY,
4623e8e6 451 ARG_ROOT_HASH,
960e4569 452 ARG_SYSTEM_CALL_FILTER,
bf428efb 453 ARG_RLIMIT,
3a9530e5 454 ARG_HOSTNAME,
66edd963 455 ARG_NO_NEW_PRIVILEGES,
81f345df 456 ARG_OOM_SCORE_ADJUST,
d107bb7d 457 ARG_CPU_AFFINITY,
09d423e9 458 ARG_RESOLV_CONF,
1688841f 459 ARG_TIMEZONE,
a41fe3a2
LP
460 };
461
88213476 462 static const struct option options[] = {
d7bea6b6
DP
463 { "help", no_argument, NULL, 'h' },
464 { "version", no_argument, NULL, ARG_VERSION },
465 { "directory", required_argument, NULL, 'D' },
466 { "template", required_argument, NULL, ARG_TEMPLATE },
467 { "ephemeral", no_argument, NULL, 'x' },
468 { "user", required_argument, NULL, 'u' },
469 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
470 { "as-pid2", no_argument, NULL, 'a' },
471 { "boot", no_argument, NULL, 'b' },
472 { "uuid", required_argument, NULL, ARG_UUID },
473 { "read-only", no_argument, NULL, ARG_READ_ONLY },
474 { "capability", required_argument, NULL, ARG_CAPABILITY },
475 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 476 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
477 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
478 { "bind", required_argument, NULL, ARG_BIND },
479 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
480 { "tmpfs", required_argument, NULL, ARG_TMPFS },
481 { "overlay", required_argument, NULL, ARG_OVERLAY },
482 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
483 { "machine", required_argument, NULL, 'M' },
3a9530e5 484 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
485 { "slice", required_argument, NULL, 'S' },
486 { "setenv", required_argument, NULL, 'E' },
487 { "selinux-context", required_argument, NULL, 'Z' },
488 { "selinux-apifs-context", required_argument, NULL, 'L' },
489 { "quiet", no_argument, NULL, 'q' },
490 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
491 { "register", required_argument, NULL, ARG_REGISTER },
492 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
493 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
494 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
495 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
496 { "network-veth", no_argument, NULL, 'n' },
497 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
498 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
499 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
500 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
501 { "personality", required_argument, NULL, ARG_PERSONALITY },
502 { "image", required_argument, NULL, 'i' },
503 { "volatile", optional_argument, NULL, ARG_VOLATILE },
504 { "port", required_argument, NULL, 'p' },
505 { "property", required_argument, NULL, ARG_PROPERTY },
506 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
507 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
508 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
509 { "settings", required_argument, NULL, ARG_SETTINGS },
510 { "chdir", required_argument, NULL, ARG_CHDIR },
511 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
512 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
513 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
514 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 515 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 516 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 517 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 518 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 519 { "timezone", required_argument, NULL, ARG_TIMEZONE },
eb9da376 520 {}
88213476
LP
521 };
522
9444b1f2 523 int c, r;
6aadfa4c 524 const char *p, *e;
a42c8b54 525 uint64_t plus = 0, minus = 0;
f757855e 526 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
527
528 assert(argc >= 0);
529 assert(argv);
530
2e1f244e 531 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
532 switch (c) {
533
534 case 'h':
601185b4
ZJS
535 help();
536 return 0;
88213476 537
acbeb427 538 case ARG_VERSION:
3f6fd1ba 539 return version();
acbeb427 540
88213476 541 case 'D':
0f03c2a4 542 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 543 if (r < 0)
0f03c2a4 544 return r;
ec16945e
LP
545 break;
546
547 case ARG_TEMPLATE:
0f03c2a4 548 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 549 if (r < 0)
0f03c2a4 550 return r;
88213476
LP
551 break;
552
1b9e5b12 553 case 'i':
0f03c2a4 554 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 555 if (r < 0)
0f03c2a4 556 return r;
ec16945e
LP
557 break;
558
559 case 'x':
560 arg_ephemeral = true;
1b9e5b12
LP
561 break;
562
687d0825 563 case 'u':
2fc09a9c
DM
564 r = free_and_strdup(&arg_user, optarg);
565 if (r < 0)
7027ff61 566 return log_oom();
687d0825 567
f757855e 568 arg_settings_mask |= SETTING_USER;
687d0825
MV
569 break;
570
22b28dfd
LP
571 case ARG_NETWORK_ZONE: {
572 char *j;
573
574 j = strappend("vz-", optarg);
575 if (!j)
576 return log_oom();
577
578 if (!ifname_valid(j)) {
579 log_error("Network zone name not valid: %s", j);
580 free(j);
581 return -EINVAL;
582 }
583
df1fac6d 584 free_and_replace(arg_network_zone, j);
22b28dfd
LP
585
586 arg_network_veth = true;
587 arg_private_network = true;
588 arg_settings_mask |= SETTING_NETWORK;
589 break;
590 }
591
ab046dde 592 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
593
594 if (!ifname_valid(optarg)) {
595 log_error("Bridge interface name not valid: %s", optarg);
596 return -EINVAL;
597 }
598
f757855e
LP
599 r = free_and_strdup(&arg_network_bridge, optarg);
600 if (r < 0)
601 return log_oom();
ab046dde 602
4831981d 603 _fallthrough_;
0dfaa006 604 case 'n':
69c79d3c
LP
605 arg_network_veth = true;
606 arg_private_network = true;
f757855e 607 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
608 break;
609
f6d6bad1
LP
610 case ARG_NETWORK_VETH_EXTRA:
611 r = veth_extra_parse(&arg_network_veth_extra, optarg);
612 if (r < 0)
613 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
614
615 arg_private_network = true;
616 arg_settings_mask |= SETTING_NETWORK;
617 break;
618
aa28aefe 619 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
620 if (!ifname_valid(optarg)) {
621 log_error("Network interface name not valid: %s", optarg);
622 return -EINVAL;
623 }
624
c74e630d
LP
625 if (strv_extend(&arg_network_interfaces, optarg) < 0)
626 return log_oom();
627
628 arg_private_network = true;
f757855e 629 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
630 break;
631
632 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
633
634 if (!ifname_valid(optarg)) {
635 log_error("MACVLAN network interface name not valid: %s", optarg);
636 return -EINVAL;
637 }
638
c74e630d 639 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
640 return log_oom();
641
4bbfe7ad 642 arg_private_network = true;
f757855e 643 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
644 break;
645
646 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
647
648 if (!ifname_valid(optarg)) {
649 log_error("IPVLAN network interface name not valid: %s", optarg);
650 return -EINVAL;
651 }
652
4bbfe7ad
TG
653 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
654 return log_oom();
655
4831981d 656 _fallthrough_;
ff01d048
LP
657 case ARG_PRIVATE_NETWORK:
658 arg_private_network = true;
f757855e 659 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
660 break;
661
d7bea6b6
DP
662 case ARG_NETWORK_NAMESPACE_PATH:
663 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
664 if (r < 0)
665 return r;
666
667 break;
668
0f0dbc46 669 case 'b':
7732f92b
LP
670 if (arg_start_mode == START_PID2) {
671 log_error("--boot and --as-pid2 may not be combined.");
672 return -EINVAL;
673 }
674
675 arg_start_mode = START_BOOT;
676 arg_settings_mask |= SETTING_START_MODE;
677 break;
678
679 case 'a':
680 if (arg_start_mode == START_BOOT) {
681 log_error("--boot and --as-pid2 may not be combined.");
682 return -EINVAL;
683 }
684
685 arg_start_mode = START_PID2;
686 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
687 break;
688
144f0fc0 689 case ARG_UUID:
9444b1f2 690 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
691 if (r < 0)
692 return log_error_errno(r, "Invalid UUID: %s", optarg);
693
694 if (sd_id128_is_null(arg_uuid)) {
695 log_error("Machine UUID may not be all zeroes.");
696 return -EINVAL;
aa96c6cb 697 }
f757855e
LP
698
699 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 700 break;
aa96c6cb 701
9444b1f2 702 case 'S':
c74e630d 703 arg_slice = optarg;
144f0fc0
LP
704 break;
705
7027ff61 706 case 'M':
c1521918 707 if (isempty(optarg))
97b11eed 708 arg_machine = mfree(arg_machine);
c1521918 709 else {
0c3c4284 710 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
711 log_error("Invalid machine name: %s", optarg);
712 return -EINVAL;
713 }
7027ff61 714
0c3c4284
LP
715 r = free_and_strdup(&arg_machine, optarg);
716 if (r < 0)
eb91eb18 717 return log_oom();
eb91eb18 718 }
9ce6d1b3 719 break;
7027ff61 720
3a9530e5
LP
721 case ARG_HOSTNAME:
722 if (isempty(optarg))
723 arg_hostname = mfree(arg_hostname);
724 else {
725 if (!hostname_is_valid(optarg, false)) {
726 log_error("Invalid hostname: %s", optarg);
727 return -EINVAL;
728 }
729
730 r = free_and_strdup(&arg_hostname, optarg);
731 if (r < 0)
732 return log_oom();
733 }
734
735 arg_settings_mask |= SETTING_HOSTNAME;
736 break;
737
82adf6af
LP
738 case 'Z':
739 arg_selinux_context = optarg;
a8828ed9
DW
740 break;
741
82adf6af
LP
742 case 'L':
743 arg_selinux_apifs_context = optarg;
a8828ed9
DW
744 break;
745
bc2f673e
LP
746 case ARG_READ_ONLY:
747 arg_read_only = true;
f757855e 748 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
749 break;
750
420c7379
LP
751 case ARG_CAPABILITY:
752 case ARG_DROP_CAPABILITY: {
6cbe4ed1 753 p = optarg;
9ed794a3 754 for (;;) {
6cbe4ed1 755 _cleanup_free_ char *t = NULL;
5076f0cc 756
6cbe4ed1
SS
757 r = extract_first_word(&p, &t, ",", 0);
758 if (r < 0)
759 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 760
6cbe4ed1
SS
761 if (r == 0)
762 break;
5076f0cc 763
39ed67d1
LP
764 if (streq(t, "all")) {
765 if (c == ARG_CAPABILITY)
a42c8b54 766 plus = (uint64_t) -1;
39ed67d1 767 else
a42c8b54 768 minus = (uint64_t) -1;
39ed67d1 769 } else {
2822da4f
LP
770 int cap;
771
772 cap = capability_from_name(t);
773 if (cap < 0) {
39ed67d1
LP
774 log_error("Failed to parse capability %s.", t);
775 return -EINVAL;
776 }
777
778 if (c == ARG_CAPABILITY)
a42c8b54 779 plus |= 1ULL << (uint64_t) cap;
39ed67d1 780 else
a42c8b54 781 minus |= 1ULL << (uint64_t) cap;
5076f0cc 782 }
5076f0cc
LP
783 }
784
f757855e 785 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
786 break;
787 }
788
66edd963
LP
789 case ARG_NO_NEW_PRIVILEGES:
790 r = parse_boolean(optarg);
791 if (r < 0)
792 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
793
794 arg_no_new_privileges = r;
795 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
796 break;
797
57fb9fb5
LP
798 case 'j':
799 arg_link_journal = LINK_GUEST;
574edc90 800 arg_link_journal_try = true;
4e1d6aa9 801 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
802 break;
803
804 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
805 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
806 if (r < 0) {
807 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
808 return -EINVAL;
809 }
810
4e1d6aa9 811 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
812 break;
813
17fe0523 814 case ARG_BIND:
f757855e
LP
815 case ARG_BIND_RO:
816 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
817 if (r < 0)
818 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 819
f757855e 820 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 821 break;
06c17c39 822
f757855e
LP
823 case ARG_TMPFS:
824 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
825 if (r < 0)
826 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 827
f757855e 828 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 829 break;
5a8af538
LP
830
831 case ARG_OVERLAY:
ad85779a
LP
832 case ARG_OVERLAY_RO:
833 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
834 if (r == -EADDRNOTAVAIL)
835 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
836 if (r < 0)
837 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 838
f757855e 839 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 840 break;
06c17c39 841
a5f1cb3b 842 case 'E': {
f4889f65
LP
843 char **n;
844
845 if (!env_assignment_is_valid(optarg)) {
846 log_error("Environment variable assignment '%s' is not valid.", optarg);
847 return -EINVAL;
848 }
849
850 n = strv_env_set(arg_setenv, optarg);
851 if (!n)
852 return log_oom();
853
130d3d22 854 strv_free_and_replace(arg_setenv, n);
f757855e 855 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
856 break;
857 }
858
284c0b91
LP
859 case 'q':
860 arg_quiet = true;
861 break;
862
8a96d94e 863 case ARG_SHARE_SYSTEM:
a6b5216c 864 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 865 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 866 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 867 arg_clone_ns_flags = 0;
8a96d94e
LP
868 break;
869
eb91eb18
LP
870 case ARG_REGISTER:
871 r = parse_boolean(optarg);
872 if (r < 0) {
873 log_error("Failed to parse --register= argument: %s", optarg);
874 return r;
875 }
876
877 arg_register = r;
878 break;
879
89f7c846
LP
880 case ARG_KEEP_UNIT:
881 arg_keep_unit = true;
882 break;
883
6afc95b7
LP
884 case ARG_PERSONALITY:
885
ac45f971 886 arg_personality = personality_from_string(optarg);
050f7277 887 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
888 log_error("Unknown or unsupported personality '%s'.", optarg);
889 return -EINVAL;
890 }
891
f757855e 892 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
893 break;
894
4d9f07b4
LP
895 case ARG_VOLATILE:
896
897 if (!optarg)
f757855e 898 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
899 else if (streq(optarg, "help")) {
900 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
901 return 0;
902 } else {
f757855e 903 VolatileMode m;
4d9f07b4 904
f757855e
LP
905 m = volatile_mode_from_string(optarg);
906 if (m < 0) {
907 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 908 return -EINVAL;
f757855e
LP
909 } else
910 arg_volatile_mode = m;
6d0b55c2
LP
911 }
912
f757855e
LP
913 arg_settings_mask |= SETTING_VOLATILE_MODE;
914 break;
6d0b55c2 915
f757855e
LP
916 case 'p':
917 r = expose_port_parse(&arg_expose_ports, optarg);
918 if (r == -EEXIST)
919 return log_error_errno(r, "Duplicate port specification: %s", optarg);
920 if (r < 0)
921 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 922
f757855e 923 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 924 break;
6d0b55c2 925
f36933fe
LP
926 case ARG_PROPERTY:
927 if (strv_extend(&arg_property, optarg) < 0)
928 return log_oom();
929
930 break;
931
ae209204
ZJS
932 case ARG_PRIVATE_USERS: {
933 int boolean = -1;
0de7acce 934
ae209204
ZJS
935 if (!optarg)
936 boolean = true;
937 else if (!in_charset(optarg, DIGITS))
938 /* do *not* parse numbers as booleans */
939 boolean = parse_boolean(optarg);
940
941 if (boolean == false) {
0de7acce
LP
942 /* no: User namespacing off */
943 arg_userns_mode = USER_NAMESPACE_NO;
944 arg_uid_shift = UID_INVALID;
945 arg_uid_range = UINT32_C(0x10000);
ae209204 946 } else if (boolean == true) {
0de7acce
LP
947 /* yes: User namespacing on, UID range is read from root dir */
948 arg_userns_mode = USER_NAMESPACE_FIXED;
949 arg_uid_shift = UID_INVALID;
950 arg_uid_range = UINT32_C(0x10000);
951 } else if (streq(optarg, "pick")) {
952 /* pick: User namespacing on, UID range is picked randomly */
953 arg_userns_mode = USER_NAMESPACE_PICK;
954 arg_uid_shift = UID_INVALID;
955 arg_uid_range = UINT32_C(0x10000);
956 } else {
6c2058b3 957 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
958 const char *range, *shift;
959
0de7acce
LP
960 /* anything else: User namespacing on, UID range is explicitly configured */
961
6dac160c
LP
962 range = strchr(optarg, ':');
963 if (range) {
6c2058b3
ZJS
964 buffer = strndup(optarg, range - optarg);
965 if (!buffer)
966 return log_oom();
967 shift = buffer;
6dac160c
LP
968
969 range++;
bfd292ec
ZJS
970 r = safe_atou32(range, &arg_uid_range);
971 if (r < 0)
be715731 972 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
973 } else
974 shift = optarg;
975
be715731
ZJS
976 r = parse_uid(shift, &arg_uid_shift);
977 if (r < 0)
978 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
979
980 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
981 }
982
be715731
ZJS
983 if (arg_uid_range <= 0) {
984 log_error("UID range cannot be 0.");
985 return -EINVAL;
986 }
987
0de7acce 988 arg_settings_mask |= SETTING_USERNS;
6dac160c 989 break;
ae209204 990 }
6dac160c 991
0de7acce 992 case 'U':
ccabee0d
LP
993 if (userns_supported()) {
994 arg_userns_mode = USER_NAMESPACE_PICK;
995 arg_uid_shift = UID_INVALID;
996 arg_uid_range = UINT32_C(0x10000);
997
998 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
999 }
1000
7336138e
LP
1001 break;
1002
0de7acce 1003 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1004 arg_userns_chown = true;
0de7acce
LP
1005
1006 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1007 break;
1008
c6c8f6e2 1009 case ARG_KILL_SIGNAL:
5c828e66
LP
1010 if (streq(optarg, "help")) {
1011 DUMP_STRING_TABLE(signal, int, _NSIG);
1012 return 0;
1013 }
1014
29a3db75 1015 arg_kill_signal = signal_from_string(optarg);
c6c8f6e2
LP
1016 if (arg_kill_signal < 0) {
1017 log_error("Cannot parse signal: %s", optarg);
1018 return -EINVAL;
1019 }
1020
f757855e
LP
1021 arg_settings_mask |= SETTING_KILL_SIGNAL;
1022 break;
1023
1024 case ARG_SETTINGS:
1025
1026 /* no → do not read files
1027 * yes → read files, do not override cmdline, trust only subset
1028 * override → read files, override cmdline, trust only subset
1029 * trusted → read files, do not override cmdline, trust all
1030 */
1031
1032 r = parse_boolean(optarg);
1033 if (r < 0) {
1034 if (streq(optarg, "trusted")) {
1035 mask_all_settings = false;
1036 mask_no_settings = false;
1037 arg_settings_trusted = true;
1038
1039 } else if (streq(optarg, "override")) {
1040 mask_all_settings = false;
1041 mask_no_settings = true;
1042 arg_settings_trusted = -1;
1043 } else
1044 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1045 } else if (r > 0) {
1046 /* yes */
1047 mask_all_settings = false;
1048 mask_no_settings = false;
1049 arg_settings_trusted = -1;
1050 } else {
1051 /* no */
1052 mask_all_settings = true;
1053 mask_no_settings = false;
1054 arg_settings_trusted = false;
1055 }
1056
c6c8f6e2
LP
1057 break;
1058
5f932eb9
LP
1059 case ARG_CHDIR:
1060 if (!path_is_absolute(optarg)) {
1061 log_error("Working directory %s is not an absolute path.", optarg);
1062 return -EINVAL;
1063 }
1064
1065 r = free_and_strdup(&arg_chdir, optarg);
1066 if (r < 0)
1067 return log_oom();
1068
1069 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1070 break;
1071
b53ede69
PW
1072 case ARG_PIVOT_ROOT:
1073 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1074 if (r < 0)
1075 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1076
1077 arg_settings_mask |= SETTING_PIVOT_ROOT;
1078 break;
1079
9c1e04d0
AP
1080 case ARG_NOTIFY_READY:
1081 r = parse_boolean(optarg);
1082 if (r < 0) {
1083 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1084 return -EINVAL;
1085 }
1086 arg_notify_ready = r;
1087 arg_settings_mask |= SETTING_NOTIFY_READY;
1088 break;
1089
4623e8e6
LP
1090 case ARG_ROOT_HASH: {
1091 void *k;
1092 size_t l;
1093
1094 r = unhexmem(optarg, strlen(optarg), &k, &l);
1095 if (r < 0)
1096 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1097 if (l < sizeof(sd_id128_t)) {
1098 log_error("Root hash must be at least 128bit long: %s", optarg);
1099 free(k);
1100 return -EINVAL;
1101 }
1102
1103 free(arg_root_hash);
1104 arg_root_hash = k;
1105 arg_root_hash_size = l;
1106 break;
1107 }
1108
960e4569
LP
1109 case ARG_SYSTEM_CALL_FILTER: {
1110 bool negative;
1111 const char *items;
1112
1113 negative = optarg[0] == '~';
1114 items = negative ? optarg + 1 : optarg;
1115
1116 for (;;) {
1117 _cleanup_free_ char *word = NULL;
1118
1119 r = extract_first_word(&items, &word, NULL, 0);
1120 if (r == 0)
1121 break;
1122 if (r == -ENOMEM)
1123 return log_oom();
1124 if (r < 0)
1125 return log_error_errno(r, "Failed to parse system call filter: %m");
1126
1127 if (negative)
1128 r = strv_extend(&arg_syscall_blacklist, word);
1129 else
1130 r = strv_extend(&arg_syscall_whitelist, word);
1131 if (r < 0)
1132 return log_oom();
1133 }
1134
1135 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1136 break;
1137 }
1138
bf428efb
LP
1139 case ARG_RLIMIT: {
1140 const char *eq;
1141 char *name;
1142 int rl;
1143
5c828e66
LP
1144 if (streq(optarg, "help")) {
1145 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1146 return 0;
1147 }
1148
bf428efb
LP
1149 eq = strchr(optarg, '=');
1150 if (!eq) {
1151 log_error("--rlimit= expects an '=' assignment.");
1152 return -EINVAL;
1153 }
1154
1155 name = strndup(optarg, eq - optarg);
1156 if (!name)
1157 return log_oom();
1158
1159 rl = rlimit_from_string_harder(name);
1160 if (rl < 0) {
1161 log_error("Unknown resource limit: %s", name);
1162 return -EINVAL;
1163 }
1164
1165 if (!arg_rlimit[rl]) {
1166 arg_rlimit[rl] = new0(struct rlimit, 1);
1167 if (!arg_rlimit[rl])
1168 return log_oom();
1169 }
1170
1171 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1172 if (r < 0)
1173 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1174
1175 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1176 break;
1177 }
1178
81f345df
LP
1179 case ARG_OOM_SCORE_ADJUST:
1180 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1181 if (r < 0)
1182 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1183
1184 arg_oom_score_adjust_set = true;
1185 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1186 break;
1187
d107bb7d
LP
1188 case ARG_CPU_AFFINITY: {
1189 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1190
1191 r = parse_cpu_set(optarg, &cpuset);
1192 if (r < 0)
1193 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1194
1195 if (arg_cpuset)
1196 CPU_FREE(arg_cpuset);
1197
1198 arg_cpuset = TAKE_PTR(cpuset);
1199 arg_cpuset_ncpus = r;
1200 arg_settings_mask |= SETTING_CPU_AFFINITY;
1201 break;
1202 }
1203
09d423e9
LP
1204 case ARG_RESOLV_CONF:
1205 if (streq(optarg, "help")) {
1206 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1207 return 0;
1208 }
1209
1210 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1211 if (arg_resolv_conf < 0) {
1212 log_error("Failed to parse /etc/resolv.conf mode: %s", optarg);
1213 return -EINVAL;
1214 }
1215
1216 arg_settings_mask |= SETTING_RESOLV_CONF;
1217 break;
1218
1688841f
LP
1219 case ARG_TIMEZONE:
1220 if (streq(optarg, "help")) {
1221 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1222 return 0;
1223 }
1224
1225 arg_timezone = timezone_mode_from_string(optarg);
1226 if (arg_timezone < 0) {
1227 log_error("Failed to parse /etc/localtime mode: %s", optarg);
1228 return -EINVAL;
1229 }
1230
1231 arg_settings_mask |= SETTING_TIMEZONE;
1232 break;
1233
88213476
LP
1234 case '?':
1235 return -EINVAL;
1236
1237 default:
eb9da376 1238 assert_not_reached("Unhandled option");
88213476 1239 }
88213476 1240
d7bea6b6
DP
1241 /* If --network-namespace-path is given with any other network-related option,
1242 * we need to error out, to avoid conflicts between different network options. */
1243 if (arg_network_namespace_path &&
1244 (arg_network_interfaces || arg_network_macvlan ||
1245 arg_network_ipvlan || arg_network_veth_extra ||
1246 arg_network_bridge || arg_network_zone ||
1247 arg_network_veth || arg_private_network)) {
1248 log_error("--network-namespace-path cannot be combined with other network options.");
1249 return -EINVAL;
1250 }
1251
0c582db0
LB
1252 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1253 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1254 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1255 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1256
4f086aab
SU
1257 if (arg_userns_mode != USER_NAMESPACE_NO)
1258 arg_mount_settings |= MOUNT_USE_USERNS;
1259
1260 if (arg_private_network)
1261 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1262
1263 parse_mount_settings_env();
1264
48a8d337
LB
1265 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1266 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1267 arg_register = false;
0c582db0
LB
1268 if (arg_start_mode != START_PID1) {
1269 log_error("--boot cannot be used without namespacing.");
1270 return -EINVAL;
1271 }
1272 }
eb91eb18 1273
0de7acce 1274 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1275 arg_userns_chown = true;
1276
cd2dfc6f 1277 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
8d9c2bca
AJ
1278 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1279 * The latter is not technically a user session, but we don't need to labour the point. */
cd2dfc6f 1280 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846
LP
1281 return -EINVAL;
1282 }
1283
1b9e5b12
LP
1284 if (arg_directory && arg_image) {
1285 log_error("--directory= and --image= may not be combined.");
1286 return -EINVAL;
1287 }
1288
ec16945e
LP
1289 if (arg_template && arg_image) {
1290 log_error("--template= and --image= may not be combined.");
1291 return -EINVAL;
1292 }
1293
8cd328d8
LP
1294 if (arg_ephemeral && arg_template && !arg_directory) {
1295 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1296 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1297 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1298 * --directory=". */
1299
ae2a15bc 1300 arg_directory = TAKE_PTR(arg_template);
8cd328d8
LP
1301 }
1302
ec16945e
LP
1303 if (arg_template && !(arg_directory || arg_machine)) {
1304 log_error("--template= needs --directory= or --machine=.");
1305 return -EINVAL;
1306 }
1307
1308 if (arg_ephemeral && arg_template) {
1309 log_error("--ephemeral and --template= may not be combined.");
1310 return -EINVAL;
1311 }
1312
df9a75e4
LP
1313 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1314 log_error("--ephemeral and --link-journal= may not be combined.");
1315 return -EINVAL;
1316 }
1317
ccabee0d 1318 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1319 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1320 return -EOPNOTSUPP;
1321 }
1322
1323 if (arg_userns_chown && arg_read_only) {
1324 log_error("--read-only and --private-users-chown may not be combined.");
1325 return -EINVAL;
1326 }
f757855e 1327
22b28dfd
LP
1328 if (arg_network_bridge && arg_network_zone) {
1329 log_error("--network-bridge= and --network-zone= may not be combined.");
1330 return -EINVAL;
1331 }
1332
f757855e
LP
1333 if (argc > optind) {
1334 arg_parameters = strv_copy(argv + optind);
1335 if (!arg_parameters)
1336 return log_oom();
1337
7732f92b 1338 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1339 }
1340
1341 /* Load all settings from .nspawn files */
1342 if (mask_no_settings)
1343 arg_settings_mask = 0;
1344
1345 /* Don't load any settings from .nspawn files */
1346 if (mask_all_settings)
1347 arg_settings_mask = _SETTINGS_MASK_ALL;
1348
520e0d54 1349 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1350
399e391f
ZJS
1351 r = cg_unified_flush();
1352 if (r < 0)
1353 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1354
6aadfa4c
ILG
1355 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1356 if (e)
1357 arg_container_service_name = e;
1358
5a8ff0e6
CB
1359 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1360 if (r < 0)
1361 arg_use_cgns = cg_ns_supported();
1362 else
1363 arg_use_cgns = r;
1364
86c0dd4a
LP
1365 r = custom_mount_check_all();
1366 if (r < 0)
1367 return r;
1368
f757855e
LP
1369 return 1;
1370}
1371
1372static int verify_arguments(void) {
4f086aab
SU
1373 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1374 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1375 return -EINVAL;
1376 }
1377
1378 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1379 log_error("Cannot combine --private-users with read-write mounts.");
1380 return -EINVAL;
1381 }
f757855e
LP
1382
1383 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1384 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1385 return -EINVAL;
1386 }
1387
6d0b55c2
LP
1388 if (arg_expose_ports && !arg_private_network) {
1389 log_error("Cannot use --port= without private networking.");
1390 return -EINVAL;
1391 }
1392
349cc4a5 1393#if ! HAVE_LIBIPTC
1c1ea217
EV
1394 if (arg_expose_ports) {
1395 log_error("--port= is not supported, compiled without libiptc support.");
1396 return -EOPNOTSUPP;
1397 }
1398#endif
1399
7732f92b 1400 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1401 arg_kill_signal = SIGRTMIN+3;
1402
f757855e 1403 return 0;
88213476
LP
1404}
1405
03cfe0d5
LP
1406static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1407 assert(p);
1408
0de7acce 1409 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1410 return 0;
1411
1412 if (uid == UID_INVALID && gid == GID_INVALID)
1413 return 0;
1414
1415 if (uid != UID_INVALID) {
1416 uid += arg_uid_shift;
1417
1418 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1419 return -EOVERFLOW;
1420 }
1421
1422 if (gid != GID_INVALID) {
1423 gid += (gid_t) arg_uid_shift;
1424
1425 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1426 return -EOVERFLOW;
1427 }
1428
1429 if (lchown(p, uid, gid) < 0)
1430 return -errno;
b12afc8c
LP
1431
1432 return 0;
1433}
1434
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands it to
 * userns_lchown() with the specified uid/gid.
 *
 * A directory that already exists is left untouched and reported as success (0). Any other mkdir failure is
 * returned as-is; otherwise the result of userns_lchown() is returned. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r >= 0)
                return userns_lchown(full, uid, gid);

        return r == -EEXIST ? 0 : r;
}
1448
1688841f
LP
/* Given the target of a localtime symlink, returns what path_startswith() yields for the zoneinfo directory
 * prefix, trying the relative form "../usr/share/zoneinfo/" first and the absolute form
 * "/usr/share/zoneinfo/" second. Returns NULL if neither prefix matches. */
static const char *timezone_from_path(const char *path) {
        const char *z;

        z = path_startswith(path, "../usr/share/zoneinfo/");
        if (!z)
                z = path_startswith(path, "/usr/share/zoneinfo/");

        return z;
}
1462
e58a1277 1463static int setup_timezone(const char *dest) {
1688841f
LP
1464 _cleanup_free_ char *p = NULL, *etc = NULL;
1465 const char *where, *check;
1466 TimezoneMode m;
d4036145 1467 int r;
f8440af5 1468
e58a1277
LP
1469 assert(dest);
1470
1688841f 1471 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1472 r = readlink_malloc("/etc/localtime", &p);
1473 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1474 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE;
1475 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1476 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY;
1477 else if (r < 0) {
1478 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1479 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1480 * file.
1481 *
1482 * Example:
1483 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1484 */
1485 return 0;
1486 } else if (arg_timezone == TIMEZONE_AUTO)
1487 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK;
1488 else
1489 m = arg_timezone;
1490 } else
1491 m = arg_timezone;
1492
1493 if (m == TIMEZONE_OFF)
1494 return 0;
1495
1496 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
d4036145 1497 if (r < 0) {
1688841f 1498 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1499 return 0;
1500 }
1501
1688841f
LP
1502 where = strjoina(etc, "/localtime");
1503
1504 switch (m) {
1505
1506 case TIMEZONE_DELETE:
1507 if (unlink(where) < 0)
1508 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1509
d4036145 1510 return 0;
d4036145 1511
1688841f
LP
1512 case TIMEZONE_SYMLINK: {
1513 _cleanup_free_ char *q = NULL;
1514 const char *z, *what;
4d1c38b8 1515
1688841f
LP
1516 z = timezone_from_path(p);
1517 if (!z) {
1518 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1519 return 0;
1688841f 1520 }
d4036145 1521
1688841f
LP
1522 r = readlink_malloc(where, &q);
1523 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1524 return 0; /* Already pointing to the right place? Then do nothing .. */
1525
1526 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1527 r = chase_symlinks(check, dest, 0, NULL);
1528 if (r < 0)
1529 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1530 else {
1531 if (unlink(where) < 0 && errno != ENOENT) {
1532 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1533 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1534 return 0;
1535 }
1536
1537 what = strjoina("../usr/share/zoneinfo/", z);
1538 if (symlink(what, where) < 0) {
1539 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1540 errno, "Failed to correct timezone of container, ignoring: %m");
1541 return 0;
1542 }
1543
1544 break;
1545 }
1546
1547 _fallthrough_;
d4036145 1548 }
68fb0892 1549
1688841f
LP
1550 case TIMEZONE_BIND: {
1551 _cleanup_free_ char *resolved = NULL;
1552 int found;
1553
1554 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1555 if (found < 0) {
1556 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1557 return 0;
1558 }
1559
1560 if (found == 0) /* missing? */
1561 (void) touch(resolved);
1562
1563 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1564 if (r >= 0)
1565 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1566
1567 _fallthrough_;
79d80fc1 1568 }
4d9f07b4 1569
1688841f
LP
1570 case TIMEZONE_COPY:
1571 /* If mounting failed, try to copy */
1572 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1573 if (r < 0) {
1574 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1575 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1576 return 0;
1577 }
1578
1579 break;
1580
1581 default:
1582 assert_not_reached("unexpected mode");
d4036145 1583 }
e58a1277 1584
1688841f 1585 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1586 r = userns_lchown(where, 0, 0);
1587 if (r < 0)
1688841f 1588 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1589
e58a1277 1590 return 0;
88213476
LP
1591}
1592
09d423e9
LP
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if access() fails with ENOENT,
 * and a negative errno-style value (after logging at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1605
7357272e 1606static int resolved_listening(void) {
b8ea7a6e 1607 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1608 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1609 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1610 int r;
1611
7357272e 1612 /* Check if resolved is listening */
b053cd5f
LP
1613
1614 r = sd_bus_open_system(&bus);
1615 if (r < 0)
b8ea7a6e 1616 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1617
7357272e 1618 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1619 if (r < 0)
1620 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1621 if (r == 0)
1622 return 0;
7357272e
DM
1623
1624 r = sd_bus_get_property_string(bus,
1625 "org.freedesktop.resolve1",
1626 "/org/freedesktop/resolve1",
1627 "org.freedesktop.resolve1.Manager",
1628 "DNSStubListener",
b8ea7a6e 1629 &error,
7357272e
DM
1630 &dns_stub_listener_mode);
1631 if (r < 0)
b8ea7a6e 1632 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1633
1634 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1635}
1636
2547bb41 1637static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1638 _cleanup_free_ char *etc = NULL;
1639 const char *where, *what;
1640 ResolvConfMode m;
1641 int r;
2547bb41
LP
1642
1643 assert(dest);
1644
09d423e9
LP
1645 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1646 if (arg_private_network)
1647 m = RESOLV_CONF_OFF;
1648 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
1649 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1650 * container, so that the container can use the host's resolver. Given that network namespacing is
1651 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1652 * advantage that the container will be able to follow the host's DNS server configuration changes
1653 * transparently. */
1654 m = RESOLV_CONF_BIND_STATIC;
1655 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1656 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
1657 else
1658 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
1659 } else
1660 m = arg_resolv_conf;
1661
1662 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1663 return 0;
1664
87447ae4
LP
1665 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1666 if (r < 0) {
1667 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1668 return 0;
1669 }
1670
1671 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1672
1673 if (m == RESOLV_CONF_DELETE) {
1674 if (unlink(where) < 0)
1675 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1676
87447ae4
LP
1677 return 0;
1678 }
79d80fc1 1679
09d423e9
LP
1680 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1681 what = STATIC_RESOLV_CONF;
1682 else
1683 what = "/etc/resolv.conf";
87447ae4 1684
09d423e9
LP
1685 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1686 _cleanup_free_ char *resolved = NULL;
1687 int found;
1688
1689 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1690 if (found < 0) {
1691 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1692 return 0;
1693 }
3539724c 1694
87447ae4
LP
1695 if (found == 0) /* missing? */
1696 (void) touch(resolved);
5367354d 1697
09d423e9 1698 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1699 if (r >= 0)
87447ae4 1700 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1701 }
1702
1703 /* If that didn't work, let's copy the file */
09d423e9 1704 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1705 if (r < 0) {
3539724c
LP
1706 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1707 * resolved or something similar runs inside and the symlink points there.
68a313c5 1708 *
3539724c 1709 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1710 */
09d423e9 1711 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1712 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1713 return 0;
1714 }
2547bb41 1715
03cfe0d5
LP
1716 r = userns_lchown(where, 0, 0);
1717 if (r < 0)
3539724c 1718 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1719
2547bb41
LP
1720 return 0;
1721}
1722
1e4f1671 1723static int setup_boot_id(void) {
cdde6ba6
LP
1724 _cleanup_(unlink_and_freep) char *from = NULL;
1725 _cleanup_free_ char *path = NULL;
3bbaff3e 1726 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1727 const char *to;
04bc4a3f
LP
1728 int r;
1729
04bc4a3f
LP
1730 /* Generate a new randomized boot ID, so that each boot-up of
1731 * the container gets a new one */
1732
cdde6ba6
LP
1733 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1734 if (r < 0)
1735 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1736
1737 r = sd_id128_randomize(&rnd);
f647962d
MS
1738 if (r < 0)
1739 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1740
cdde6ba6 1741 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1742 if (r < 0)
1743 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1744
cdde6ba6
LP
1745 from = TAKE_PTR(path);
1746 to = "/proc/sys/kernel/random/boot_id";
1747
60e76d48 1748 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1749 if (r < 0)
1750 return r;
04bc4a3f 1751
cdde6ba6 1752 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1753}
1754
e58a1277 1755static int copy_devnodes(const char *dest) {
88213476
LP
1756 static const char devnodes[] =
1757 "null\0"
1758 "zero\0"
1759 "full\0"
1760 "random\0"
1761 "urandom\0"
85614d66
TG
1762 "tty\0"
1763 "net/tun\0";
88213476
LP
1764
1765 const char *d;
e58a1277 1766 int r = 0;
7fd1b19b 1767 _cleanup_umask_ mode_t u;
a258bf26
LP
1768
1769 assert(dest);
124640f1
LP
1770
1771 u = umask(0000);
88213476 1772
03cfe0d5
LP
1773 /* Create /dev/net, so that we can create /dev/net/tun in it */
1774 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1775 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1776
88213476 1777 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1778 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1779 struct stat st;
88213476 1780
7f112f50 1781 from = strappend("/dev/", d);
03cfe0d5 1782 to = prefix_root(dest, from);
88213476
LP
1783
1784 if (stat(from, &st) < 0) {
1785
4a62c710
MS
1786 if (errno != ENOENT)
1787 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1788
a258bf26 1789 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1790
03cfe0d5 1791 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1792 return -EIO;
a258bf26 1793
85614d66 1794 } else {
81f5049b 1795 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1796 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1797 if (errno == EEXIST)
8dbf71ec 1798 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1799 if (errno != EPERM)
1800 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1801
1802 /* Some systems abusively restrict mknod but
1803 * allow bind mounts. */
1804 r = touch(to);
1805 if (r < 0)
1806 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1807 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1808 if (r < 0)
1809 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1810 }
6278cf60 1811
03cfe0d5
LP
1812 r = userns_lchown(to, 0, 0);
1813 if (r < 0)
1814 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1815 }
88213476
LP
1816 }
1817
e58a1277
LP
1818 return r;
1819}
88213476 1820
03cfe0d5
LP
1821static int setup_pts(const char *dest) {
1822 _cleanup_free_ char *options = NULL;
1823 const char *p;
709f6e46 1824 int r;
03cfe0d5 1825
349cc4a5 1826#if HAVE_SELINUX
03cfe0d5
LP
1827 if (arg_selinux_apifs_context)
1828 (void) asprintf(&options,
3dce8915 1829 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1830 arg_uid_shift + TTY_GID,
1831 arg_selinux_apifs_context);
1832 else
1833#endif
1834 (void) asprintf(&options,
3dce8915 1835 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1836 arg_uid_shift + TTY_GID);
f2d88580 1837
03cfe0d5 1838 if (!options)
f2d88580
LP
1839 return log_oom();
1840
03cfe0d5 1841 /* Mount /dev/pts itself */
cc9fce65 1842 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1843 r = mkdir_errno_wrapper(p, 0755);
1844 if (r < 0)
1845 return log_error_errno(r, "Failed to create /dev/pts: %m");
1846
60e76d48
ZJS
1847 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1848 if (r < 0)
1849 return r;
709f6e46
MS
1850 r = userns_lchown(p, 0, 0);
1851 if (r < 0)
1852 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1853
1854 /* Create /dev/ptmx symlink */
1855 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1856 if (symlink("pts/ptmx", p) < 0)
1857 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1858 r = userns_lchown(p, 0, 0);
1859 if (r < 0)
1860 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1861
03cfe0d5
LP
1862 /* And fix /dev/pts/ptmx ownership */
1863 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1864 r = userns_lchown(p, 0, 0);
1865 if (r < 0)
1866 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1867
f2d88580
LP
1868 return 0;
1869}
1870
e58a1277 1871static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1872 _cleanup_umask_ mode_t u;
1873 const char *to;
e58a1277 1874 int r;
e58a1277
LP
1875
1876 assert(dest);
1877 assert(console);
1878
1879 u = umask(0000);
1880
03cfe0d5 1881 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1882 if (r < 0)
1883 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1884
a258bf26
LP
1885 /* We need to bind mount the right tty to /dev/console since
1886 * ptys can only exist on pts file systems. To have something
81f5049b 1887 * to bind mount things on we create a empty regular file. */
a258bf26 1888
03cfe0d5 1889 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1890 r = touch(to);
1891 if (r < 0)
1892 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1893
60e76d48 1894 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1895}
1896
8e5430c4
LP
1897static int setup_keyring(void) {
1898 key_serial_t keyring;
1899
1900 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1901 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1902 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1903 * these system calls let's make sure we don't leak anything into the container. */
1904
1905 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1906 if (keyring == -1) {
1907 if (errno == ENOSYS)
1908 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1909 else if (IN_SET(errno, EACCES, EPERM))
1910 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1911 else
1912 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1913 }
1914
1915 return 0;
1916}
1917
1e4f1671 1918static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
1919 _cleanup_(unlink_and_freep) char *from = NULL;
1920 _cleanup_free_ char *fifo = NULL;
1921 _cleanup_close_ int fd = -1;
7fd1b19b 1922 _cleanup_umask_ mode_t u;
9ec5a93c
LP
1923 const char *to;
1924 int r;
e58a1277 1925
e58a1277 1926 assert(kmsg_socket >= 0);
a258bf26 1927
e58a1277 1928 u = umask(0000);
a258bf26 1929
9ec5a93c
LP
1930 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1931 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1932 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1933 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1934
1935 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1936 if (r < 0)
1937 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 1938
9ec5a93c 1939 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 1940 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
1941
1942 from = TAKE_PTR(fifo);
1943 to = "/proc/kmsg";
1944
60e76d48
ZJS
1945 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1946 if (r < 0)
1947 return r;
e58a1277 1948
669fc4e5 1949 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
1950 if (fd < 0)
1951 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1952
9ec5a93c 1953 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 1954 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
1955 if (r < 0)
1956 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1957
25ea79fe 1958 return 0;
88213476
LP
1959}
1960
1c4baffc 1961static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1962 union in_addr_union *exposed = userdata;
1963
1964 assert(rtnl);
1965 assert(m);
1966 assert(exposed);
1967
7a8f6325 1968 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1969 return 0;
1970}
1971
3a74cea5 1972static int setup_hostname(void) {
c818eef1 1973 int r;
3a74cea5 1974
0c582db0 1975 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1976 return 0;
1977
c818eef1
LP
1978 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1979 if (r < 0)
1980 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 1981
7027ff61 1982 return 0;
3a74cea5
LP
1983}
1984
57fb9fb5 1985static int setup_journal(const char *directory) {
e01ff70a 1986 sd_id128_t this_id;
0f5e1382 1987 _cleanup_free_ char *d = NULL;
e01ff70a 1988 const char *p, *q;
8054d749 1989 bool try;
e01ff70a 1990 char id[33];
57fb9fb5
LP
1991 int r;
1992
df9a75e4
LP
1993 /* Don't link journals in ephemeral mode */
1994 if (arg_ephemeral)
1995 return 0;
1996
8054d749
LP
1997 if (arg_link_journal == LINK_NO)
1998 return 0;
1999
2000 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2001
4d680aee 2002 r = sd_id128_get_machine(&this_id);
f647962d
MS
2003 if (r < 0)
2004 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 2005
e01ff70a 2006 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 2007 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2008 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2009 if (try)
4d680aee 2010 return 0;
df9a75e4 2011 return -EEXIST;
4d680aee
ZJS
2012 }
2013
03cfe0d5
LP
2014 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2015 if (r < 0)
2016 return log_error_errno(r, "Failed to create /var: %m");
2017
2018 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2019 if (r < 0)
2020 return log_error_errno(r, "Failed to create /var/log: %m");
2021
2022 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2023 if (r < 0)
2024 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2025
e01ff70a
MS
2026 (void) sd_id128_to_string(arg_uuid, id);
2027
03cfe0d5
LP
2028 p = strjoina("/var/log/journal/", id);
2029 q = prefix_roota(directory, p);
27407a01 2030
e1873695 2031 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2032 if (try)
2033 return 0;
27407a01 2034
8054d749
LP
2035 log_error("%s: already a mount point, refusing to use for journal", p);
2036 return -EEXIST;
57fb9fb5
LP
2037 }
2038
e1873695 2039 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2040 if (try)
2041 return 0;
57fb9fb5 2042
8054d749
LP
2043 log_error("%s: already a mount point, refusing to use for journal", q);
2044 return -EEXIST;
57fb9fb5
LP
2045 }
2046
2047 r = readlink_and_make_absolute(p, &d);
2048 if (r >= 0) {
3742095b 2049 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2050 path_equal(d, q)) {
2051
03cfe0d5 2052 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2053 if (r < 0)
709f6e46 2054 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2055 return 0;
57fb9fb5
LP
2056 }
2057
4a62c710
MS
2058 if (unlink(p) < 0)
2059 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2060 } else if (r == -EINVAL) {
2061
2062 if (arg_link_journal == LINK_GUEST &&
2063 rmdir(p) < 0) {
2064
27407a01
ZJS
2065 if (errno == ENOTDIR) {
2066 log_error("%s already exists and is neither a symlink nor a directory", p);
2067 return r;
4314d33f
MS
2068 } else
2069 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2070 }
4314d33f
MS
2071 } else if (r != -ENOENT)
2072 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2073
2074 if (arg_link_journal == LINK_GUEST) {
2075
2076 if (symlink(q, p) < 0) {
8054d749 2077 if (try) {
56f64d95 2078 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2079 return 0;
4314d33f
MS
2080 } else
2081 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2082 }
2083
03cfe0d5 2084 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2085 if (r < 0)
709f6e46 2086 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2087 return 0;
57fb9fb5
LP
2088 }
2089
2090 if (arg_link_journal == LINK_HOST) {
ccddd104 2091 /* don't create parents here — if the host doesn't have
574edc90 2092 * permanent journal set up, don't force it here */
ba8e6c4d 2093
dae8b82e
ZJS
2094 r = mkdir_errno_wrapper(p, 0755);
2095 if (r < 0 && r != -EEXIST) {
8054d749 2096 if (try) {
dae8b82e 2097 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2098 return 0;
4314d33f 2099 } else
dae8b82e 2100 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2101 }
2102
27407a01
ZJS
2103 } else if (access(p, F_OK) < 0)
2104 return 0;
57fb9fb5 2105
cdb2b9d0
LP
2106 if (dir_is_empty(q) == 0)
2107 log_warning("%s is not empty, proceeding anyway.", q);
2108
03cfe0d5 2109 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2110 if (r < 0)
2111 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2112
60e76d48
ZJS
2113 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2114 if (r < 0)
4a62c710 2115 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2116
27407a01 2117 return 0;
57fb9fb5
LP
2118}
2119
88213476 2120static int drop_capabilities(void) {
520e0d54 2121 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
2122}
2123
db999e0f
LP
2124static int reset_audit_loginuid(void) {
2125 _cleanup_free_ char *p = NULL;
2126 int r;
2127
0c582db0 2128 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2129 return 0;
2130
2131 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2132 if (r == -ENOENT)
db999e0f 2133 return 0;
f647962d
MS
2134 if (r < 0)
2135 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2136
2137 /* Already reset? */
2138 if (streq(p, "4294967295"))
2139 return 0;
2140
ad118bda 2141 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2142 if (r < 0) {
10a87006
LP
2143 log_error_errno(r,
2144 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2145 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2146 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2147 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2148 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2149
db999e0f 2150 sleep(5);
77b6e194 2151 }
db999e0f
LP
2152
2153 return 0;
77b6e194
LP
2154}
2155
785890ac
LP
2156static int setup_propagate(const char *root) {
2157 const char *p, *q;
709f6e46 2158 int r;
785890ac
LP
2159
2160 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2161 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2162 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2163 (void) mkdir_p(p, 0600);
2164
709f6e46
MS
2165 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2166 if (r < 0)
2167 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2168
709f6e46
MS
2169 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2170 if (r < 0)
2171 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2172
709f6e46
MS
2173 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2174 if (r < 0)
2175 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2176
03cfe0d5 2177 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2178 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2179 if (r < 0)
2180 return r;
785890ac 2181
60e76d48
ZJS
2182 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2183 if (r < 0)
2184 return r;
785890ac 2185
19caffac
AC
2186 /* machined will MS_MOVE into that directory, and that's only
2187 * supported for non-shared mounts. */
60e76d48 2188 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2189}
2190
317feb4d 2191static int setup_machine_id(const char *directory) {
691675ba
LP
2192 const char *etc_machine_id;
2193 sd_id128_t id;
3bbaff3e 2194 int r;
e01ff70a 2195
317feb4d
LP
2196 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2197 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2198 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2199 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2200 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2201 * container behaves nicely). */
2202
e01ff70a
MS
2203 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2204
691675ba 2205 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2206 if (r < 0) {
2207 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2208 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2209
317feb4d
LP
2210 if (sd_id128_is_null(arg_uuid)) {
2211 r = sd_id128_randomize(&arg_uuid);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2214 }
2215 } else {
2216 if (sd_id128_is_null(id)) {
2217 log_error("Machine ID in container image is zero, refusing.");
2218 return -EINVAL;
2219 }
e01ff70a 2220
317feb4d
LP
2221 arg_uuid = id;
2222 }
691675ba 2223
e01ff70a
MS
2224 return 0;
2225}
2226
7336138e
LP
2227static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2228 int r;
2229
2230 assert(directory);
2231
0de7acce 2232 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2233 return 0;
2234
2235 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2236 if (r == -EOPNOTSUPP)
2237 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2238 if (r == -EBADE)
2239 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2240 if (r < 0)
2241 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2242 if (r == 0)
2243 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2244 else
2245 log_debug("Patched directory tree to match UID/GID range.");
2246
2247 return r;
2248}
2249
113cea80 2250/*
6d416b9c
LS
2251 * Return values:
2252 * < 0 : wait_for_terminate() failed to get the state of the
2253 * container, the container was terminated by a signal, or
2254 * failed for an unknown reason. No change is made to the
2255 * container argument.
2256 * > 0 : The program executed in the container terminated with an
2257 * error. The exit code of the program executed in the
919699ec
LP
2258 * container is returned. The container argument has been set
2259 * to CONTAINER_TERMINATED.
6d416b9c
LS
2260 * 0 : The container is being rebooted, has been shut down or exited
2261 * successfully. The container argument has been set to either
2262 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2263 *
6d416b9c
LS
2264 * That is, success is indicated by a return value of zero, and an
2265 * error is indicated by a non-zero value.
113cea80
DH
2266 */
2267static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2268 siginfo_t status;
919699ec 2269 int r;
113cea80
DH
2270
2271 r = wait_for_terminate(pid, &status);
f647962d
MS
2272 if (r < 0)
2273 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2274
2275 switch (status.si_code) {
fddbb89c 2276
113cea80 2277 case CLD_EXITED:
b5a2179b 2278 if (status.si_status == 0)
919699ec 2279 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2280 else
919699ec 2281 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2282
919699ec
LP
2283 *container = CONTAINER_TERMINATED;
2284 return status.si_status;
113cea80
DH
2285
2286 case CLD_KILLED:
2287 if (status.si_status == SIGINT) {
919699ec 2288 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2289 *container = CONTAINER_TERMINATED;
919699ec
LP
2290 return 0;
2291
113cea80 2292 } else if (status.si_status == SIGHUP) {
919699ec 2293 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2294 *container = CONTAINER_REBOOTED;
919699ec 2295 return 0;
113cea80 2296 }
919699ec 2297
4831981d 2298 _fallthrough_;
113cea80 2299 case CLD_DUMPED:
fddbb89c 2300 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2301 return -EIO;
113cea80
DH
2302
2303 default:
fddbb89c 2304 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2305 return -EIO;
113cea80 2306 }
113cea80
DH
2307}
2308
023fb90b
LP
2309static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2310 pid_t pid;
2311
4a0b58c4 2312 pid = PTR_TO_PID(userdata);
023fb90b 2313 if (pid > 0) {
c6c8f6e2 2314 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2315 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2316 sd_event_source_set_userdata(s, NULL);
2317 return 0;
2318 }
2319 }
2320
2321 sd_event_exit(sd_event_source_get_event(s), 0);
2322 return 0;
2323}
2324
6916b164 2325static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2326 pid_t pid;
2327
2328 assert(s);
2329 assert(ssi);
2330
2331 pid = PTR_TO_PID(userdata);
2332
6916b164
AU
2333 for (;;) {
2334 siginfo_t si = {};
abdb9b08 2335
6916b164
AU
2336 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2337 return log_error_errno(errno, "Failed to waitid(): %m");
2338 if (si.si_pid == 0) /* No pending children. */
2339 break;
abdb9b08 2340 if (si.si_pid == pid) {
6916b164
AU
2341 /* The main process we care for has exited. Return from
2342 * signal handler but leave the zombie. */
2343 sd_event_exit(sd_event_source_get_event(s), 0);
2344 break;
2345 }
abdb9b08 2346
6916b164
AU
2347 /* Reap all other children. */
2348 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2349 }
2350
2351 return 0;
2352}
2353
abdb9b08
LP
2354static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2355 pid_t pid;
2356
2357 assert(m);
2358
2359 pid = PTR_TO_PID(userdata);
2360
2361 if (arg_kill_signal > 0) {
2362 log_info("Container termination requested. Attempting to halt container.");
2363 (void) kill(pid, arg_kill_signal);
2364 } else {
2365 log_info("Container termination requested. Exiting.");
2366 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2367 }
2368
2369 return 0;
2370}
2371
ec16945e 2372static int determine_names(void) {
1b9cebf6 2373 int r;
ec16945e 2374
c1521918
LP
2375 if (arg_template && !arg_directory && arg_machine) {
2376
2377 /* If --template= was specified then we should not
2378 * search for a machine, but instead create a new one
2379 * in /var/lib/machine. */
2380
605405c6 2381 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2382 if (!arg_directory)
2383 return log_oom();
2384 }
2385
ec16945e 2386 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2387 if (arg_machine) {
2388 _cleanup_(image_unrefp) Image *i = NULL;
2389
5ef46e5f 2390 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2391 if (r == -ENOENT)
2392 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2393 if (r < 0)
2394 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2395
eb38edce 2396 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2397 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2398 else
0f03c2a4 2399 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2400 if (r < 0)
0f3be6ca 2401 return log_oom();
1b9cebf6 2402
aee327b8
LP
2403 if (!arg_ephemeral)
2404 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2405 } else {
2406 r = safe_getcwd(&arg_directory);
2407 if (r < 0)
2408 return log_error_errno(r, "Failed to determine current directory: %m");
2409 }
ec16945e 2410
0f3be6ca 2411 if (!arg_directory && !arg_image) {
1b9cebf6 2412 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2413 return -EINVAL;
2414 }
2415 }
2416
2417 if (!arg_machine) {
b9ba4dab
LP
2418 if (arg_directory && path_equal(arg_directory, "/"))
2419 arg_machine = gethostname_malloc();
4827ab48
LP
2420 else {
2421 if (arg_image) {
2422 char *e;
2423
2424 arg_machine = strdup(basename(arg_image));
2425
2426 /* Truncate suffix if there is one */
2427 e = endswith(arg_machine, ".raw");
2428 if (e)
2429 *e = 0;
2430 } else
2431 arg_machine = strdup(basename(arg_directory));
2432 }
ec16945e
LP
2433 if (!arg_machine)
2434 return log_oom();
2435
ae691c1d 2436 hostname_cleanup(arg_machine);
ec16945e
LP
2437 if (!machine_name_is_valid(arg_machine)) {
2438 log_error("Failed to determine machine name automatically, please use -M.");
2439 return -EINVAL;
2440 }
b9ba4dab
LP
2441
2442 if (arg_ephemeral) {
2443 char *b;
2444
2445 /* Add a random suffix when this is an
2446 * ephemeral machine, so that we can run many
2447 * instances at once without manually having
2448 * to specify -M each time. */
2449
2450 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2451 return log_oom();
2452
2453 free(arg_machine);
2454 arg_machine = b;
2455 }
ec16945e
LP
2456 }
2457
2458 return 0;
2459}
2460
/* Resolves the path stored in *p with chase_symlinks() and replaces *p with the result. A NULL *p is
 * left alone. Returns the non-negative result of chase_symlinks() on success (0 if *p was NULL), a
 * negative errno-style value on failure. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free_and_replace(*p, resolved);

        /* r might be an fd here in case we ever use CHASE_OPEN in flags */
        return r;
}
2477
03cfe0d5 2478static int determine_uid_shift(const char *directory) {
6dac160c
LP
2479 int r;
2480
0de7acce 2481 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2482 arg_uid_shift = 0;
6dac160c 2483 return 0;
03cfe0d5 2484 }
6dac160c
LP
2485
2486 if (arg_uid_shift == UID_INVALID) {
2487 struct stat st;
2488
03cfe0d5 2489 r = stat(directory, &st);
6dac160c 2490 if (r < 0)
03cfe0d5 2491 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2492
2493 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2494
2495 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2496 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2497 return -EINVAL;
2498 }
2499
2500 arg_uid_range = UINT32_C(0x10000);
2501 }
2502
2503 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2504 log_error("UID base too high for UID range.");
2505 return -EINVAL;
2506 }
2507
6dac160c
LP
2508 return 0;
2509}
2510
03cfe0d5
LP
2511static int inner_child(
2512 Barrier *barrier,
2513 const char *directory,
2514 bool secondary,
2515 int kmsg_socket,
2516 int rtnl_socket,
f757855e 2517 FDSet *fds) {
69c79d3c 2518
03cfe0d5 2519 _cleanup_free_ char *home = NULL;
e01ff70a 2520 char as_uuid[37];
88614c8a 2521 size_t n_env = 1;
03cfe0d5 2522 const char *envp[] = {
0c300adf 2523 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2524 NULL, /* container */
03cfe0d5
LP
2525 NULL, /* TERM */
2526 NULL, /* HOME */
2527 NULL, /* USER */
2528 NULL, /* LOGNAME */
2529 NULL, /* container_uuid */
2530 NULL, /* LISTEN_FDS */
2531 NULL, /* LISTEN_PID */
9c1e04d0 2532 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2533 NULL
2534 };
1a68e1e5 2535 const char *exec_target;
2371271c 2536 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2537 int r;
88213476 2538
03cfe0d5
LP
2539 assert(barrier);
2540 assert(directory);
2541 assert(kmsg_socket >= 0);
88213476 2542
0de7acce 2543 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2544 /* Tell the parent, that it now can write the UID map. */
2545 (void) barrier_place(barrier); /* #1 */
7027ff61 2546
03cfe0d5
LP
2547 /* Wait until the parent wrote the UID map */
2548 if (!barrier_place_and_sync(barrier)) { /* #2 */
2549 log_error("Parent died too early");
2550 return -ESRCH;
2551 }
88213476
LP
2552 }
2553
6d66bd3b
EV
2554 r = reset_uid_gid();
2555 if (r < 0)
2556 return log_error_errno(r, "Couldn't become new root: %m");
2557
0de7acce 2558 r = mount_all(NULL,
4f086aab 2559 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2560 arg_uid_shift,
2561 arg_uid_range,
2562 arg_selinux_apifs_context);
03cfe0d5
LP
2563 if (r < 0)
2564 return r;
2565
04413780
ZJS
2566 if (!arg_network_namespace_path && arg_private_network) {
2567 r = unshare(CLONE_NEWNET);
2568 if (r < 0)
2569 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2570
2571 /* Tell the parent that it can setup network interfaces. */
2572 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2573 }
2574
4f086aab 2575 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2576 if (r < 0)
2577 return r;
2578
03cfe0d5
LP
2579 /* Wait until we are cgroup-ified, so that we
2580 * can mount the right cgroup path writable */
75116558 2581 if (!barrier_place_and_sync(barrier)) { /* #4 */
03cfe0d5
LP
2582 log_error("Parent died too early");
2583 return -ESRCH;
88213476
LP
2584 }
2585
5a8ff0e6 2586 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2587 r = unshare(CLONE_NEWCGROUP);
2588 if (r < 0)
04413780 2589 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2590 r = mount_cgroups(
2591 "",
2592 arg_unified_cgroup_hierarchy,
2593 arg_userns_mode != USER_NAMESPACE_NO,
2594 arg_uid_shift,
2595 arg_uid_range,
5a8ff0e6 2596 arg_selinux_apifs_context,
ada54120 2597 true);
0996ef00
CB
2598 if (r < 0)
2599 return r;
2600 } else {
2601 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2602 if (r < 0)
2603 return r;
2604 }
ec16945e 2605
1e4f1671 2606 r = setup_boot_id();
03cfe0d5
LP
2607 if (r < 0)
2608 return r;
ec16945e 2609
1e4f1671 2610 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2611 if (r < 0)
2612 return r;
2613 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2614
03cfe0d5
LP
2615 if (setsid() < 0)
2616 return log_error_errno(errno, "setsid() failed: %m");
2617
2618 if (arg_private_network)
2619 loopback_setup();
2620
7a8f6325
LP
2621 if (arg_expose_ports) {
2622 r = expose_port_send_rtnl(rtnl_socket);
2623 if (r < 0)
2624 return r;
2625 rtnl_socket = safe_close(rtnl_socket);
2626 }
03cfe0d5 2627
81f345df
LP
2628 if (arg_oom_score_adjust_set) {
2629 r = set_oom_score_adjust(arg_oom_score_adjust);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to adjust OOM score: %m");
2632 }
2633
d107bb7d
LP
2634 if (arg_cpuset)
2635 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2636 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2637
709f6e46
MS
2638 r = drop_capabilities();
2639 if (r < 0)
2640 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5 2641
c818eef1 2642 (void) setup_hostname();
03cfe0d5 2643
050f7277 2644 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2645 r = safe_personality(arg_personality);
2646 if (r < 0)
2647 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2648 } else if (secondary) {
21022b9d
LP
2649 r = safe_personality(PER_LINUX32);
2650 if (r < 0)
2651 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2652 }
2653
349cc4a5 2654#if HAVE_SELINUX
03cfe0d5 2655 if (arg_selinux_context)
2ed96880 2656 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2657 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2658#endif
2659
ee645080 2660 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2661 if (r < 0)
2662 return r;
2663
66edd963
LP
2664 if (arg_no_new_privileges)
2665 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2666 return log_error_errno(errno, "Failed to disable new privileges: %m");
2667
6aadfa4c
ILG
2668 /* LXC sets container=lxc, so follow the scheme here */
2669 envp[n_env++] = strjoina("container=", arg_container_service_name);
2670
03cfe0d5
LP
2671 envp[n_env] = strv_find_prefix(environ, "TERM=");
2672 if (envp[n_env])
313cefa1 2673 n_env++;
03cfe0d5
LP
2674
2675 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2676 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2677 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2678 return log_oom();
2679
3bbaff3e 2680 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2681
691675ba 2682 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2683 return log_oom();
03cfe0d5
LP
2684
2685 if (fdset_size(fds) > 0) {
2686 r = fdset_cloexec(fds, false);
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2689
2690 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2691 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2692 return log_oom();
2693 }
9c1e04d0
AP
2694 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2695 return log_oom();
03cfe0d5 2696
2371271c
TG
2697 env_use = strv_env_merge(2, envp, arg_setenv);
2698 if (!env_use)
2699 return log_oom();
03cfe0d5
LP
2700
2701 /* Let the parent know that we are ready and
2702 * wait until the parent is ready with the
2703 * setup, too... */
75116558 2704 if (!barrier_place_and_sync(barrier)) { /* #5 */
03cfe0d5
LP
2705 log_error("Parent died too early");
2706 return -ESRCH;
2707 }
2708
5f932eb9
LP
2709 if (arg_chdir)
2710 if (chdir(arg_chdir) < 0)
2711 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2712
7732f92b 2713 if (arg_start_mode == START_PID2) {
75bf701f 2714 r = stub_pid1(arg_uuid);
7732f92b
LP
2715 if (r < 0)
2716 return r;
2717 }
2718
8ca082b4
LP
2719 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
2720 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
2721 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 2722 log_close();
8ca082b4
LP
2723 log_set_open_when_needed(true);
2724
03cfe0d5
LP
2725 (void) fdset_close_others(fds);
2726
7732f92b 2727 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2728 char **a;
2729 size_t m;
2730
2731 /* Automatically search for the init system */
2732
75f32f04
ZJS
2733 m = strv_length(arg_parameters);
2734 a = newa(char*, m + 2);
2735 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2736 a[1 + m] = NULL;
03cfe0d5 2737
ced58da7 2738 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2739 execve(a[0], a, env_use);
2740
ced58da7 2741 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2742 execve(a[0], a, env_use);
2743
ced58da7 2744 a[0] = (char*) "/sbin/init";
03cfe0d5 2745 execve(a[0], a, env_use);
ced58da7
LP
2746
2747 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5
PW
2748 } else if (!strv_isempty(arg_parameters)) {
2749 exec_target = arg_parameters[0];
f757855e 2750 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2751 } else {
5f932eb9 2752 if (!arg_chdir)
d929b0f9
ZJS
2753 /* If we cannot change the directory, we'll end up in /, that is expected. */
2754 (void) chdir(home ?: "/root");
5f932eb9 2755
03cfe0d5
LP
2756 execle("/bin/bash", "-bash", NULL, env_use);
2757 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2758
2759 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2760 }
2761
8ca082b4 2762 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2763}
2764
9c1e04d0
AP
2765static int setup_sd_notify_child(void) {
2766 static const int one = 1;
2767 int fd = -1;
2768 union sockaddr_union sa = {
2769 .sa.sa_family = AF_UNIX,
2770 };
2771 int r;
2772
2773 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2774 if (fd < 0)
2775 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2776
2777 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2778 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2779
2780 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2781 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2782 if (r < 0) {
2783 safe_close(fd);
2784 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2785 }
2786
adc7d9f0
EV
2787 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2788 if (r < 0) {
2789 safe_close(fd);
2790 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2791 }
2792
9c1e04d0
AP
2793 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2794 if (r < 0) {
2795 safe_close(fd);
2796 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2797 }
2798
2799 return fd;
2800}
2801
03cfe0d5
LP
2802static int outer_child(
2803 Barrier *barrier,
2804 const char *directory,
2805 const char *console,
2d845785 2806 DissectedImage *dissected_image,
03cfe0d5
LP
2807 bool interactive,
2808 bool secondary,
2809 int pid_socket,
e01ff70a 2810 int uuid_socket,
9c1e04d0 2811 int notify_socket,
03cfe0d5
LP
2812 int kmsg_socket,
2813 int rtnl_socket,
825d5287 2814 int uid_shift_socket,
8199d554 2815 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
2816 FDSet *fds,
2817 int netns_fd) {
03cfe0d5 2818
bf428efb
LP
2819 _cleanup_close_ int fd = -1;
2820 int r, which_failed;
03cfe0d5
LP
2821 pid_t pid;
2822 ssize_t l;
03cfe0d5
LP
2823
2824 assert(barrier);
2825 assert(directory);
2826 assert(console);
2827 assert(pid_socket >= 0);
e01ff70a 2828 assert(uuid_socket >= 0);
9c1e04d0 2829 assert(notify_socket >= 0);
03cfe0d5
LP
2830 assert(kmsg_socket >= 0);
2831
2832 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2833 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2834
2835 if (interactive) {
2b33ab09 2836 int terminal;
03cfe0d5 2837
2b33ab09
LP
2838 terminal = open_terminal(console, O_RDWR);
2839 if (terminal < 0)
2840 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 2841
17cac366
LP
2842 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
2843 r = log_dup_console();
2844 if (r < 0)
2845 return log_error_errno(r, "Failed to duplicate stderr: %m");
2846
2b33ab09
LP
2847 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2848 if (r < 0)
2849 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
2850 }
2851
2852 r = reset_audit_loginuid();
2853 if (r < 0)
2854 return r;
2855
2856 /* Mark everything as slave, so that we still
2857 * receive mounts from the real root, but don't
2858 * propagate mounts to the real root. */
60e76d48
ZJS
2859 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2860 if (r < 0)
2861 return r;
03cfe0d5 2862
2d845785 2863 if (dissected_image) {
2d3a5a73
LP
2864 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2865 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2866 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2867 * makes sure ESP partitions and userns are compatible. */
2868
2869 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
2870 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
2871 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
2872 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
2873 if (r < 0)
2874 return r;
2875 }
03cfe0d5 2876
391567f4
LP
2877 r = determine_uid_shift(directory);
2878 if (r < 0)
2879 return r;
2880
0de7acce 2881 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2882 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2883 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2884 if (l < 0)
2885 return log_error_errno(errno, "Failed to send UID shift: %m");
2886 if (l != sizeof(arg_uid_shift)) {
2887 log_error("Short write while sending UID shift.");
2888 return -EIO;
2889 }
0e7ac751 2890
0de7acce 2891 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2892 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2893 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2894 * not it will pick a different one, and send it back to us. */
2895
2896 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2897 if (l < 0)
2898 return log_error_errno(errno, "Failed to recv UID shift: %m");
2899 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2900 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2901 return -EIO;
2902 }
2903 }
2904
2905 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2906 }
2907
2d3a5a73
LP
2908 if (dissected_image) {
2909 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2910 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2911 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2912 if (r < 0)
2913 return r;
2914 }
2915
8199d554
LP
2916 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2917 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2918
2919 r = detect_unified_cgroup_hierarchy_from_image(directory);
2920 if (r < 0)
2921 return r;
2922
2923 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2924 if (l < 0)
2925 return log_error_errno(errno, "Failed to send cgroup mode: %m");
2926 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
b8b846d7 2927 log_error("Short write while sending cgroup mode.");
8199d554
LP
2928 return -EIO;
2929 }
2930
2931 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2932 }
2933
03cfe0d5 2934 /* Turn directory into bind mount */
60e76d48
ZJS
2935 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2936 if (r < 0)
2937 return r;
03cfe0d5 2938
b53ede69
PW
2939 r = setup_pivot_root(
2940 directory,
2941 arg_pivot_root_new,
2942 arg_pivot_root_old);
2943 if (r < 0)
2944 return r;
2945
0de7acce
LP
2946 r = setup_volatile(
2947 directory,
2948 arg_volatile_mode,
2949 arg_userns_mode != USER_NAMESPACE_NO,
2950 arg_uid_shift,
2951 arg_uid_range,
2952 arg_selinux_context);
03cfe0d5
LP
2953 if (r < 0)
2954 return r;
2955
0de7acce
LP
2956 r = setup_volatile_state(
2957 directory,
2958 arg_volatile_mode,
2959 arg_userns_mode != USER_NAMESPACE_NO,
2960 arg_uid_shift,
2961 arg_uid_range,
2962 arg_selinux_context);
03cfe0d5
LP
2963 if (r < 0)
2964 return r;
2965
4ad14eff
LP
2966 /* Mark everything as shared so our mounts get propagated down. This is
2967 * required to make new bind mounts available in systemd services
2968 * inside the containter that create a new mount namespace.
2969 * See https://github.com/systemd/systemd/issues/3860
2970 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2971 * shared propagation mode. */
4ad14eff
LP
2972 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2973 if (r < 0)
2974 return r;
2975
2976 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2977 if (r < 0)
2978 return r;
2979
03cfe0d5
LP
2980 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2981 if (r < 0)
2982 return r;
2983
03cfe0d5 2984 if (arg_read_only) {
6b7c9f8b 2985 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2986 if (r < 0)
2987 return log_error_errno(r, "Failed to make tree read-only: %m");
2988 }
2989
0de7acce 2990 r = mount_all(directory,
4f086aab 2991 arg_mount_settings,
0de7acce
LP
2992 arg_uid_shift,
2993 arg_uid_range,
2994 arg_selinux_apifs_context);
03cfe0d5
LP
2995 if (r < 0)
2996 return r;
2997
07fa00f9
LP
2998 r = copy_devnodes(directory);
2999 if (r < 0)
03cfe0d5
LP
3000 return r;
3001
3002 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3003
07fa00f9
LP
3004 r = setup_pts(directory);
3005 if (r < 0)
03cfe0d5
LP
3006 return r;
3007
3008 r = setup_propagate(directory);
3009 if (r < 0)
3010 return r;
3011
3012 r = setup_dev_console(directory, console);
3013 if (r < 0)
3014 return r;
3015
8e5430c4
LP
3016 r = setup_keyring();
3017 if (r < 0)
3018 return r;
3019
960e4569 3020 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
3021 if (r < 0)
3022 return r;
3023
3024 r = setup_timezone(directory);
3025 if (r < 0)
3026 return r;
3027
3028 r = setup_resolv_conf(directory);
3029 if (r < 0)
3030 return r;
3031
e01ff70a
MS
3032 r = setup_machine_id(directory);
3033 if (r < 0)
3034 return r;
3035
03cfe0d5
LP
3036 r = setup_journal(directory);
3037 if (r < 0)
3038 return r;
3039
0de7acce
LP
3040 r = mount_custom(
3041 directory,
3042 arg_custom_mounts,
3043 arg_n_custom_mounts,
3044 arg_userns_mode != USER_NAMESPACE_NO,
3045 arg_uid_shift,
3046 arg_uid_range,
3047 arg_selinux_apifs_context);
03cfe0d5
LP
3048 if (r < 0)
3049 return r;
3050
5a8ff0e6 3051 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3052 r = mount_cgroups(
3053 directory,
3054 arg_unified_cgroup_hierarchy,
3055 arg_userns_mode != USER_NAMESPACE_NO,
3056 arg_uid_shift,
3057 arg_uid_range,
5a8ff0e6 3058 arg_selinux_apifs_context,
ada54120 3059 false);
0996ef00
CB
3060 if (r < 0)
3061 return r;
3062 }
03cfe0d5
LP
3063
3064 r = mount_move_root(directory);
3065 if (r < 0)
3066 return log_error_errno(r, "Failed to move root directory: %m");
3067
9c1e04d0
AP
3068 fd = setup_sd_notify_child();
3069 if (fd < 0)
3070 return fd;
3071
bf428efb
LP
3072 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3073 if (r < 0)
3074 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3075
03cfe0d5 3076 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3077 arg_clone_ns_flags |
8869a0b4 3078 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3079 if (pid < 0)
3080 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3081 if (pid == 0) {
3082 pid_socket = safe_close(pid_socket);
e01ff70a 3083 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3084 notify_socket = safe_close(notify_socket);
825d5287 3085 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3086
3087 /* The inner child has all namespaces that are
3088 * requested, so that we all are owned by the user if
3089 * user namespaces are turned on. */
3090
d7bea6b6
DP
3091 if (arg_network_namespace_path) {
3092 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3093 if (r < 0)
3094 return r;
3095 }
3096
f757855e 3097 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3098 if (r < 0)
3099 _exit(EXIT_FAILURE);
3100
3101 _exit(EXIT_SUCCESS);
3102 }
3103
3104 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3105 if (l < 0)
3106 return log_error_errno(errno, "Failed to send PID: %m");
3107 if (l != sizeof(pid)) {
3108 log_error("Short write while sending PID.");
3109 return -EIO;
3110 }
3111
e01ff70a
MS
3112 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3113 if (l < 0)
3114 return log_error_errno(errno, "Failed to send machine ID: %m");
3115 if (l != sizeof(arg_uuid)) {
3116 log_error("Short write while sending machine ID.");
3117 return -EIO;
3118 }
3119
9c1e04d0
AP
3120 l = send_one_fd(notify_socket, fd, 0);
3121 if (l < 0)
3122 return log_error_errno(errno, "Failed to send notify fd: %m");
3123
03cfe0d5 3124 pid_socket = safe_close(pid_socket);
e01ff70a 3125 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3126 notify_socket = safe_close(notify_socket);
327e26d6
KN
3127 kmsg_socket = safe_close(kmsg_socket);
3128 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3129 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3130
3131 return 0;
3132}
3133
0e7ac751 3134static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3135 bool tried_hashed = false;
0e7ac751
LP
3136 unsigned n_tries = 100;
3137 uid_t candidate;
3138 int r;
3139
3140 assert(shift);
3141 assert(ret_lock_file);
0de7acce 3142 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3143 assert(arg_uid_range == 0x10000U);
3144
3145 candidate = *shift;
3146
3147 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3148
3149 for (;;) {
fbd0b64f 3150 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3151 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3152
3153 if (--n_tries <= 0)
3154 return -EBUSY;
3155
87d5e4f2 3156 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3157 goto next;
3158 if ((candidate & UINT32_C(0xFFFF)) != 0)
3159 goto next;
3160
3161 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3162 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3163 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3164 goto next;
3165 if (r < 0)
3166 return r;
3167
3168 /* Make some superficial checks whether the range is currently known in the user database */
3169 if (getpwuid(candidate))
3170 goto next;
3171 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3172 goto next;
3173 if (getgrgid(candidate))
3174 goto next;
3175 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3176 goto next;
3177
3178 *ret_lock_file = lf;
3179 lf = (struct LockFile) LOCK_FILE_INIT;
3180 *shift = candidate;
3181 return 0;
3182
3183 next:
d381c8a6
LP
3184 if (arg_machine && !tried_hashed) {
3185 /* Try to hash the base from the container name */
3186
3187 static const uint8_t hash_key[] = {
3188 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3189 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3190 };
3191
3192 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3193
3194 tried_hashed = true;
3195 } else
3196 random_bytes(&candidate, sizeof(candidate));
3197
87d5e4f2 3198 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3199 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3200 }
3201}
3202
03cfe0d5 3203static int setup_uid_map(pid_t pid) {
fbd0b64f 3204 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3205 int r;
3206
3207 assert(pid > 1);
3208
3209 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3210 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3211 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to write UID map: %m");
3214
3215 /* We always assign the same UID and GID ranges */
3216 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3217 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3218 if (r < 0)
3219 return log_error_errno(r, "Failed to write GID map: %m");
3220
3221 return 0;
3222}
3223
9c1e04d0 3224static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3225 char buf[NOTIFY_BUFFER_MAX+1];
3226 char *p = NULL;
3227 struct iovec iovec = {
3228 .iov_base = buf,
3229 .iov_len = sizeof(buf)-1,
3230 };
3231 union {
3232 struct cmsghdr cmsghdr;
3233 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3234 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3235 } control = {};
3236 struct msghdr msghdr = {
3237 .msg_iov = &iovec,
3238 .msg_iovlen = 1,
3239 .msg_control = &control,
3240 .msg_controllen = sizeof(control),
3241 };
3242 struct cmsghdr *cmsg;
3243 struct ucred *ucred = NULL;
3244 ssize_t n;
3245 pid_t inner_child_pid;
3246 _cleanup_strv_free_ char **tags = NULL;
3247
3248 assert(userdata);
3249
3250 inner_child_pid = PTR_TO_PID(userdata);
3251
3252 if (revents != EPOLLIN) {
3253 log_warning("Got unexpected poll event for notify fd.");
3254 return 0;
3255 }
3256
3257 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3258 if (n < 0) {
3742095b 3259 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3260 return 0;
3261
3262 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3263 }
3264 cmsg_close_all(&msghdr);
3265
3266 CMSG_FOREACH(cmsg, &msghdr) {
3267 if (cmsg->cmsg_level == SOL_SOCKET &&
3268 cmsg->cmsg_type == SCM_CREDENTIALS &&
3269 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3270
3271 ucred = (struct ucred*) CMSG_DATA(cmsg);
3272 }
3273 }
3274
3275 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3276 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3277 return 0;
3278 }
3279
3280 if ((size_t) n >= sizeof(buf)) {
3281 log_warning("Received notify message exceeded maximum size. Ignoring.");
3282 return 0;
3283 }
3284
3285 buf[n] = 0;
3286 tags = strv_split(buf, "\n\r");
3287 if (!tags)
3288 return log_oom();
3289
3290 if (strv_find(tags, "READY=1"))
3291 sd_notifyf(false, "READY=1\n");
3292
3293 p = strv_find_startswith(tags, "STATUS=");
3294 if (p)
3295 sd_notifyf(false, "STATUS=Container running: %s", p);
3296
3297 return 0;
3298}
3299
5773024d 3300static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3301 int r;
9c1e04d0 3302
5773024d 3303 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3304 if (r < 0)
3305 return log_error_errno(r, "Failed to allocate notify event source: %m");
3306
5773024d 3307 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3308
3309 return 0;
3310}
3311
5d961407
LP
3312static int merge_settings(Settings *settings, const char *path) {
3313 int rl;
f757855e 3314
5d961407
LP
3315 assert(settings);
3316 assert(path);
f757855e 3317
5d961407
LP
3318 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3319 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3320
7732f92b
LP
3321 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3322 settings->start_mode >= 0) {
3323 arg_start_mode = settings->start_mode;
130d3d22 3324 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3325 }
3326
b53ede69
PW
3327 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3328 settings->pivot_root_new) {
3329 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3330 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3331 }
3332
5f932eb9 3333 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3334 settings->working_directory)
3335 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3336
f757855e 3337 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3338 settings->environment)
3339 strv_free_and_replace(arg_setenv, settings->environment);
f757855e
LP
3340
3341 if ((arg_settings_mask & SETTING_USER) == 0 &&
1cc6c93a
YW
3342 settings->user)
3343 free_and_replace(arg_user, settings->user);
f757855e
LP
3344
3345 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3346 uint64_t plus;
f757855e 3347
0e265674
LP
3348 plus = settings->capability;
3349 if (settings_private_network(settings))
3350 plus |= (1ULL << CAP_NET_ADMIN);
3351
3352 if (!arg_settings_trusted && plus != 0) {
3353 if (settings->capability != 0)
5d961407 3354 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3355 } else
520e0d54 3356 arg_caps_retain |= plus;
f757855e 3357
520e0d54 3358 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3359 }
3360
3361 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3362 settings->kill_signal > 0)
3363 arg_kill_signal = settings->kill_signal;
3364
3365 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3366 settings->personality != PERSONALITY_INVALID)
3367 arg_personality = settings->personality;
3368
3369 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3370 !sd_id128_is_null(settings->machine_id)) {
3371
3372 if (!arg_settings_trusted)
5d961407 3373 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3374 else
3375 arg_uuid = settings->machine_id;
3376 }
3377
3378 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3379 settings->read_only >= 0)
3380 arg_read_only = settings->read_only;
3381
3382 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3383 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3384 arg_volatile_mode = settings->volatile_mode;
3385
3386 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3387 settings->n_custom_mounts > 0) {
3388
3389 if (!arg_settings_trusted)
5d961407 3390 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3391 else {
3392 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3393 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3394 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3395 settings->n_custom_mounts = 0;
3396 }
3397 }
3398
3399 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3400 (settings->private_network >= 0 ||
3401 settings->network_veth >= 0 ||
3402 settings->network_bridge ||
22b28dfd 3403 settings->network_zone ||
f757855e
LP
3404 settings->network_interfaces ||
3405 settings->network_macvlan ||
f6d6bad1
LP
3406 settings->network_ipvlan ||
3407 settings->network_veth_extra)) {
f757855e
LP
3408
3409 if (!arg_settings_trusted)
5d961407 3410 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3411 else {
f6d6bad1 3412 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3413 arg_private_network = settings_private_network(settings);
3414
130d3d22
YW
3415 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3416 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3417 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3418 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3419
1cc6c93a
YW
3420 free_and_replace(arg_network_bridge, settings->network_bridge);
3421 free_and_replace(arg_network_zone, settings->network_zone);
f757855e
LP
3422 }
3423 }
3424
3425 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3426 settings->expose_ports) {
3427
3428 if (!arg_settings_trusted)
5d961407 3429 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3430 else {
3431 expose_port_free_all(arg_expose_ports);
1cc6c93a 3432 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3433 }
3434 }
3435
0de7acce
LP
3436 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3437 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3438
3439 if (!arg_settings_trusted)
5d961407 3440 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3441 else {
3442 arg_userns_mode = settings->userns_mode;
3443 arg_uid_shift = settings->uid_shift;
3444 arg_uid_range = settings->uid_range;
3445 arg_userns_chown = settings->userns_chown;
3446 }
3447 }
3448
9c1e04d0
AP
3449 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3450 arg_notify_ready = settings->notify_ready;
3451
960e4569
LP
3452 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3453
3454 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
5d961407 3455 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3456 else {
130d3d22
YW
3457 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3458 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569
LP
3459 }
3460 }
3461
bf428efb
LP
3462 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3463 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3464 continue;
3465
3466 if (!settings->rlimit[rl])
3467 continue;
3468
3469 if (!arg_settings_trusted) {
5d961407 3470 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3471 continue;
3472 }
3473
3474 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3475 }
3476
3a9530e5
LP
3477 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3478 settings->hostname)
3479 free_and_replace(arg_hostname, settings->hostname);
3480
66edd963
LP
3481 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3482 settings->no_new_privileges >= 0)
3483 arg_no_new_privileges = settings->no_new_privileges;
3484
81f345df
LP
3485 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3486 settings->oom_score_adjust_set) {
3487
3488 if (!arg_settings_trusted)
5d961407 3489 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3490 else {
3491 arg_oom_score_adjust = settings->oom_score_adjust;
3492 arg_oom_score_adjust_set = true;
3493 }
3494 }
3495
d107bb7d
LP
3496 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3497 settings->cpuset) {
3498
3499 if (!arg_settings_trusted)
5d961407 3500 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3501 else {
3502 if (arg_cpuset)
3503 CPU_FREE(arg_cpuset);
3504 arg_cpuset = TAKE_PTR(settings->cpuset);
3505 arg_cpuset_ncpus = settings->cpuset_ncpus;
3506 }
3507 }
3508
09d423e9
LP
3509 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3510 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3511 arg_resolv_conf = settings->resolv_conf;
3512
4e1d6aa9
LP
3513 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3514 settings->link_journal != _LINK_JOURNAL_INVALID) {
3515
3516 if (!arg_settings_trusted)
3517 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3518 else {
3519 arg_link_journal = settings->link_journal;
3520 arg_link_journal_try = settings->link_journal_try;
3521 }
3522 }
3523
1688841f
LP
3524 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3525 settings->timezone != _TIMEZONE_MODE_INVALID)
3526 arg_timezone = settings->timezone;
3527
f757855e
LP
3528 return 0;
3529}
3530
5d961407
LP
3531static int load_settings(void) {
3532 _cleanup_(settings_freep) Settings *settings = NULL;
3533 _cleanup_fclose_ FILE *f = NULL;
3534 _cleanup_free_ char *p = NULL;
3535 const char *fn, *i;
3536 int r;
3537
3538 /* If all settings are masked, there's no point in looking for
3539 * the settings file */
3540 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3541 return 0;
3542
3543 fn = strjoina(arg_machine, ".nspawn");
3544
3545 /* We first look in the admin's directories in /etc and /run */
3546 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3547 _cleanup_free_ char *j = NULL;
3548
3549 j = strjoin(i, "/", fn);
3550 if (!j)
3551 return log_oom();
3552
3553 f = fopen(j, "re");
3554 if (f) {
3555 p = TAKE_PTR(j);
3556
3557 /* By default, we trust configuration from /etc and /run */
3558 if (arg_settings_trusted < 0)
3559 arg_settings_trusted = true;
3560
3561 break;
3562 }
3563
3564 if (errno != ENOENT)
3565 return log_error_errno(errno, "Failed to open %s: %m", j);
3566 }
3567
3568 if (!f) {
3569 /* After that, let's look for a file next to the
3570 * actual image we shall boot. */
3571
3572 if (arg_image) {
3573 p = file_in_same_dir(arg_image, fn);
3574 if (!p)
3575 return log_oom();
3576 } else if (arg_directory) {
3577 p = file_in_same_dir(arg_directory, fn);
3578 if (!p)
3579 return log_oom();
3580 }
3581
3582 if (p) {
3583 f = fopen(p, "re");
3584 if (!f && errno != ENOENT)
3585 return log_error_errno(errno, "Failed to open %s: %m", p);
3586
3587 /* By default, we do not trust configuration from /var/lib/machines */
3588 if (arg_settings_trusted < 0)
3589 arg_settings_trusted = false;
3590 }
3591 }
3592
3593 if (!f)
3594 return 0;
3595
3596 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3597
3598 r = settings_load(f, p, &settings);
3599 if (r < 0)
3600 return r;
3601
3602 return merge_settings(settings, p);
3603}
3604
b0067625
ZJS
3605static int run(int master,
3606 const char* console,
2d845785 3607 DissectedImage *dissected_image,
b0067625
ZJS
3608 bool interactive,
3609 bool secondary,
3610 FDSet *fds,
3611 char veth_name[IFNAMSIZ], bool *veth_created,
3612 union in_addr_union *exposed,
3613 pid_t *pid, int *ret) {
3614
3615 static const struct sigaction sa = {
3616 .sa_handler = nop_signal_handler,
e28c7cd0 3617 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3618 };
3619
8e766630 3620 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
3621 _cleanup_close_ int etc_passwd_lock = -1;
3622 _cleanup_close_pair_ int
3623 kmsg_socket_pair[2] = { -1, -1 },
3624 rtnl_socket_pair[2] = { -1, -1 },
3625 pid_socket_pair[2] = { -1, -1 },
3626 uuid_socket_pair[2] = { -1, -1 },
3627 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
3628 uid_shift_socket_pair[2] = { -1, -1 },
3629 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3630
b0067625
ZJS
3631 _cleanup_close_ int notify_socket= -1;
3632 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3633 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3634 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3635 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3636 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 3637 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625
ZJS
3638 ContainerStatus container_status = 0;
3639 char last_char = 0;
3640 int ifi = 0, r;
3641 ssize_t l;
3642 sigset_t mask_chld;
d7bea6b6 3643 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
3644
3645 assert_se(sigemptyset(&mask_chld) == 0);
3646 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3647
3648 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3649 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3650 * check with getpwuid() if the specific user already exists. Note that /etc might be
3651 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3652 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3653 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3654 * really ours. */
3655
3656 etc_passwd_lock = take_etc_passwd_lock(NULL);
3657 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3658 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3659 }
3660
3661 r = barrier_create(&barrier);
3662 if (r < 0)
3663 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3664
3665 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3666 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3667
3668 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3669 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3670
3671 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3672 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3673
3674 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3675 return log_error_errno(errno, "Failed to create id socket pair: %m");
3676
3677 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3678 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3679
3680 if (arg_userns_mode != USER_NAMESPACE_NO)
3681 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3682 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3683
8199d554
LP
3684 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3685 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3686 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3687
b0067625
ZJS
3688 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3689 * parent's blocking calls and give it a chance to call wait() and terminate. */
3690 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3691 if (r < 0)
3692 return log_error_errno(errno, "Failed to change the signal mask: %m");
3693
3694 r = sigaction(SIGCHLD, &sa, NULL);
3695 if (r < 0)
3696 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3697
d7bea6b6
DP
3698 if (arg_network_namespace_path) {
3699 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3700 if (netns_fd < 0)
3701 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3702
3703 r = fd_is_network_ns(netns_fd);
3704 if (r < 0 && r != -ENOTTY)
3705 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
3706 if (r == 0) {
3707 log_error("Path %s doesn't refer to a network namespace", arg_network_namespace_path);
3708 return -EINVAL;
3709 }
3710 }
3711
b0067625
ZJS
3712 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3713 if (*pid < 0)
3714 return log_error_errno(errno, "clone() failed%s: %m",
3715 errno == EINVAL ?
3716 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3717
3718 if (*pid == 0) {
3719 /* The outer child only has a file system namespace. */
3720 barrier_set_role(&barrier, BARRIER_CHILD);
3721
3722 master = safe_close(master);
3723
3724 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3725 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3726 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3727 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3728 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3729 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 3730 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
3731
3732 (void) reset_all_signal_handlers();
3733 (void) reset_signal_mask();
3734
3735 r = outer_child(&barrier,
3736 arg_directory,
3737 console,
2d845785 3738 dissected_image,
b0067625
ZJS
3739 interactive,
3740 secondary,
3741 pid_socket_pair[1],
3742 uuid_socket_pair[1],
3743 notify_socket_pair[1],
3744 kmsg_socket_pair[1],
3745 rtnl_socket_pair[1],
3746 uid_shift_socket_pair[1],
8199d554 3747 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
3748 fds,
3749 netns_fd);
b0067625
ZJS
3750 if (r < 0)
3751 _exit(EXIT_FAILURE);
3752
3753 _exit(EXIT_SUCCESS);
3754 }
3755
3756 barrier_set_role(&barrier, BARRIER_PARENT);
3757
3758 fds = fdset_free(fds);
3759
3760 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3761 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3762 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3763 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3764 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3765 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 3766 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
3767
3768 if (arg_userns_mode != USER_NAMESPACE_NO) {
3769 /* The child just let us know the UID shift it might have read from the image. */
3770 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3771 if (l < 0)
3772 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3773 if (l != sizeof arg_uid_shift) {
3774 log_error("Short read while reading UID shift.");
3775 return -EIO;
3776 }
3777
3778 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3779 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3780 * image, but if that's already in use, pick a new one, and report back to the child,
3781 * which one we now picked. */
3782
3783 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3784 if (r < 0)
3785 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3786
3787 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3788 if (l < 0)
3789 return log_error_errno(errno, "Failed to send UID shift: %m");
3790 if (l != sizeof arg_uid_shift) {
3791 log_error("Short write while writing UID shift.");
3792 return -EIO;
3793 }
3794 }
3795 }
3796
8199d554
LP
3797 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3798 /* The child let us know the support cgroup mode it might have read from the image. */
3799 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3800 if (l < 0)
3801 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3802 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
3803 log_error("Short read while reading cgroup mode.");
3804 return -EIO;
3805 }
3806 }
3807
b0067625 3808 /* Wait for the outer child. */
d2e0ac3d
LP
3809 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3810 if (r < 0)
3811 return r;
3812 if (r != EXIT_SUCCESS)
3813 return -EIO;
b0067625
ZJS
3814
3815 /* And now retrieve the PID of the inner child. */
3816 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3817 if (l < 0)
3818 return log_error_errno(errno, "Failed to read inner child PID: %m");
3819 if (l != sizeof *pid) {
3820 log_error("Short read while reading inner child PID.");
3821 return -EIO;
3822 }
3823
3824 /* We also retrieve container UUID in case it was generated by outer child */
3825 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3826 if (l < 0)
3827 return log_error_errno(errno, "Failed to read container machine ID: %m");
3828 if (l != sizeof(arg_uuid)) {
3829 log_error("Short read while reading container machined ID.");
3830 return -EIO;
3831 }
3832
3833 /* We also retrieve the socket used for notifications generated by outer child */
3834 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3835 if (notify_socket < 0)
3836 return log_error_errno(notify_socket,
3837 "Failed to receive notification socket from the outer child: %m");
3838
3839 log_debug("Init process invoked as PID "PID_FMT, *pid);
3840
3841 if (arg_userns_mode != USER_NAMESPACE_NO) {
3842 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3843 log_error("Child died too early.");
3844 return -ESRCH;
3845 }
3846
3847 r = setup_uid_map(*pid);
3848 if (r < 0)
3849 return r;
3850
3851 (void) barrier_place(&barrier); /* #2 */
3852 }
3853
3854 if (arg_private_network) {
75116558
PS
3855 if (!arg_network_namespace_path) {
3856 /* Wait until the child has unshared its network namespace. */
3857 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3858 log_error("Child died too early");
3859 return -ESRCH;
3860 }
3861 }
3862
b0067625
ZJS
3863 r = move_network_interfaces(*pid, arg_network_interfaces);
3864 if (r < 0)
3865 return r;
3866
3867 if (arg_network_veth) {
3868 r = setup_veth(arg_machine, *pid, veth_name,
3869 arg_network_bridge || arg_network_zone);
3870 if (r < 0)
3871 return r;
3872 else if (r > 0)
3873 ifi = r;
3874
3875 if (arg_network_bridge) {
3876 /* Add the interface to a bridge */
3877 r = setup_bridge(veth_name, arg_network_bridge, false);
3878 if (r < 0)
3879 return r;
3880 if (r > 0)
3881 ifi = r;
3882 } else if (arg_network_zone) {
3883 /* Add the interface to a bridge, possibly creating it */
3884 r = setup_bridge(veth_name, arg_network_zone, true);
3885 if (r < 0)
3886 return r;
3887 if (r > 0)
3888 ifi = r;
3889 }
3890 }
3891
3892 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3893 if (r < 0)
3894 return r;
3895
3896 /* We created the primary and extra veth links now; let's remember this, so that we know to
3897 remove them later on. Note that we don't bother with removing veth links that were created
3898 here when their setup failed half-way, because in that case the kernel should be able to
3899 remove them on its own, since they cannot be referenced by anything yet. */
3900 *veth_created = true;
3901
3902 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3903 if (r < 0)
3904 return r;
3905
3906 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3907 if (r < 0)
3908 return r;
3909 }
3910
abdb9b08
LP
3911 if (arg_register || !arg_keep_unit) {
3912 r = sd_bus_default_system(&bus);
3913 if (r < 0)
3914 return log_error_errno(r, "Failed to open system bus: %m");
3915 }
3916
3917 if (!arg_keep_unit) {
3918 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3919 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3920 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3921
75152a4d
LP
3922 r = sd_bus_match_signal_async(
3923 bus,
3924 NULL,
3925 "org.freedesktop.systemd1",
3926 NULL,
3927 "org.freedesktop.systemd1.Scope",
3928 "RequestStop",
3929 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 3930 if (r < 0)
75152a4d 3931 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
3932 }
3933
b0067625
ZJS
3934 if (arg_register) {
3935 r = register_machine(
abdb9b08 3936 bus,
b0067625
ZJS
3937 arg_machine,
3938 *pid,
3939 arg_directory,
3940 arg_uuid,
3941 ifi,
3942 arg_slice,
3943 arg_custom_mounts, arg_n_custom_mounts,
3944 arg_kill_signal,
3945 arg_property,
3946 arg_keep_unit,
3947 arg_container_service_name);
3948 if (r < 0)
3949 return r;
abdb9b08 3950
cd2dfc6f
LP
3951 } else if (!arg_keep_unit) {
3952 r = allocate_scope(
abdb9b08 3953 bus,
cd2dfc6f
LP
3954 arg_machine,
3955 *pid,
3956 arg_slice,
3957 arg_custom_mounts, arg_n_custom_mounts,
3958 arg_kill_signal,
3959 arg_property);
3960 if (r < 0)
3961 return r;
3962
3963 } else if (arg_slice || arg_property)
3964 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3965
f0bef277 3966 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3967 if (r < 0)
3968 return r;
3969
720f0a2f
LP
3970 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
3971 if (r < 0)
3972 return r;
b0067625 3973
de54e02d 3974 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3975 if (r < 0)
3976 return r;
3977
3978 /* Notify the child that the parent is ready with all
3979 * its setup (including cgroup-ification), and that
3980 * the child can now hand over control to the code to
3981 * run inside the container. */
75116558 3982 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
3983
3984 /* Block SIGCHLD here, before notifying child.
3985 * process_pty() will handle it with the other signals. */
3986 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3987
3988 /* Reset signal to default */
3989 r = default_signals(SIGCHLD, -1);
3990 if (r < 0)
3991 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3992
3993 r = sd_event_new(&event);
3994 if (r < 0)
3995 return log_error_errno(r, "Failed to get default event source: %m");
3996
8fd010bb
LP
3997 (void) sd_event_set_watchdog(event, true);
3998
abdb9b08
LP
3999 if (bus) {
4000 r = sd_bus_attach_event(bus, event, 0);
4001 if (r < 0)
4002 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4003 }
4004
5773024d 4005 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4006 if (r < 0)
4007 return r;
4008
4009 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 4010 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
4011 log_error("Child died too early.");
4012 return -ESRCH;
4013 }
4014
4015 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4016 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4017 etc_passwd_lock = safe_close(etc_passwd_lock);
4018
4019 sd_notifyf(false,
4020 "STATUS=Container running.\n"
4021 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4022 if (!arg_notify_ready)
919f5ae0 4023 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4024
4025 if (arg_kill_signal > 0) {
4026 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4027 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4028 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4029 } else {
4030 /* Immediately exit */
919f5ae0
LP
4031 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4032 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4033 }
4034
6916b164 4035 /* Exit when the child exits */
919f5ae0 4036 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4037
4038 if (arg_expose_ports) {
4039 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4040 if (r < 0)
4041 return r;
4042
4043 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4044 }
4045
4046 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4047
4048 r = pty_forward_new(event, master,
4049 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
4050 &forward);
4051 if (r < 0)
4052 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4053
4054 r = sd_event_loop(event);
4055 if (r < 0)
4056 return log_error_errno(r, "Failed to run event loop: %m");
4057
4058 pty_forward_get_last_char(forward, &last_char);
4059
4060 forward = pty_forward_free(forward);
4061
4062 if (!arg_quiet && last_char != '\n')
4063 putc('\n', stdout);
4064
4065 /* Kill if it is not dead yet anyway */
abdb9b08
LP
4066 if (arg_register && !arg_keep_unit && bus)
4067 terminate_machine(bus, *pid);
b0067625
ZJS
4068
4069 /* Normally redundant, but better safe than sorry */
c67b0082 4070 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4071
4072 r = wait_for_container(*pid, &container_status);
4073 *pid = 0;
4074
4075 if (r < 0)
4076 /* We failed to wait for the container, or the container exited abnormally. */
4077 return r;
4078 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4079 /* r > 0 → The container exited with a non-zero status.
4080 * As a special case, we need to replace 133 with a different value,
4081 * because 133 is special-cased in the service file to reboot the container.
4082 * otherwise → The container exited with zero status and a reboot was not requested.
4083 */
2a49b612 4084 if (r == EXIT_FORCE_RESTART)
27e29a1e 4085 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4086 *ret = r;
b0067625
ZJS
4087 return 0; /* finito */
4088 }
4089
4090 /* CONTAINER_REBOOTED, loop again */
4091
4092 if (arg_keep_unit) {
4093 /* Special handling if we are running as a service: instead of simply
4094 * restarting the machine we want to restart the entire service, so let's
4095 * inform systemd about this with the special exit code 133. The service
4096 * file uses RestartForceExitStatus=133 so that this results in a full
4097 * nspawn restart. This is necessary since we might have cgroup parameters
4098 * set we want to have flushed out. */
2a49b612
ZJS
4099 *ret = EXIT_FORCE_RESTART;
4100 return 0; /* finito */
b0067625
ZJS
4101 }
4102
4103 expose_port_flush(arg_expose_ports, exposed);
4104
4105 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4106 *veth_created = false;
4107 return 1; /* loop again */
4108}
4109
bf428efb 4110static int initialize_rlimits(void) {
bf428efb
LP
4111 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4112 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4113 * container execution environments. */
4114
4115 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4116 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4117 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4118 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4119 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4120 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4121 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4122 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4123 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4124 [RLIMIT_NICE] = { 0, 0 },
4125 [RLIMIT_NOFILE] = { 1024, 4096 },
4126 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4127 [RLIMIT_RTPRIO] = { 0, 0 },
4128 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4129 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4130
4131 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4132 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4133 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4134 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4135 * that PID 1 changes a number of other resource limits during early initialization which is why we
4136 * don't read the other limits from PID 1 but prefer the static table above. */
4137 };
4138
4139 int rl;
4140
4141 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4142 /* Let's only fill in what the user hasn't explicitly configured anyway */
4143 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4144 const struct rlimit *v;
4145 struct rlimit buffer;
4146
4147 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4148 /* For these two let's read the limits off PID 1. See above for an explanation. */
4149
4150 if (prlimit(1, rl, NULL, &buffer) < 0)
4151 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4152
4153 v = &buffer;
4154 } else
4155 v = kernel_defaults + rl;
4156
4157 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4158 if (!arg_rlimit[rl])
4159 return log_oom();
4160 }
4161
4162 if (DEBUG_LOGGING) {
4163 _cleanup_free_ char *k = NULL;
4164
4165 (void) rlimit_format(arg_rlimit[rl], &k);
4166 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4167 }
4168 }
4169
4170 return 0;
4171}
4172
03cfe0d5 4173int main(int argc, char *argv[]) {
2d845785
LP
4174 _cleanup_free_ char *console = NULL;
4175 _cleanup_close_ int master = -1;
03cfe0d5 4176 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4177 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4178 char veth_name[IFNAMSIZ] = "";
17cbb288 4179 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4180 pid_t pid = 0;
03cfe0d5 4181 union in_addr_union exposed = {};
8e766630 4182 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4183 bool interactive, veth_created = false, remove_tmprootdir = false;
4184 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4185 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4186 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4187 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4188
4189 log_parse_environment();
4190 log_open();
415fc41c 4191
7732f92b
LP
4192 /* Make sure rename_process() in the stub init process can work */
4193 saved_argv = argv;
4194 saved_argc = argc;
4195
03cfe0d5
LP
4196 r = parse_argv(argc, argv);
4197 if (r <= 0)
4198 goto finish;
4199
fba868fa
LP
4200 r = must_be_root();
4201 if (r < 0)
03cfe0d5 4202 goto finish;
fba868fa 4203
bf428efb
LP
4204 r = initialize_rlimits();
4205 if (r < 0)
4206 goto finish;
4207
f757855e
LP
4208 r = determine_names();
4209 if (r < 0)
4210 goto finish;
4211
4212 r = load_settings();
4213 if (r < 0)
4214 goto finish;
4215
4216 r = verify_arguments();
4217 if (r < 0)
4218 goto finish;
03cfe0d5 4219
8199d554
LP
4220 r = detect_unified_cgroup_hierarchy_from_environment();
4221 if (r < 0)
4222 goto finish;
4223
03cfe0d5
LP
4224 n_fd_passed = sd_listen_fds(false);
4225 if (n_fd_passed > 0) {
4226 r = fdset_new_listen_fds(&fds, false);
4227 if (r < 0) {
4228 log_error_errno(r, "Failed to collect file descriptors: %m");
4229 goto finish;
4230 }
4231 }
4232
83e803a9
ZJS
4233 /* The "default" umask. This is appropriate for most file and directory
4234 * operations performed by nspawn, and is the umask that will be used for
4235 * the child. Functions like copy_devnodes() change the umask temporarily. */
4236 umask(0022);
4237
03cfe0d5
LP
4238 if (arg_directory) {
4239 assert(!arg_image);
4240
4241 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4242 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4243 r = -EINVAL;
4244 goto finish;
4245 }
4246
4247 if (arg_ephemeral) {
4248 _cleanup_free_ char *np = NULL;
4249
8d4aa2bb 4250 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4251 if (r < 0)
4252 goto finish;
4253
03cfe0d5
LP
4254 /* If the specified path is a mount point we
4255 * generate the new snapshot immediately
4256 * inside it under a random name. However if
4257 * the specified is not a mount point we
4258 * create the new snapshot in the parent
4259 * directory, just next to it. */
e1873695 4260 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4261 if (r < 0) {
4262 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4263 goto finish;
4264 }
4265 if (r > 0)
770b5ce4 4266 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4267 else
770b5ce4 4268 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4269 if (r < 0) {
0f3be6ca 4270 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4271 goto finish;
4272 }
4273
4274 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4275 if (r < 0) {
4276 log_error_errno(r, "Failed to lock %s: %m", np);
4277 goto finish;
4278 }
4279
17cbb288
LP
4280 r = btrfs_subvol_snapshot(arg_directory, np,
4281 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4282 BTRFS_SNAPSHOT_FALLBACK_COPY |
4283 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4284 BTRFS_SNAPSHOT_RECURSIVE |
4285 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4286 if (r < 0) {
4287 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4288 goto finish;
ec16945e
LP
4289 }
4290
1cc6c93a 4291 free_and_replace(arg_directory, np);
ec16945e 4292
17cbb288 4293 remove_directory = true;
30535c16
LP
4294
4295 } else {
cb638b5e 4296 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4297 if (r < 0)
4298 goto finish;
4299
30535c16
LP
4300 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4301 if (r == -EBUSY) {
4302 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4303 goto finish;
4304 }
4305 if (r < 0) {
4306 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4307 goto finish;
30535c16
LP
4308 }
4309
4310 if (arg_template) {
8d4aa2bb 4311 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4312 if (r < 0)
4313 goto finish;
4314
17cbb288
LP
4315 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4316 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4317 BTRFS_SNAPSHOT_FALLBACK_COPY |
4318 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4319 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4320 BTRFS_SNAPSHOT_RECURSIVE |
4321 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4322 if (r == -EEXIST) {
4323 if (!arg_quiet)
4324 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4325 } else if (r < 0) {
83521414 4326 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4327 goto finish;
4328 } else {
4329 if (!arg_quiet)
4330 log_info("Populated %s from template %s.", arg_directory, arg_template);
4331 }
4332 }
ec16945e
LP
4333 }
4334
7732f92b 4335 if (arg_start_mode == START_BOOT) {
a5201ed6 4336 const char *p;
c9fe05e0 4337
a5201ed6
LP
4338 if (arg_pivot_root_new)
4339 p = prefix_roota(arg_directory, arg_pivot_root_new);
4340 else
4341 p = arg_directory;
c9fe05e0
AR
4342
4343 if (path_is_os_tree(p) <= 0) {
4344 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4345 r = -EINVAL;
1b9e5b12
LP
4346 goto finish;
4347 }
4348 } else {
c9fe05e0
AR
4349 const char *p, *q;
4350
a5201ed6
LP
4351 if (arg_pivot_root_new)
4352 p = prefix_roota(arg_directory, arg_pivot_root_new);
4353 else
4354 p = arg_directory;
c9fe05e0
AR
4355
4356 q = strjoina(p, "/usr/");
1b9e5b12 4357
c9fe05e0
AR
4358 if (laccess(q, F_OK) < 0) {
4359 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4360 r = -EINVAL;
1b9e5b12 4361 goto finish;
1b9e5b12
LP
4362 }
4363 }
ec16945e 4364
6b9132a9 4365 } else {
ec16945e
LP
4366 assert(arg_image);
4367 assert(!arg_template);
4368
8d4aa2bb 4369 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4370 if (r < 0)
4371 goto finish;
4372
0f3be6ca
LP
4373 if (arg_ephemeral) {
4374 _cleanup_free_ char *np = NULL;
4375
4376 r = tempfn_random(arg_image, "machine.", &np);
4377 if (r < 0) {
4378 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4379 goto finish;
4380 }
4381
4382 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4383 if (r < 0) {
4384 r = log_error_errno(r, "Failed to create image lock: %m");
4385 goto finish;
4386 }
4387
1c876927 4388 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
4389 if (r < 0) {
4390 r = log_error_errno(r, "Failed to copy image file: %m");
4391 goto finish;
4392 }
4393
1cc6c93a 4394 free_and_replace(arg_image, np);
0f3be6ca
LP
4395
4396 remove_image = true;
4397 } else {
4398 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4399 if (r == -EBUSY) {
4400 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4401 goto finish;
4402 }
4403 if (r < 0) {
4404 r = log_error_errno(r, "Failed to create image lock: %m");
4405 goto finish;
4406 }
4623e8e6 4407
78ebe980
LP
4408 if (!arg_root_hash) {
4409 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4410 if (r < 0) {
4411 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4412 goto finish;
4413 }
4414 }
30535c16
LP
4415 }
4416
c67b0082 4417 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4418 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4419 goto finish;
1b9e5b12 4420 }
6b9132a9 4421
c67b0082
LP
4422 remove_tmprootdir = true;
4423
4424 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4425 if (!arg_directory) {
4426 r = log_oom();
4427 goto finish;
6b9132a9 4428 }
88213476 4429
2d845785
LP
4430 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4431 if (r < 0) {
4432 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4433 goto finish;
4434 }
1b9e5b12 4435
4526113f 4436 r = dissect_image_and_warn(
e0f9e7bd 4437 loop->fd,
4526113f 4438 arg_image,
e0f9e7bd
LP
4439 arg_root_hash, arg_root_hash_size,
4440 DISSECT_IMAGE_REQUIRE_ROOT,
4441 &dissected_image);
2d845785 4442 if (r == -ENOPKG) {
4526113f 4443 /* dissect_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
4444 log_notice("Note that the disk image needs to\n"
4445 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4446 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4447 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4448 " d) or contain a file system without a partition table\n"
4449 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4450 goto finish;
2d845785 4451 }
4526113f 4452 if (r < 0)
842f3b0f 4453 goto finish;
1b9e5b12 4454
4623e8e6
LP
4455 if (!arg_root_hash && dissected_image->can_verity)
4456 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4457
4458 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4459 if (r < 0)
4460 goto finish;
0f3be6ca
LP
4461
4462 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4463 if (remove_image && unlink(arg_image) >= 0)
4464 remove_image = false;
842f3b0f 4465 }
842f3b0f 4466
86c0dd4a 4467 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4468 if (r < 0)
4469 goto finish;
4470
03cfe0d5
LP
4471 interactive =
4472 isatty(STDIN_FILENO) > 0 &&
4473 isatty(STDOUT_FILENO) > 0;
9c857b9d 4474
669fc4e5 4475 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
db7feb7e 4476 if (master < 0) {
ec16945e 4477 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4478 goto finish;
4479 }
4480
611b312b
LP
4481 r = ptsname_malloc(master, &console);
4482 if (r < 0) {
4483 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4484 goto finish;
68b02049
DW
4485 }
4486
4487 if (arg_selinux_apifs_context) {
4488 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4489 if (r < 0)
4490 goto finish;
a258bf26
LP
4491 }
4492
a258bf26 4493 if (unlockpt(master) < 0) {
ec16945e 4494 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4495 goto finish;
4496 }
4497
9c857b9d
LP
4498 if (!arg_quiet)
4499 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4500 arg_machine, arg_image ?: arg_directory);
4501
72c0a2c2 4502 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4503
66edd963 4504 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4505 r = log_error_errno(errno, "Failed to become subreaper: %m");
4506 goto finish;
4507 }
4508
d87be9b0 4509 for (;;) {
b0067625
ZJS
4510 r = run(master,
4511 console,
2d845785 4512 dissected_image,
b0067625
ZJS
4513 interactive, secondary,
4514 fds,
4515 veth_name, &veth_created,
4516 &exposed,
4517 &pid, &ret);
4518 if (r <= 0)
d87be9b0 4519 break;
d87be9b0 4520 }
88213476
LP
4521
4522finish:
af4ec430 4523 sd_notify(false,
2a49b612
ZJS
4524 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4525 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4526
9444b1f2 4527 if (pid > 0)
c67b0082 4528 (void) kill(pid, SIGKILL);
88213476 4529
503546da 4530 /* Try to flush whatever is still queued in the pty */
6a0f896b 4531 if (master >= 0) {
1c876927 4532 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4533 master = safe_close(master);
4534 }
4535
4536 if (pid > 0)
4537 (void) wait_for_terminate(pid, NULL);
503546da 4538
50ebcf6c
LP
4539 pager_close();
4540
17cbb288 4541 if (remove_directory && arg_directory) {
ec16945e
LP
4542 int k;
4543
17cbb288 4544 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4545 if (k < 0)
17cbb288 4546 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4547 }
4548
0f3be6ca
LP
4549 if (remove_image && arg_image) {
4550 if (unlink(arg_image) < 0)
4551 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4552 }
4553
c67b0082
LP
4554 if (remove_tmprootdir) {
4555 if (rmdir(tmprootdir) < 0)
4556 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4557 }
4558
785890ac
LP
4559 if (arg_machine) {
4560 const char *p;
4561
63c372cb 4562 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4563 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4564 }
4565
7a8f6325 4566 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4567
4568 if (veth_created)
4569 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4570 (void) remove_bridge(arg_network_zone);
f757855e 4571
04d391da 4572 free(arg_directory);
ec16945e
LP
4573 free(arg_template);
4574 free(arg_image);
7027ff61 4575 free(arg_machine);
3a9530e5 4576 free(arg_hostname);
c74e630d 4577 free(arg_user);
b53ede69
PW
4578 free(arg_pivot_root_new);
4579 free(arg_pivot_root_old);
5f932eb9 4580 free(arg_chdir);
c74e630d 4581 strv_free(arg_setenv);
f757855e 4582 free(arg_network_bridge);
c74e630d
LP
4583 strv_free(arg_network_interfaces);
4584 strv_free(arg_network_macvlan);
4bbfe7ad 4585 strv_free(arg_network_ipvlan);
f6d6bad1 4586 strv_free(arg_network_veth_extra);
f757855e 4587 strv_free(arg_parameters);
df1fac6d
LP
4588 free(arg_network_zone);
4589 free(arg_network_namespace_path);
4590 strv_free(arg_property);
f757855e
LP
4591 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4592 expose_port_free_all(arg_expose_ports);
4623e8e6 4593 free(arg_root_hash);
bf428efb 4594 rlimit_free_all(arg_rlimit);
df1fac6d
LP
4595 strv_free(arg_syscall_whitelist);
4596 strv_free(arg_syscall_blacklist);
d107bb7d 4597 arg_cpuset = cpu_set_mfree(arg_cpuset);
6d0b55c2 4598
ec16945e 4599 return r < 0 ? EXIT_FAILURE : ret;
88213476 4600}