]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: make use of log_set_open_when_needed() in nspawn too
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
88213476
LP
6***/
7
349cc4a5 8#if HAVE_BLKID
6b5cf3ea 9#include <blkid.h>
8fe0087e 10#endif
88213476 11#include <errno.h>
88213476 12#include <getopt.h>
0e7ac751 13#include <grp.h>
1b9e5b12 14#include <linux/loop.h>
0e7ac751 15#include <pwd.h>
8fe0087e 16#include <sched.h>
349cc4a5 17#if HAVE_SELINUX
8fe0087e 18#include <selinux/selinux.h>
1b9e5b12 19#endif
8fe0087e
LP
20#include <signal.h>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <sys/file.h>
25#include <sys/mount.h>
26#include <sys/personality.h>
27#include <sys/prctl.h>
28#include <sys/types.h>
6916b164 29#include <sys/wait.h>
8fe0087e 30#include <unistd.h>
1b9e5b12 31
b053cd5f 32#include "sd-bus.h"
1f0cd86b 33#include "sd-daemon.h"
1f0cd86b 34#include "sd-id128.h"
8fe0087e 35
b5efdb8a 36#include "alloc-util.h"
8fe0087e
LP
37#include "barrier.h"
38#include "base-filesystem.h"
39#include "blkid-util.h"
40#include "btrfs-util.h"
b053cd5f 41#include "bus-util.h"
8fe0087e 42#include "cap-list.h"
430f0182 43#include "capability-util.h"
04d391da 44#include "cgroup-util.h"
8fe0087e 45#include "copy.h"
d107bb7d 46#include "cpu-set-util.h"
4fc9982c 47#include "dev-setup.h"
2d845785 48#include "dissect-image.h"
8fe0087e 49#include "env-util.h"
3ffd4af2 50#include "fd-util.h"
842f3b0f 51#include "fdset.h"
a5c32cff 52#include "fileio.h"
f97b34a6 53#include "format-util.h"
f4f15635 54#include "fs-util.h"
1b9e5b12 55#include "gpt.h"
4623e8e6 56#include "hexdecoct.h"
8fe0087e 57#include "hostname-util.h"
910fd145 58#include "id128-util.h"
8fe0087e 59#include "log.h"
2d845785 60#include "loop-util.h"
8fe0087e 61#include "loopback-setup.h"
1b9cebf6 62#include "machine-image.h"
8fe0087e
LP
63#include "macro.h"
64#include "missing.h"
65#include "mkdir.h"
4349cd7c 66#include "mount-util.h"
8fe0087e 67#include "netlink-util.h"
07630cea 68#include "nspawn-cgroup.h"
3603efde 69#include "nspawn-def.h"
07630cea
LP
70#include "nspawn-expose-ports.h"
71#include "nspawn-mount.h"
72#include "nspawn-network.h"
7336138e 73#include "nspawn-patch-uid.h"
07630cea 74#include "nspawn-register.h"
910fd145 75#include "nspawn-seccomp.h"
07630cea
LP
76#include "nspawn-settings.h"
77#include "nspawn-setuid.h"
7732f92b 78#include "nspawn-stub-pid1.h"
50ebcf6c 79#include "pager.h"
6bedfcbb 80#include "parse-util.h"
8fe0087e 81#include "path-util.h"
0b452006 82#include "process-util.h"
8fe0087e
LP
83#include "ptyfwd.h"
84#include "random-util.h"
8869a0b4 85#include "raw-clone.h"
bf428efb 86#include "rlimit-util.h"
8fe0087e 87#include "rm-rf.h"
68b02049 88#include "selinux-util.h"
8fe0087e 89#include "signal-util.h"
2583fbea 90#include "socket-util.h"
8fcde012 91#include "stat-util.h"
15a5e950 92#include "stdio-util.h"
5c828e66 93#include "string-table.h"
07630cea 94#include "string-util.h"
8fe0087e
LP
95#include "strv.h"
96#include "terminal-util.h"
97#include "udev-util.h"
affb60b1 98#include "umask-util.h"
b1d4f8e1 99#include "user-util.h"
8fe0087e 100#include "util.h"
e9642be2 101
62b1e758
YW
102#if HAVE_SPLIT_USR
103#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
104#else
105#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
106#endif
107
9c1e04d0
AP
/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH.
 * The path is relative to the container; the container's init process can use it
 * to send messages to nspawn following the sd_notify(3) protocol. */
111#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 112
2a49b612
ZJS
113#define EXIT_FORCE_RESTART 133
114
113cea80
DH
/* Outcome reported for a container run.
 * NOTE(review): the consumers of these values are outside this excerpt — presumably
 * "rebooted" asks the caller to start the container again; confirm against the users. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
119
57fb9fb5
LP
/* Journal linking mode, as selected by --link-journal= (and -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default for arg_link_journal) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST,     /* --link-journal=guest / try-guest / -j */
} LinkJournal;
88213476
LP
126
127static char *arg_directory = NULL;
ec16945e 128static char *arg_template = NULL;
5f932eb9 129static char *arg_chdir = NULL;
b53ede69
PW
130static char *arg_pivot_root_new = NULL;
131static char *arg_pivot_root_old = NULL;
687d0825 132static char *arg_user = NULL;
9444b1f2 133static sd_id128_t arg_uuid = {};
3a9530e5
LP
134static char *arg_machine = NULL; /* The name used by the host to refer to this */
135static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
136static const char *arg_selinux_context = NULL;
137static const char *arg_selinux_apifs_context = NULL;
9444b1f2 138static const char *arg_slice = NULL;
ff01d048 139static bool arg_private_network = false;
bc2f673e 140static bool arg_read_only = false;
7732f92b 141static StartMode arg_start_mode = START_PID1;
ec16945e 142static bool arg_ephemeral = false;
57fb9fb5 143static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 144static bool arg_link_journal_try = false;
520e0d54 145static uint64_t arg_caps_retain =
50b52222
LP
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
148 (1ULL << CAP_CHOWN) |
149 (1ULL << CAP_DAC_OVERRIDE) |
150 (1ULL << CAP_DAC_READ_SEARCH) |
151 (1ULL << CAP_FOWNER) |
152 (1ULL << CAP_FSETID) |
153 (1ULL << CAP_IPC_OWNER) |
154 (1ULL << CAP_KILL) |
155 (1ULL << CAP_LEASE) |
156 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 157 (1ULL << CAP_MKNOD) |
5076f0cc
LP
158 (1ULL << CAP_NET_BIND_SERVICE) |
159 (1ULL << CAP_NET_BROADCAST) |
160 (1ULL << CAP_NET_RAW) |
5076f0cc 161 (1ULL << CAP_SETFCAP) |
50b52222 162 (1ULL << CAP_SETGID) |
5076f0cc
LP
163 (1ULL << CAP_SETPCAP) |
164 (1ULL << CAP_SETUID) |
165 (1ULL << CAP_SYS_ADMIN) |
50b52222 166 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
167 (1ULL << CAP_SYS_CHROOT) |
168 (1ULL << CAP_SYS_NICE) |
169 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 170 (1ULL << CAP_SYS_RESOURCE) |
50b52222 171 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538 172static CustomMount *arg_custom_mounts = NULL;
88614c8a 173static size_t arg_n_custom_mounts = 0;
f4889f65 174static char **arg_setenv = NULL;
284c0b91 175static bool arg_quiet = false;
eb91eb18 176static bool arg_register = true;
89f7c846 177static bool arg_keep_unit = false;
aa28aefe 178static char **arg_network_interfaces = NULL;
c74e630d 179static char **arg_network_macvlan = NULL;
4bbfe7ad 180static char **arg_network_ipvlan = NULL;
69c79d3c 181static bool arg_network_veth = false;
f6d6bad1 182static char **arg_network_veth_extra = NULL;
f757855e 183static char *arg_network_bridge = NULL;
22b28dfd 184static char *arg_network_zone = NULL;
d7bea6b6 185static char *arg_network_namespace_path = NULL;
050f7277 186static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 187static char *arg_image = NULL;
f757855e 188static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 189static ExposePort *arg_expose_ports = NULL;
f36933fe 190static char **arg_property = NULL;
0de7acce 191static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 192static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 193static bool arg_userns_chown = false;
c6c8f6e2 194static int arg_kill_signal = 0;
5da38d07 195static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
196static SettingsMask arg_settings_mask = 0;
197static int arg_settings_trusted = -1;
198static char **arg_parameters = NULL;
6aadfa4c 199static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 200static bool arg_notify_ready = false;
5a8ff0e6 201static bool arg_use_cgns = true;
0c582db0 202static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 203static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
204static void *arg_root_hash = NULL;
205static size_t arg_root_hash_size = 0;
960e4569
LP
206static char **arg_syscall_whitelist = NULL;
207static char **arg_syscall_blacklist = NULL;
bf428efb 208static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 209static bool arg_no_new_privileges = false;
81f345df
LP
210static int arg_oom_score_adjust = 0;
211static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
212static cpu_set_t *arg_cpuset = NULL;
213static unsigned arg_cpuset_ncpus = 0;
88213476 214
/* Prints the command line usage summary to stdout.
 *
 * A pager is requested first; its result is deliberately ignored, so the text is printed either
 * way. The "%s" at the start of the synopsis is filled in with program_invocation_short_name.
 *
 * Layout convention: short option at column 2, long option at column 5, description at column 28.
 * Options too long to fit get a line of their own, with the description on the following line(s). */
static void help(void) {

        (void) pager_open(false, false);

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "     --root-hash=HASH       Specify verity root hash\n"
               "  -a --as-pid2              Maintain a stub init as PID1, invoke binary as PID2\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "     --chdir=PATH           Set working directory in the container\n"
               "     --pivot-root=PATH[:PATH]\n"
               "                            Pivot root to given directory in the container\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --hostname=NAME        Override the hostname for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "  -U --private-users=pick   Run within user namespace, autoselect UID/GID range\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Similar, but with user configured UID/GID range\n"
               "     --private-users-chown  Adjust OS tree ownership to private UID/GID range\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual Ethernet connection between host\n"
               "                            and container\n"
               "     --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               "                            Add an additional virtual Ethernet link between\n"
               "                            host and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual Ethernet connection to the container\n"
               "                            and attach it to an existing bridge on the host\n"
               "     --network-zone=NAME    Similar, but attach the new interface to an\n"
               "                            automatically managed bridge interface\n"
               "     --network-namespace-path=PATH\n"
               "                            Set network namespace to the one represented by\n"
               "                            the specified kernel namespace file node\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --system-call-filter=LIST|~LIST\n"
               "                            Permit/prohibit specific system calls\n"
               "     --rlimit=NAME=LIMIT    Set a resource limit for the payload\n"
               "     --oom-score-adjust=VALUE\n"
               "                            Adjust the OOM score value for the payload\n"
               "     --cpu-affinity=CPUS    Adjust the CPU affinity of the container\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest,\n"
               "                            host, try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH[:OPTIONS]]\n"
               "                            Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               "                            Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --overlay=PATH[:PATH...]:PATH\n"
               "                            Create an overlay mount from the host to\n"
               "                            the container\n"
               "     --overlay-ro=PATH[:PATH...]:PATH\n"
               "                            Similar, but creates a read-only overlay mount\n"
               "  -E --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n"
               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
               , program_invocation_short_name);
}
311
86c0dd4a 312static int custom_mount_check_all(void) {
88614c8a 313 size_t i;
5a8af538 314
5a8af538
LP
315 for (i = 0; i < arg_n_custom_mounts; i++) {
316 CustomMount *m = &arg_custom_mounts[i];
317
0de7acce 318 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
319
320 if (arg_userns_chown) {
321 log_error("--private-users-chown may not be combined with custom root mounts.");
322 return -EINVAL;
323 } else if (arg_uid_shift == UID_INVALID) {
324 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
325 return -EINVAL;
326 }
825d5287 327 }
5a8af538
LP
328 }
329
330 return 0;
331}
332
8199d554 333static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 334 const char *e;
415fc41c 335 int r;
5da38d07 336
efdb0237
LP
337 /* Allow the user to control whether the unified hierarchy is used */
338 e = getenv("UNIFIED_CGROUP_HIERARCHY");
339 if (e) {
340 r = parse_boolean(e);
341 if (r < 0)
342 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
343 if (r > 0)
344 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
345 else
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
347 }
348
8199d554
LP
349 return 0;
350}
351
352static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
353 int r;
354
355 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
356 * image actually supports. */
b4cccbc1
LP
357 r = cg_all_unified();
358 if (r < 0)
359 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
360 if (r > 0) {
a8725a06
ZJS
361 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
362 * routine only detects 231, so we'll have a false negative here for 230. */
363 r = systemd_installation_has_version(directory, 230);
364 if (r < 0)
365 return log_error_errno(r, "Failed to determine systemd version in container: %m");
366 if (r > 0)
367 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
368 else
369 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 370 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
371 /* Mixed cgroup hierarchy support was added in 233 */
372 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
373 if (r < 0)
374 return log_error_errno(r, "Failed to determine systemd version in container: %m");
375 if (r > 0)
376 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
377 else
378 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
379 } else
5da38d07 380 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 381
8199d554
LP
382 log_debug("Using %s hierarchy for container.",
383 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
384 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
385
efdb0237
LP
386 return 0;
387}
388
0c582db0
LB
389static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
390 int r;
391
392 r = getenv_bool(name);
393 if (r == -ENXIO)
394 return;
395 if (r < 0)
396 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
397 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
398}
399
4f086aab
SU
400static void parse_mount_settings_env(void) {
401 int r;
402 const char *e;
403
404 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
405 if (!e)
406 return;
407
408 if (streq(e, "network")) {
409 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
410 return;
411 }
412
413 r = parse_boolean(e);
414 if (r < 0) {
415 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
416 return;
ab8ee0f2 417 }
4f086aab 418
ab8ee0f2
ZJS
419 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
420 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
421}
422
88213476
LP
423static int parse_argv(int argc, char *argv[]) {
424
a41fe3a2 425 enum {
acbeb427
ZJS
426 ARG_VERSION = 0x100,
427 ARG_PRIVATE_NETWORK,
bc2f673e 428 ARG_UUID,
5076f0cc 429 ARG_READ_ONLY,
57fb9fb5 430 ARG_CAPABILITY,
420c7379 431 ARG_DROP_CAPABILITY,
17fe0523
LP
432 ARG_LINK_JOURNAL,
433 ARG_BIND,
f4889f65 434 ARG_BIND_RO,
06c17c39 435 ARG_TMPFS,
5a8af538
LP
436 ARG_OVERLAY,
437 ARG_OVERLAY_RO,
eb91eb18 438 ARG_SHARE_SYSTEM,
89f7c846 439 ARG_REGISTER,
aa28aefe 440 ARG_KEEP_UNIT,
69c79d3c 441 ARG_NETWORK_INTERFACE,
c74e630d 442 ARG_NETWORK_MACVLAN,
4bbfe7ad 443 ARG_NETWORK_IPVLAN,
ab046dde 444 ARG_NETWORK_BRIDGE,
22b28dfd 445 ARG_NETWORK_ZONE,
f6d6bad1 446 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 447 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 448 ARG_PERSONALITY,
4d9f07b4 449 ARG_VOLATILE,
ec16945e 450 ARG_TEMPLATE,
f36933fe 451 ARG_PROPERTY,
6dac160c 452 ARG_PRIVATE_USERS,
c6c8f6e2 453 ARG_KILL_SIGNAL,
f757855e 454 ARG_SETTINGS,
5f932eb9 455 ARG_CHDIR,
b53ede69 456 ARG_PIVOT_ROOT,
7336138e 457 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 458 ARG_NOTIFY_READY,
4623e8e6 459 ARG_ROOT_HASH,
960e4569 460 ARG_SYSTEM_CALL_FILTER,
bf428efb 461 ARG_RLIMIT,
3a9530e5 462 ARG_HOSTNAME,
66edd963 463 ARG_NO_NEW_PRIVILEGES,
81f345df 464 ARG_OOM_SCORE_ADJUST,
d107bb7d 465 ARG_CPU_AFFINITY,
a41fe3a2
LP
466 };
467
88213476 468 static const struct option options[] = {
d7bea6b6
DP
469 { "help", no_argument, NULL, 'h' },
470 { "version", no_argument, NULL, ARG_VERSION },
471 { "directory", required_argument, NULL, 'D' },
472 { "template", required_argument, NULL, ARG_TEMPLATE },
473 { "ephemeral", no_argument, NULL, 'x' },
474 { "user", required_argument, NULL, 'u' },
475 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
476 { "as-pid2", no_argument, NULL, 'a' },
477 { "boot", no_argument, NULL, 'b' },
478 { "uuid", required_argument, NULL, ARG_UUID },
479 { "read-only", no_argument, NULL, ARG_READ_ONLY },
480 { "capability", required_argument, NULL, ARG_CAPABILITY },
481 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 482 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
483 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
484 { "bind", required_argument, NULL, ARG_BIND },
485 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
486 { "tmpfs", required_argument, NULL, ARG_TMPFS },
487 { "overlay", required_argument, NULL, ARG_OVERLAY },
488 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
489 { "machine", required_argument, NULL, 'M' },
3a9530e5 490 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
491 { "slice", required_argument, NULL, 'S' },
492 { "setenv", required_argument, NULL, 'E' },
493 { "selinux-context", required_argument, NULL, 'Z' },
494 { "selinux-apifs-context", required_argument, NULL, 'L' },
495 { "quiet", no_argument, NULL, 'q' },
496 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
497 { "register", required_argument, NULL, ARG_REGISTER },
498 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
499 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
500 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
501 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
502 { "network-veth", no_argument, NULL, 'n' },
503 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
504 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
505 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
506 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
507 { "personality", required_argument, NULL, ARG_PERSONALITY },
508 { "image", required_argument, NULL, 'i' },
509 { "volatile", optional_argument, NULL, ARG_VOLATILE },
510 { "port", required_argument, NULL, 'p' },
511 { "property", required_argument, NULL, ARG_PROPERTY },
512 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
513 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
514 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
515 { "settings", required_argument, NULL, ARG_SETTINGS },
516 { "chdir", required_argument, NULL, ARG_CHDIR },
517 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
518 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
519 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
520 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 521 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 522 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 523 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
eb9da376 524 {}
88213476
LP
525 };
526
9444b1f2 527 int c, r;
6aadfa4c 528 const char *p, *e;
a42c8b54 529 uint64_t plus = 0, minus = 0;
f757855e 530 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
531
532 assert(argc >= 0);
533 assert(argv);
534
2e1f244e 535 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
536
537 switch (c) {
538
539 case 'h':
601185b4
ZJS
540 help();
541 return 0;
88213476 542
acbeb427 543 case ARG_VERSION:
3f6fd1ba 544 return version();
acbeb427 545
88213476 546 case 'D':
0f03c2a4 547 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 548 if (r < 0)
0f03c2a4 549 return r;
ec16945e
LP
550 break;
551
552 case ARG_TEMPLATE:
0f03c2a4 553 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 554 if (r < 0)
0f03c2a4 555 return r;
88213476
LP
556 break;
557
1b9e5b12 558 case 'i':
0f03c2a4 559 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 560 if (r < 0)
0f03c2a4 561 return r;
ec16945e
LP
562 break;
563
564 case 'x':
565 arg_ephemeral = true;
1b9e5b12
LP
566 break;
567
687d0825 568 case 'u':
2fc09a9c
DM
569 r = free_and_strdup(&arg_user, optarg);
570 if (r < 0)
7027ff61 571 return log_oom();
687d0825 572
f757855e 573 arg_settings_mask |= SETTING_USER;
687d0825
MV
574 break;
575
22b28dfd
LP
576 case ARG_NETWORK_ZONE: {
577 char *j;
578
579 j = strappend("vz-", optarg);
580 if (!j)
581 return log_oom();
582
583 if (!ifname_valid(j)) {
584 log_error("Network zone name not valid: %s", j);
585 free(j);
586 return -EINVAL;
587 }
588
589 free(arg_network_zone);
590 arg_network_zone = j;
591
592 arg_network_veth = true;
593 arg_private_network = true;
594 arg_settings_mask |= SETTING_NETWORK;
595 break;
596 }
597
ab046dde 598 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
599
600 if (!ifname_valid(optarg)) {
601 log_error("Bridge interface name not valid: %s", optarg);
602 return -EINVAL;
603 }
604
f757855e
LP
605 r = free_and_strdup(&arg_network_bridge, optarg);
606 if (r < 0)
607 return log_oom();
ab046dde 608
4831981d 609 _fallthrough_;
0dfaa006 610 case 'n':
69c79d3c
LP
611 arg_network_veth = true;
612 arg_private_network = true;
f757855e 613 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
614 break;
615
f6d6bad1
LP
616 case ARG_NETWORK_VETH_EXTRA:
617 r = veth_extra_parse(&arg_network_veth_extra, optarg);
618 if (r < 0)
619 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
620
621 arg_private_network = true;
622 arg_settings_mask |= SETTING_NETWORK;
623 break;
624
aa28aefe 625 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
626
627 if (!ifname_valid(optarg)) {
628 log_error("Network interface name not valid: %s", optarg);
629 return -EINVAL;
630 }
631
c74e630d
LP
632 if (strv_extend(&arg_network_interfaces, optarg) < 0)
633 return log_oom();
634
635 arg_private_network = true;
f757855e 636 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
637 break;
638
639 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
640
641 if (!ifname_valid(optarg)) {
642 log_error("MACVLAN network interface name not valid: %s", optarg);
643 return -EINVAL;
644 }
645
c74e630d 646 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
647 return log_oom();
648
4bbfe7ad 649 arg_private_network = true;
f757855e 650 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
651 break;
652
653 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
654
655 if (!ifname_valid(optarg)) {
656 log_error("IPVLAN network interface name not valid: %s", optarg);
657 return -EINVAL;
658 }
659
4bbfe7ad
TG
660 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
661 return log_oom();
662
4831981d 663 _fallthrough_;
ff01d048
LP
664 case ARG_PRIVATE_NETWORK:
665 arg_private_network = true;
f757855e 666 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
667 break;
668
d7bea6b6
DP
669 case ARG_NETWORK_NAMESPACE_PATH:
670 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
671 if (r < 0)
672 return r;
673
674 break;
675
0f0dbc46 676 case 'b':
7732f92b
LP
677 if (arg_start_mode == START_PID2) {
678 log_error("--boot and --as-pid2 may not be combined.");
679 return -EINVAL;
680 }
681
682 arg_start_mode = START_BOOT;
683 arg_settings_mask |= SETTING_START_MODE;
684 break;
685
686 case 'a':
687 if (arg_start_mode == START_BOOT) {
688 log_error("--boot and --as-pid2 may not be combined.");
689 return -EINVAL;
690 }
691
692 arg_start_mode = START_PID2;
693 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
694 break;
695
144f0fc0 696 case ARG_UUID:
9444b1f2 697 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
698 if (r < 0)
699 return log_error_errno(r, "Invalid UUID: %s", optarg);
700
701 if (sd_id128_is_null(arg_uuid)) {
702 log_error("Machine UUID may not be all zeroes.");
703 return -EINVAL;
aa96c6cb 704 }
f757855e
LP
705
706 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 707 break;
aa96c6cb 708
9444b1f2 709 case 'S':
c74e630d 710 arg_slice = optarg;
144f0fc0
LP
711 break;
712
7027ff61 713 case 'M':
c1521918 714 if (isempty(optarg))
97b11eed 715 arg_machine = mfree(arg_machine);
c1521918 716 else {
0c3c4284 717 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
718 log_error("Invalid machine name: %s", optarg);
719 return -EINVAL;
720 }
7027ff61 721
0c3c4284
LP
722 r = free_and_strdup(&arg_machine, optarg);
723 if (r < 0)
eb91eb18 724 return log_oom();
eb91eb18 725 }
9ce6d1b3 726 break;
7027ff61 727
3a9530e5
LP
728 case ARG_HOSTNAME:
729 if (isempty(optarg))
730 arg_hostname = mfree(arg_hostname);
731 else {
732 if (!hostname_is_valid(optarg, false)) {
733 log_error("Invalid hostname: %s", optarg);
734 return -EINVAL;
735 }
736
737 r = free_and_strdup(&arg_hostname, optarg);
738 if (r < 0)
739 return log_oom();
740 }
741
742 arg_settings_mask |= SETTING_HOSTNAME;
743 break;
744
82adf6af
LP
745 case 'Z':
746 arg_selinux_context = optarg;
a8828ed9
DW
747 break;
748
82adf6af
LP
749 case 'L':
750 arg_selinux_apifs_context = optarg;
a8828ed9
DW
751 break;
752
bc2f673e
LP
753 case ARG_READ_ONLY:
754 arg_read_only = true;
f757855e 755 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
756 break;
757
420c7379
LP
758 case ARG_CAPABILITY:
759 case ARG_DROP_CAPABILITY: {
6cbe4ed1 760 p = optarg;
9ed794a3 761 for (;;) {
6cbe4ed1 762 _cleanup_free_ char *t = NULL;
5076f0cc 763
6cbe4ed1
SS
764 r = extract_first_word(&p, &t, ",", 0);
765 if (r < 0)
766 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 767
6cbe4ed1
SS
768 if (r == 0)
769 break;
5076f0cc 770
39ed67d1
LP
771 if (streq(t, "all")) {
772 if (c == ARG_CAPABILITY)
a42c8b54 773 plus = (uint64_t) -1;
39ed67d1 774 else
a42c8b54 775 minus = (uint64_t) -1;
39ed67d1 776 } else {
2822da4f
LP
777 int cap;
778
779 cap = capability_from_name(t);
780 if (cap < 0) {
39ed67d1
LP
781 log_error("Failed to parse capability %s.", t);
782 return -EINVAL;
783 }
784
785 if (c == ARG_CAPABILITY)
a42c8b54 786 plus |= 1ULL << (uint64_t) cap;
39ed67d1 787 else
a42c8b54 788 minus |= 1ULL << (uint64_t) cap;
5076f0cc 789 }
5076f0cc
LP
790 }
791
f757855e 792 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
793 break;
794 }
795
66edd963
LP
796 case ARG_NO_NEW_PRIVILEGES:
797 r = parse_boolean(optarg);
798 if (r < 0)
799 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
800
801 arg_no_new_privileges = r;
802 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
803 break;
804
57fb9fb5
LP
805 case 'j':
806 arg_link_journal = LINK_GUEST;
574edc90 807 arg_link_journal_try = true;
57fb9fb5
LP
808 break;
809
810 case ARG_LINK_JOURNAL:
53e438e3 811 if (streq(optarg, "auto")) {
57fb9fb5 812 arg_link_journal = LINK_AUTO;
53e438e3
LP
813 arg_link_journal_try = false;
814 } else if (streq(optarg, "no")) {
57fb9fb5 815 arg_link_journal = LINK_NO;
53e438e3
LP
816 arg_link_journal_try = false;
817 } else if (streq(optarg, "guest")) {
57fb9fb5 818 arg_link_journal = LINK_GUEST;
53e438e3
LP
819 arg_link_journal_try = false;
820 } else if (streq(optarg, "host")) {
57fb9fb5 821 arg_link_journal = LINK_HOST;
53e438e3
LP
822 arg_link_journal_try = false;
823 } else if (streq(optarg, "try-guest")) {
574edc90
MP
824 arg_link_journal = LINK_GUEST;
825 arg_link_journal_try = true;
826 } else if (streq(optarg, "try-host")) {
827 arg_link_journal = LINK_HOST;
828 arg_link_journal_try = true;
829 } else {
57fb9fb5
LP
830 log_error("Failed to parse link journal mode %s", optarg);
831 return -EINVAL;
832 }
833
834 break;
835
17fe0523 836 case ARG_BIND:
f757855e
LP
837 case ARG_BIND_RO:
838 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
839 if (r < 0)
840 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 841
f757855e 842 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 843 break;
06c17c39 844
f757855e
LP
845 case ARG_TMPFS:
846 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
847 if (r < 0)
848 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 849
f757855e 850 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 851 break;
5a8af538
LP
852
853 case ARG_OVERLAY:
ad85779a
LP
854 case ARG_OVERLAY_RO:
855 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
856 if (r == -EADDRNOTAVAIL)
857 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
858 if (r < 0)
859 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 860
f757855e 861 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 862 break;
06c17c39 863
a5f1cb3b 864 case 'E': {
f4889f65
LP
865 char **n;
866
867 if (!env_assignment_is_valid(optarg)) {
868 log_error("Environment variable assignment '%s' is not valid.", optarg);
869 return -EINVAL;
870 }
871
872 n = strv_env_set(arg_setenv, optarg);
873 if (!n)
874 return log_oom();
875
130d3d22 876 strv_free_and_replace(arg_setenv, n);
f757855e 877 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
878 break;
879 }
880
284c0b91
LP
881 case 'q':
882 arg_quiet = true;
883 break;
884
8a96d94e 885 case ARG_SHARE_SYSTEM:
a6b5216c 886 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
887 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
888 arg_clone_ns_flags = 0;
8a96d94e
LP
889 break;
890
eb91eb18
LP
891 case ARG_REGISTER:
892 r = parse_boolean(optarg);
893 if (r < 0) {
894 log_error("Failed to parse --register= argument: %s", optarg);
895 return r;
896 }
897
898 arg_register = r;
899 break;
900
89f7c846
LP
901 case ARG_KEEP_UNIT:
902 arg_keep_unit = true;
903 break;
904
6afc95b7
LP
905 case ARG_PERSONALITY:
906
ac45f971 907 arg_personality = personality_from_string(optarg);
050f7277 908 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
909 log_error("Unknown or unsupported personality '%s'.", optarg);
910 return -EINVAL;
911 }
912
f757855e 913 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
914 break;
915
4d9f07b4
LP
916 case ARG_VOLATILE:
917
918 if (!optarg)
f757855e 919 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
920 else if (streq(optarg, "help")) {
921 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
922 return 0;
923 } else {
f757855e 924 VolatileMode m;
4d9f07b4 925
f757855e
LP
926 m = volatile_mode_from_string(optarg);
927 if (m < 0) {
928 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 929 return -EINVAL;
f757855e
LP
930 } else
931 arg_volatile_mode = m;
6d0b55c2
LP
932 }
933
f757855e
LP
934 arg_settings_mask |= SETTING_VOLATILE_MODE;
935 break;
6d0b55c2 936
f757855e
LP
937 case 'p':
938 r = expose_port_parse(&arg_expose_ports, optarg);
939 if (r == -EEXIST)
940 return log_error_errno(r, "Duplicate port specification: %s", optarg);
941 if (r < 0)
942 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 943
f757855e 944 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 945 break;
6d0b55c2 946
f36933fe
LP
947 case ARG_PROPERTY:
948 if (strv_extend(&arg_property, optarg) < 0)
949 return log_oom();
950
951 break;
952
ae209204
ZJS
953 case ARG_PRIVATE_USERS: {
954 int boolean = -1;
0de7acce 955
ae209204
ZJS
956 if (!optarg)
957 boolean = true;
958 else if (!in_charset(optarg, DIGITS))
959 /* do *not* parse numbers as booleans */
960 boolean = parse_boolean(optarg);
961
962 if (boolean == false) {
0de7acce
LP
963 /* no: User namespacing off */
964 arg_userns_mode = USER_NAMESPACE_NO;
965 arg_uid_shift = UID_INVALID;
966 arg_uid_range = UINT32_C(0x10000);
ae209204 967 } else if (boolean == true) {
0de7acce
LP
968 /* yes: User namespacing on, UID range is read from root dir */
969 arg_userns_mode = USER_NAMESPACE_FIXED;
970 arg_uid_shift = UID_INVALID;
971 arg_uid_range = UINT32_C(0x10000);
972 } else if (streq(optarg, "pick")) {
973 /* pick: User namespacing on, UID range is picked randomly */
974 arg_userns_mode = USER_NAMESPACE_PICK;
975 arg_uid_shift = UID_INVALID;
976 arg_uid_range = UINT32_C(0x10000);
977 } else {
6c2058b3 978 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
979 const char *range, *shift;
980
0de7acce
LP
981 /* anything else: User namespacing on, UID range is explicitly configured */
982
6dac160c
LP
983 range = strchr(optarg, ':');
984 if (range) {
6c2058b3
ZJS
985 buffer = strndup(optarg, range - optarg);
986 if (!buffer)
987 return log_oom();
988 shift = buffer;
6dac160c
LP
989
990 range++;
bfd292ec
ZJS
991 r = safe_atou32(range, &arg_uid_range);
992 if (r < 0)
be715731 993 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
994 } else
995 shift = optarg;
996
be715731
ZJS
997 r = parse_uid(shift, &arg_uid_shift);
998 if (r < 0)
999 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
1000
1001 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
1002 }
1003
be715731
ZJS
1004 if (arg_uid_range <= 0) {
1005 log_error("UID range cannot be 0.");
1006 return -EINVAL;
1007 }
1008
0de7acce 1009 arg_settings_mask |= SETTING_USERNS;
6dac160c 1010 break;
ae209204 1011 }
6dac160c 1012
0de7acce 1013 case 'U':
ccabee0d
LP
1014 if (userns_supported()) {
1015 arg_userns_mode = USER_NAMESPACE_PICK;
1016 arg_uid_shift = UID_INVALID;
1017 arg_uid_range = UINT32_C(0x10000);
1018
1019 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1020 }
1021
7336138e
LP
1022 break;
1023
0de7acce 1024 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1025 arg_userns_chown = true;
0de7acce
LP
1026
1027 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1028 break;
1029
c6c8f6e2 1030 case ARG_KILL_SIGNAL:
5c828e66
LP
1031 if (streq(optarg, "help")) {
1032 DUMP_STRING_TABLE(signal, int, _NSIG);
1033 return 0;
1034 }
1035
29a3db75 1036 arg_kill_signal = signal_from_string(optarg);
c6c8f6e2
LP
1037 if (arg_kill_signal < 0) {
1038 log_error("Cannot parse signal: %s", optarg);
1039 return -EINVAL;
1040 }
1041
f757855e
LP
1042 arg_settings_mask |= SETTING_KILL_SIGNAL;
1043 break;
1044
1045 case ARG_SETTINGS:
1046
1047 /* no → do not read files
1048 * yes → read files, do not override cmdline, trust only subset
1049 * override → read files, override cmdline, trust only subset
1050 * trusted → read files, do not override cmdline, trust all
1051 */
1052
1053 r = parse_boolean(optarg);
1054 if (r < 0) {
1055 if (streq(optarg, "trusted")) {
1056 mask_all_settings = false;
1057 mask_no_settings = false;
1058 arg_settings_trusted = true;
1059
1060 } else if (streq(optarg, "override")) {
1061 mask_all_settings = false;
1062 mask_no_settings = true;
1063 arg_settings_trusted = -1;
1064 } else
1065 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1066 } else if (r > 0) {
1067 /* yes */
1068 mask_all_settings = false;
1069 mask_no_settings = false;
1070 arg_settings_trusted = -1;
1071 } else {
1072 /* no */
1073 mask_all_settings = true;
1074 mask_no_settings = false;
1075 arg_settings_trusted = false;
1076 }
1077
c6c8f6e2
LP
1078 break;
1079
5f932eb9
LP
1080 case ARG_CHDIR:
1081 if (!path_is_absolute(optarg)) {
1082 log_error("Working directory %s is not an absolute path.", optarg);
1083 return -EINVAL;
1084 }
1085
1086 r = free_and_strdup(&arg_chdir, optarg);
1087 if (r < 0)
1088 return log_oom();
1089
1090 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1091 break;
1092
b53ede69
PW
1093 case ARG_PIVOT_ROOT:
1094 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1095 if (r < 0)
1096 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1097
1098 arg_settings_mask |= SETTING_PIVOT_ROOT;
1099 break;
1100
9c1e04d0
AP
1101 case ARG_NOTIFY_READY:
1102 r = parse_boolean(optarg);
1103 if (r < 0) {
1104 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1105 return -EINVAL;
1106 }
1107 arg_notify_ready = r;
1108 arg_settings_mask |= SETTING_NOTIFY_READY;
1109 break;
1110
4623e8e6
LP
1111 case ARG_ROOT_HASH: {
1112 void *k;
1113 size_t l;
1114
1115 r = unhexmem(optarg, strlen(optarg), &k, &l);
1116 if (r < 0)
1117 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1118 if (l < sizeof(sd_id128_t)) {
1119 log_error("Root hash must be at least 128bit long: %s", optarg);
1120 free(k);
1121 return -EINVAL;
1122 }
1123
1124 free(arg_root_hash);
1125 arg_root_hash = k;
1126 arg_root_hash_size = l;
1127 break;
1128 }
1129
960e4569
LP
1130 case ARG_SYSTEM_CALL_FILTER: {
1131 bool negative;
1132 const char *items;
1133
1134 negative = optarg[0] == '~';
1135 items = negative ? optarg + 1 : optarg;
1136
1137 for (;;) {
1138 _cleanup_free_ char *word = NULL;
1139
1140 r = extract_first_word(&items, &word, NULL, 0);
1141 if (r == 0)
1142 break;
1143 if (r == -ENOMEM)
1144 return log_oom();
1145 if (r < 0)
1146 return log_error_errno(r, "Failed to parse system call filter: %m");
1147
1148 if (negative)
1149 r = strv_extend(&arg_syscall_blacklist, word);
1150 else
1151 r = strv_extend(&arg_syscall_whitelist, word);
1152 if (r < 0)
1153 return log_oom();
1154 }
1155
1156 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1157 break;
1158 }
1159
bf428efb
LP
1160 case ARG_RLIMIT: {
1161 const char *eq;
1162 char *name;
1163 int rl;
1164
5c828e66
LP
1165 if (streq(optarg, "help")) {
1166 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1167 return 0;
1168 }
1169
bf428efb
LP
1170 eq = strchr(optarg, '=');
1171 if (!eq) {
1172 log_error("--rlimit= expects an '=' assignment.");
1173 return -EINVAL;
1174 }
1175
1176 name = strndup(optarg, eq - optarg);
1177 if (!name)
1178 return log_oom();
1179
1180 rl = rlimit_from_string_harder(name);
1181 if (rl < 0) {
1182 log_error("Unknown resource limit: %s", name);
1183 return -EINVAL;
1184 }
1185
1186 if (!arg_rlimit[rl]) {
1187 arg_rlimit[rl] = new0(struct rlimit, 1);
1188 if (!arg_rlimit[rl])
1189 return log_oom();
1190 }
1191
1192 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1193 if (r < 0)
1194 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1195
1196 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1197 break;
1198 }
1199
81f345df
LP
1200 case ARG_OOM_SCORE_ADJUST:
1201 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1202 if (r < 0)
1203 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1204
1205 arg_oom_score_adjust_set = true;
1206 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1207 break;
1208
d107bb7d
LP
1209 case ARG_CPU_AFFINITY: {
1210 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1211
1212 r = parse_cpu_set(optarg, &cpuset);
1213 if (r < 0)
1214 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1215
1216 if (arg_cpuset)
1217 CPU_FREE(arg_cpuset);
1218
1219 arg_cpuset = TAKE_PTR(cpuset);
1220 arg_cpuset_ncpus = r;
1221 arg_settings_mask |= SETTING_CPU_AFFINITY;
1222 break;
1223 }
1224
88213476
LP
1225 case '?':
1226 return -EINVAL;
1227
1228 default:
eb9da376 1229 assert_not_reached("Unhandled option");
88213476 1230 }
88213476 1231
d7bea6b6
DP
1232 /* If --network-namespace-path is given with any other network-related option,
1233 * we need to error out, to avoid conflicts between different network options. */
1234 if (arg_network_namespace_path &&
1235 (arg_network_interfaces || arg_network_macvlan ||
1236 arg_network_ipvlan || arg_network_veth_extra ||
1237 arg_network_bridge || arg_network_zone ||
1238 arg_network_veth || arg_private_network)) {
1239 log_error("--network-namespace-path cannot be combined with other network options.");
1240 return -EINVAL;
1241 }
1242
0c582db0
LB
1243 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1244 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1245 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1246 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1247
4f086aab
SU
1248 if (arg_userns_mode != USER_NAMESPACE_NO)
1249 arg_mount_settings |= MOUNT_USE_USERNS;
1250
1251 if (arg_private_network)
1252 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1253
1254 parse_mount_settings_env();
1255
48a8d337
LB
1256 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1257 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1258 arg_register = false;
0c582db0
LB
1259 if (arg_start_mode != START_PID1) {
1260 log_error("--boot cannot be used without namespacing.");
1261 return -EINVAL;
1262 }
1263 }
eb91eb18 1264
0de7acce 1265 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1266 arg_userns_chown = true;
1267
cd2dfc6f 1268 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
8d9c2bca
AJ
1269 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1270 * The latter is not technically a user session, but we don't need to labour the point. */
cd2dfc6f 1271 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846
LP
1272 return -EINVAL;
1273 }
1274
1b9e5b12
LP
1275 if (arg_directory && arg_image) {
1276 log_error("--directory= and --image= may not be combined.");
1277 return -EINVAL;
1278 }
1279
ec16945e
LP
1280 if (arg_template && arg_image) {
1281 log_error("--template= and --image= may not be combined.");
1282 return -EINVAL;
1283 }
1284
8cd328d8
LP
1285 if (arg_ephemeral && arg_template && !arg_directory) {
1286 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1287 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1288 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1289 * --directory=". */
1290
ae2a15bc 1291 arg_directory = TAKE_PTR(arg_template);
8cd328d8
LP
1292 }
1293
ec16945e
LP
1294 if (arg_template && !(arg_directory || arg_machine)) {
1295 log_error("--template= needs --directory= or --machine=.");
1296 return -EINVAL;
1297 }
1298
1299 if (arg_ephemeral && arg_template) {
1300 log_error("--ephemeral and --template= may not be combined.");
1301 return -EINVAL;
1302 }
1303
df9a75e4
LP
1304 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1305 log_error("--ephemeral and --link-journal= may not be combined.");
1306 return -EINVAL;
1307 }
1308
ccabee0d 1309 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1310 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1311 return -EOPNOTSUPP;
1312 }
1313
1314 if (arg_userns_chown && arg_read_only) {
1315 log_error("--read-only and --private-users-chown may not be combined.");
1316 return -EINVAL;
1317 }
f757855e 1318
22b28dfd
LP
1319 if (arg_network_bridge && arg_network_zone) {
1320 log_error("--network-bridge= and --network-zone= may not be combined.");
1321 return -EINVAL;
1322 }
1323
f757855e
LP
1324 if (argc > optind) {
1325 arg_parameters = strv_copy(argv + optind);
1326 if (!arg_parameters)
1327 return log_oom();
1328
7732f92b 1329 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1330 }
1331
1332 /* Load all settings from .nspawn files */
1333 if (mask_no_settings)
1334 arg_settings_mask = 0;
1335
1336 /* Don't load any settings from .nspawn files */
1337 if (mask_all_settings)
1338 arg_settings_mask = _SETTINGS_MASK_ALL;
1339
520e0d54 1340 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1341
399e391f
ZJS
1342 r = cg_unified_flush();
1343 if (r < 0)
1344 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1345
6aadfa4c
ILG
1346 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1347 if (e)
1348 arg_container_service_name = e;
1349
5a8ff0e6
CB
1350 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1351 if (r < 0)
1352 arg_use_cgns = cg_ns_supported();
1353 else
1354 arg_use_cgns = r;
1355
86c0dd4a
LP
1356 r = custom_mount_check_all();
1357 if (r < 0)
1358 return r;
1359
f757855e
LP
1360 return 1;
1361}
1362
1363static int verify_arguments(void) {
4f086aab
SU
1364 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1365 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1366 return -EINVAL;
1367 }
1368
1369 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1370 log_error("Cannot combine --private-users with read-write mounts.");
1371 return -EINVAL;
1372 }
f757855e
LP
1373
1374 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1375 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1376 return -EINVAL;
1377 }
1378
6d0b55c2
LP
1379 if (arg_expose_ports && !arg_private_network) {
1380 log_error("Cannot use --port= without private networking.");
1381 return -EINVAL;
1382 }
1383
349cc4a5 1384#if ! HAVE_LIBIPTC
1c1ea217
EV
1385 if (arg_expose_ports) {
1386 log_error("--port= is not supported, compiled without libiptc support.");
1387 return -EOPNOTSUPP;
1388 }
1389#endif
1390
7732f92b 1391 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1392 arg_kill_signal = SIGRTMIN+3;
1393
f757855e 1394 return 0;
88213476
LP
1395}
1396
03cfe0d5
LP
1397static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1398 assert(p);
1399
0de7acce 1400 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1401 return 0;
1402
1403 if (uid == UID_INVALID && gid == GID_INVALID)
1404 return 0;
1405
1406 if (uid != UID_INVALID) {
1407 uid += arg_uid_shift;
1408
1409 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1410 return -EOVERFLOW;
1411 }
1412
1413 if (gid != GID_INVALID) {
1414 gid += (gid_t) arg_uid_shift;
1415
1416 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1417 return -EOVERFLOW;
1418 }
1419
1420 if (lchown(p, uid, gid) < 0)
1421 return -errno;
b12afc8c
LP
1422
1423 return 0;
1424}
1425
03cfe0d5
LP
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        /* Creates 'path' below 'root' with the given mode and hands it to the (shifted) uid/gid. An already
         * existing directory is treated as success and is left untouched, i.e. it is not chown()ed. */

        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1439
e58a1277 1440static int setup_timezone(const char *dest) {
03cfe0d5
LP
1441 _cleanup_free_ char *p = NULL, *q = NULL;
1442 const char *where, *check, *what;
d4036145
LP
1443 char *z, *y;
1444 int r;
f8440af5 1445
e58a1277
LP
1446 assert(dest);
1447
1448 /* Fix the timezone, if possible */
d4036145
LP
1449 r = readlink_malloc("/etc/localtime", &p);
1450 if (r < 0) {
0b493a02
MP
1451 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1452 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1453 * with a symbolic link to a time zone data file.
0b493a02
MP
1454 *
1455 * Example:
21dc0227 1456 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1457 */
d4036145
LP
1458 return 0;
1459 }
1460
1461 z = path_startswith(p, "../usr/share/zoneinfo/");
1462 if (!z)
1463 z = path_startswith(p, "/usr/share/zoneinfo/");
1464 if (!z) {
1465 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1466 return 0;
1467 }
1468
03cfe0d5 1469 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1470 r = readlink_malloc(where, &q);
1471 if (r >= 0) {
1472 y = path_startswith(q, "../usr/share/zoneinfo/");
1473 if (!y)
1474 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1475
d4036145
LP
1476 /* Already pointing to the right place? Then do nothing .. */
1477 if (y && streq(y, z))
1478 return 0;
1479 }
1480
03cfe0d5 1481 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1482 check = prefix_roota(dest, check);
03cfe0d5 1483 if (laccess(check, F_OK) < 0) {
d4036145
LP
1484 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1485 return 0;
1486 }
68fb0892 1487
8ccf7e9e
LP
1488 if (unlink(where) < 0 && errno != ENOENT) {
1489 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1490 errno,
1491 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1492 return 0;
1493 }
4d9f07b4 1494
03cfe0d5 1495 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1496 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1497 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1498 errno,
1499 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1500 return 0;
1501 }
e58a1277 1502
03cfe0d5
LP
1503 r = userns_lchown(where, 0, 0);
1504 if (r < 0)
1505 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1506
e58a1277 1507 return 0;
88213476
LP
1508}
1509
7357272e 1510static int resolved_listening(void) {
b053cd5f 1511 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1512 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1513 int r;
1514
7357272e 1515 /* Check if resolved is listening */
b053cd5f
LP
1516
1517 r = sd_bus_open_system(&bus);
1518 if (r < 0)
1519 return r;
1520
7357272e
DM
1521 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1522 if (r <= 0)
1523 return r;
1524
1525 r = sd_bus_get_property_string(bus,
1526 "org.freedesktop.resolve1",
1527 "/org/freedesktop/resolve1",
1528 "org.freedesktop.resolve1.Manager",
1529 "DNSStubListener",
1530 NULL,
1531 &dns_stub_listener_mode);
1532 if (r < 0)
1533 return r;
1534
1535 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1536}
1537
2547bb41 1538static int setup_resolv_conf(const char *dest) {
87447ae4
LP
1539 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1540 const char *where;
1541 int r, found;
2547bb41
LP
1542
1543 assert(dest);
1544
1545 if (arg_private_network)
1546 return 0;
1547
87447ae4
LP
1548 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1549 if (r < 0) {
1550 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1551 return 0;
1552 }
1553
1554 where = strjoina(etc, "/resolv.conf");
1555 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1556 if (found < 0) {
1557 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1558 return 0;
1559 }
79d80fc1 1560
62b1e758 1561 if (access(STATIC_RESOLV_CONF, F_OK) >= 0 &&
7357272e 1562 resolved_listening() > 0) {
87447ae4 1563
3539724c
LP
1564 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1565 * container, so that the container can use the host's resolver. Given that network namespacing is
1566 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1567 * advantage that the container will be able to follow the host's DNS server configuration changes
1568 * transparently. */
1569
87447ae4
LP
1570 if (found == 0) /* missing? */
1571 (void) touch(resolved);
5367354d 1572
62b1e758 1573 r = mount_verbose(LOG_DEBUG, STATIC_RESOLV_CONF, resolved, NULL, MS_BIND, NULL);
60e76d48 1574 if (r >= 0)
87447ae4 1575 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1576 }
1577
1578 /* If that didn't work, let's copy the file */
1c876927 1579 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1580 if (r < 0) {
3539724c
LP
1581 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1582 * resolved or something similar runs inside and the symlink points there.
68a313c5 1583 *
3539724c 1584 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1585 */
87447ae4 1586 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1587 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1588 return 0;
1589 }
2547bb41 1590
03cfe0d5
LP
1591 r = userns_lchown(where, 0, 0);
1592 if (r < 0)
3539724c 1593 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1594
2547bb41
LP
1595 return 0;
1596}
1597
1e4f1671 1598static int setup_boot_id(void) {
cdde6ba6
LP
1599 _cleanup_(unlink_and_freep) char *from = NULL;
1600 _cleanup_free_ char *path = NULL;
3bbaff3e 1601 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1602 const char *to;
04bc4a3f
LP
1603 int r;
1604
04bc4a3f
LP
1605 /* Generate a new randomized boot ID, so that each boot-up of
1606 * the container gets a new one */
1607
cdde6ba6
LP
1608 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1609 if (r < 0)
1610 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1611
1612 r = sd_id128_randomize(&rnd);
f647962d
MS
1613 if (r < 0)
1614 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1615
cdde6ba6 1616 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1617 if (r < 0)
1618 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1619
cdde6ba6
LP
1620 from = TAKE_PTR(path);
1621 to = "/proc/sys/kernel/random/boot_id";
1622
60e76d48 1623 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1624 if (r < 0)
1625 return r;
04bc4a3f 1626
cdde6ba6 1627 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1628}
1629
e58a1277 1630static int copy_devnodes(const char *dest) {
88213476
LP
1631
1632 static const char devnodes[] =
1633 "null\0"
1634 "zero\0"
1635 "full\0"
1636 "random\0"
1637 "urandom\0"
85614d66
TG
1638 "tty\0"
1639 "net/tun\0";
88213476
LP
1640
1641 const char *d;
e58a1277 1642 int r = 0;
7fd1b19b 1643 _cleanup_umask_ mode_t u;
a258bf26
LP
1644
1645 assert(dest);
124640f1
LP
1646
1647 u = umask(0000);
88213476 1648
03cfe0d5
LP
1649 /* Create /dev/net, so that we can create /dev/net/tun in it */
1650 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1651 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1652
88213476 1653 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1654 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1655 struct stat st;
88213476 1656
7f112f50 1657 from = strappend("/dev/", d);
03cfe0d5 1658 to = prefix_root(dest, from);
88213476
LP
1659
1660 if (stat(from, &st) < 0) {
1661
4a62c710
MS
1662 if (errno != ENOENT)
1663 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1664
a258bf26 1665 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1666
03cfe0d5 1667 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1668 return -EIO;
a258bf26 1669
85614d66 1670 } else {
81f5049b 1671 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1672 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1673 if (errno == EEXIST)
8dbf71ec 1674 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1675 if (errno != EPERM)
1676 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1677
1678 /* Some systems abusively restrict mknod but
1679 * allow bind mounts. */
1680 r = touch(to);
1681 if (r < 0)
1682 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1683 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1684 if (r < 0)
1685 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1686 }
6278cf60 1687
03cfe0d5
LP
1688 r = userns_lchown(to, 0, 0);
1689 if (r < 0)
1690 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1691 }
88213476
LP
1692 }
1693
e58a1277
LP
1694 return r;
1695}
88213476 1696
03cfe0d5
LP
1697static int setup_pts(const char *dest) {
1698 _cleanup_free_ char *options = NULL;
1699 const char *p;
709f6e46 1700 int r;
03cfe0d5 1701
349cc4a5 1702#if HAVE_SELINUX
03cfe0d5
LP
1703 if (arg_selinux_apifs_context)
1704 (void) asprintf(&options,
3dce8915 1705 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1706 arg_uid_shift + TTY_GID,
1707 arg_selinux_apifs_context);
1708 else
1709#endif
1710 (void) asprintf(&options,
3dce8915 1711 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1712 arg_uid_shift + TTY_GID);
f2d88580 1713
03cfe0d5 1714 if (!options)
f2d88580
LP
1715 return log_oom();
1716
03cfe0d5 1717 /* Mount /dev/pts itself */
cc9fce65 1718 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1719 r = mkdir_errno_wrapper(p, 0755);
1720 if (r < 0)
1721 return log_error_errno(r, "Failed to create /dev/pts: %m");
1722
60e76d48
ZJS
1723 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1724 if (r < 0)
1725 return r;
709f6e46
MS
1726 r = userns_lchown(p, 0, 0);
1727 if (r < 0)
1728 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1729
1730 /* Create /dev/ptmx symlink */
1731 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1732 if (symlink("pts/ptmx", p) < 0)
1733 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1734 r = userns_lchown(p, 0, 0);
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1737
03cfe0d5
LP
1738 /* And fix /dev/pts/ptmx ownership */
1739 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1740 r = userns_lchown(p, 0, 0);
1741 if (r < 0)
1742 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1743
f2d88580
LP
1744 return 0;
1745}
1746
e58a1277 1747static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1748 _cleanup_umask_ mode_t u;
1749 const char *to;
e58a1277 1750 int r;
e58a1277
LP
1751
1752 assert(dest);
1753 assert(console);
1754
1755 u = umask(0000);
1756
03cfe0d5 1757 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1758 if (r < 0)
1759 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1760
a258bf26
LP
1761 /* We need to bind mount the right tty to /dev/console since
1762 * ptys can only exist on pts file systems. To have something
81f5049b 1763 * to bind mount things on we create a empty regular file. */
a258bf26 1764
03cfe0d5 1765 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1766 r = touch(to);
1767 if (r < 0)
1768 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1769
60e76d48 1770 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1771}
1772
8e5430c4
LP
1773static int setup_keyring(void) {
1774 key_serial_t keyring;
1775
1776 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1777 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1778 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1779 * these system calls let's make sure we don't leak anything into the container. */
1780
1781 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1782 if (keyring == -1) {
1783 if (errno == ENOSYS)
1784 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1785 else if (IN_SET(errno, EACCES, EPERM))
1786 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1787 else
1788 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1789 }
1790
1791 return 0;
1792}
1793
1e4f1671 1794static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
1795 _cleanup_(unlink_and_freep) char *from = NULL;
1796 _cleanup_free_ char *fifo = NULL;
1797 _cleanup_close_ int fd = -1;
7fd1b19b 1798 _cleanup_umask_ mode_t u;
9ec5a93c
LP
1799 const char *to;
1800 int r;
e58a1277 1801
e58a1277 1802 assert(kmsg_socket >= 0);
a258bf26 1803
e58a1277 1804 u = umask(0000);
a258bf26 1805
9ec5a93c
LP
1806 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1807 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1808 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1809 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1810
1811 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1812 if (r < 0)
1813 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 1814
9ec5a93c 1815 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 1816 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
1817
1818 from = TAKE_PTR(fifo);
1819 to = "/proc/kmsg";
1820
60e76d48
ZJS
1821 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1822 if (r < 0)
1823 return r;
e58a1277
LP
1824
1825 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1826 if (fd < 0)
1827 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1828
9ec5a93c 1829 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 1830 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
1831 if (r < 0)
1832 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1833
25ea79fe 1834 return 0;
88213476
LP
1835}
1836
1c4baffc 1837static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1838 union in_addr_union *exposed = userdata;
1839
1840 assert(rtnl);
1841 assert(m);
1842 assert(exposed);
1843
7a8f6325 1844 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1845 return 0;
1846}
1847
3a74cea5 1848static int setup_hostname(void) {
c818eef1 1849 int r;
3a74cea5 1850
0c582db0 1851 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1852 return 0;
1853
c818eef1
LP
1854 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1855 if (r < 0)
1856 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 1857
7027ff61 1858 return 0;
3a74cea5
LP
1859}
1860
57fb9fb5 1861static int setup_journal(const char *directory) {
e01ff70a 1862 sd_id128_t this_id;
0f5e1382 1863 _cleanup_free_ char *d = NULL;
e01ff70a 1864 const char *p, *q;
8054d749 1865 bool try;
e01ff70a 1866 char id[33];
57fb9fb5
LP
1867 int r;
1868
df9a75e4
LP
1869 /* Don't link journals in ephemeral mode */
1870 if (arg_ephemeral)
1871 return 0;
1872
8054d749
LP
1873 if (arg_link_journal == LINK_NO)
1874 return 0;
1875
1876 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1877
4d680aee 1878 r = sd_id128_get_machine(&this_id);
f647962d
MS
1879 if (r < 0)
1880 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1881
e01ff70a 1882 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1883 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1884 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1885 if (try)
4d680aee 1886 return 0;
df9a75e4 1887 return -EEXIST;
4d680aee
ZJS
1888 }
1889
03cfe0d5
LP
1890 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1891 if (r < 0)
1892 return log_error_errno(r, "Failed to create /var: %m");
1893
1894 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1895 if (r < 0)
1896 return log_error_errno(r, "Failed to create /var/log: %m");
1897
1898 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1901
e01ff70a
MS
1902 (void) sd_id128_to_string(arg_uuid, id);
1903
03cfe0d5
LP
1904 p = strjoina("/var/log/journal/", id);
1905 q = prefix_roota(directory, p);
27407a01 1906
e1873695 1907 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1908 if (try)
1909 return 0;
27407a01 1910
8054d749
LP
1911 log_error("%s: already a mount point, refusing to use for journal", p);
1912 return -EEXIST;
57fb9fb5
LP
1913 }
1914
e1873695 1915 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1916 if (try)
1917 return 0;
57fb9fb5 1918
8054d749
LP
1919 log_error("%s: already a mount point, refusing to use for journal", q);
1920 return -EEXIST;
57fb9fb5
LP
1921 }
1922
1923 r = readlink_and_make_absolute(p, &d);
1924 if (r >= 0) {
3742095b 1925 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
1926 path_equal(d, q)) {
1927
03cfe0d5 1928 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1929 if (r < 0)
709f6e46 1930 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1931 return 0;
57fb9fb5
LP
1932 }
1933
4a62c710
MS
1934 if (unlink(p) < 0)
1935 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1936 } else if (r == -EINVAL) {
1937
1938 if (arg_link_journal == LINK_GUEST &&
1939 rmdir(p) < 0) {
1940
27407a01
ZJS
1941 if (errno == ENOTDIR) {
1942 log_error("%s already exists and is neither a symlink nor a directory", p);
1943 return r;
4314d33f
MS
1944 } else
1945 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1946 }
4314d33f
MS
1947 } else if (r != -ENOENT)
1948 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1949
1950 if (arg_link_journal == LINK_GUEST) {
1951
1952 if (symlink(q, p) < 0) {
8054d749 1953 if (try) {
56f64d95 1954 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1955 return 0;
4314d33f
MS
1956 } else
1957 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1958 }
1959
03cfe0d5 1960 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1961 if (r < 0)
709f6e46 1962 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1963 return 0;
57fb9fb5
LP
1964 }
1965
1966 if (arg_link_journal == LINK_HOST) {
ccddd104 1967 /* don't create parents here — if the host doesn't have
574edc90 1968 * permanent journal set up, don't force it here */
ba8e6c4d 1969
dae8b82e
ZJS
1970 r = mkdir_errno_wrapper(p, 0755);
1971 if (r < 0 && r != -EEXIST) {
8054d749 1972 if (try) {
dae8b82e 1973 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1974 return 0;
4314d33f 1975 } else
dae8b82e 1976 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
1977 }
1978
27407a01
ZJS
1979 } else if (access(p, F_OK) < 0)
1980 return 0;
57fb9fb5 1981
cdb2b9d0
LP
1982 if (dir_is_empty(q) == 0)
1983 log_warning("%s is not empty, proceeding anyway.", q);
1984
03cfe0d5 1985 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1986 if (r < 0)
1987 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1988
60e76d48
ZJS
1989 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1990 if (r < 0)
4a62c710 1991 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1992
27407a01 1993 return 0;
57fb9fb5
LP
1994}
1995
88213476 1996static int drop_capabilities(void) {
520e0d54 1997 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1998}
1999
db999e0f
LP
2000static int reset_audit_loginuid(void) {
2001 _cleanup_free_ char *p = NULL;
2002 int r;
2003
0c582db0 2004 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2005 return 0;
2006
2007 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2008 if (r == -ENOENT)
db999e0f 2009 return 0;
f647962d
MS
2010 if (r < 0)
2011 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2012
2013 /* Already reset? */
2014 if (streq(p, "4294967295"))
2015 return 0;
2016
ad118bda 2017 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2018 if (r < 0) {
10a87006
LP
2019 log_error_errno(r,
2020 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2021 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2022 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2023 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2024 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2025
db999e0f 2026 sleep(5);
77b6e194 2027 }
db999e0f
LP
2028
2029 return 0;
77b6e194
LP
2030}
2031
785890ac
LP
2032static int setup_propagate(const char *root) {
2033 const char *p, *q;
709f6e46 2034 int r;
785890ac
LP
2035
2036 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2037 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2038 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2039 (void) mkdir_p(p, 0600);
2040
709f6e46
MS
2041 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2042 if (r < 0)
2043 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2044
709f6e46
MS
2045 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2046 if (r < 0)
2047 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2048
709f6e46
MS
2049 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2050 if (r < 0)
2051 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2052
03cfe0d5 2053 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2054 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2055 if (r < 0)
2056 return r;
785890ac 2057
60e76d48
ZJS
2058 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2059 if (r < 0)
2060 return r;
785890ac 2061
19caffac
AC
2062 /* machined will MS_MOVE into that directory, and that's only
2063 * supported for non-shared mounts. */
60e76d48 2064 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2065}
2066
317feb4d 2067static int setup_machine_id(const char *directory) {
691675ba
LP
2068 const char *etc_machine_id;
2069 sd_id128_t id;
3bbaff3e 2070 int r;
e01ff70a 2071
317feb4d
LP
2072 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2073 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2074 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2075 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2076 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2077 * container behaves nicely). */
2078
e01ff70a
MS
2079 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2080
691675ba 2081 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2082 if (r < 0) {
2083 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2084 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2085
317feb4d
LP
2086 if (sd_id128_is_null(arg_uuid)) {
2087 r = sd_id128_randomize(&arg_uuid);
2088 if (r < 0)
2089 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2090 }
2091 } else {
2092 if (sd_id128_is_null(id)) {
2093 log_error("Machine ID in container image is zero, refusing.");
2094 return -EINVAL;
2095 }
e01ff70a 2096
317feb4d
LP
2097 arg_uuid = id;
2098 }
691675ba 2099
e01ff70a
MS
2100 return 0;
2101}
2102
7336138e
LP
2103static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2104 int r;
2105
2106 assert(directory);
2107
0de7acce 2108 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2109 return 0;
2110
2111 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2112 if (r == -EOPNOTSUPP)
2113 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2114 if (r == -EBADE)
2115 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2116 if (r < 0)
2117 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2118 if (r == 0)
2119 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2120 else
2121 log_debug("Patched directory tree to match UID/GID range.");
2122
2123 return r;
2124}
2125
113cea80 2126/*
6d416b9c
LS
2127 * Return values:
2128 * < 0 : wait_for_terminate() failed to get the state of the
2129 * container, the container was terminated by a signal, or
2130 * failed for an unknown reason. No change is made to the
2131 * container argument.
2132 * > 0 : The program executed in the container terminated with an
2133 * error. The exit code of the program executed in the
919699ec
LP
2134 * container is returned. The container argument has been set
2135 * to CONTAINER_TERMINATED.
6d416b9c
LS
2136 * 0 : The container is being rebooted, has been shut down or exited
2137 * successfully. The container argument has been set to either
2138 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2139 *
6d416b9c
LS
2140 * That is, success is indicated by a return value of zero, and an
2141 * error is indicated by a non-zero value.
113cea80
DH
2142 */
2143static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2144 siginfo_t status;
919699ec 2145 int r;
113cea80
DH
2146
2147 r = wait_for_terminate(pid, &status);
f647962d
MS
2148 if (r < 0)
2149 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2150
2151 switch (status.si_code) {
fddbb89c 2152
113cea80 2153 case CLD_EXITED:
b5a2179b 2154 if (status.si_status == 0)
919699ec 2155 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2156 else
919699ec 2157 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2158
919699ec
LP
2159 *container = CONTAINER_TERMINATED;
2160 return status.si_status;
113cea80
DH
2161
2162 case CLD_KILLED:
2163 if (status.si_status == SIGINT) {
919699ec 2164 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2165 *container = CONTAINER_TERMINATED;
919699ec
LP
2166 return 0;
2167
113cea80 2168 } else if (status.si_status == SIGHUP) {
919699ec 2169 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2170 *container = CONTAINER_REBOOTED;
919699ec 2171 return 0;
113cea80 2172 }
919699ec 2173
4831981d 2174 _fallthrough_;
113cea80 2175 case CLD_DUMPED:
fddbb89c 2176 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2177 return -EIO;
113cea80
DH
2178
2179 default:
fddbb89c 2180 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2181 return -EIO;
113cea80 2182 }
113cea80
DH
2183}
2184
023fb90b
LP
2185static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2186 pid_t pid;
2187
4a0b58c4 2188 pid = PTR_TO_PID(userdata);
023fb90b 2189 if (pid > 0) {
c6c8f6e2 2190 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2191 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2192 sd_event_source_set_userdata(s, NULL);
2193 return 0;
2194 }
2195 }
2196
2197 sd_event_exit(sd_event_source_get_event(s), 0);
2198 return 0;
2199}
2200
6916b164 2201static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2202 pid_t pid;
2203
2204 assert(s);
2205 assert(ssi);
2206
2207 pid = PTR_TO_PID(userdata);
2208
6916b164
AU
2209 for (;;) {
2210 siginfo_t si = {};
abdb9b08 2211
6916b164
AU
2212 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2213 return log_error_errno(errno, "Failed to waitid(): %m");
2214 if (si.si_pid == 0) /* No pending children. */
2215 break;
abdb9b08 2216 if (si.si_pid == pid) {
6916b164
AU
2217 /* The main process we care for has exited. Return from
2218 * signal handler but leave the zombie. */
2219 sd_event_exit(sd_event_source_get_event(s), 0);
2220 break;
2221 }
abdb9b08 2222
6916b164
AU
2223 /* Reap all other children. */
2224 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2225 }
2226
2227 return 0;
2228}
2229
abdb9b08
LP
2230static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2231 pid_t pid;
2232
2233 assert(m);
2234
2235 pid = PTR_TO_PID(userdata);
2236
2237 if (arg_kill_signal > 0) {
2238 log_info("Container termination requested. Attempting to halt container.");
2239 (void) kill(pid, arg_kill_signal);
2240 } else {
2241 log_info("Container termination requested. Exiting.");
2242 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2243 }
2244
2245 return 0;
2246}
2247
ec16945e 2248static int determine_names(void) {
1b9cebf6 2249 int r;
ec16945e 2250
c1521918
LP
2251 if (arg_template && !arg_directory && arg_machine) {
2252
2253 /* If --template= was specified then we should not
2254 * search for a machine, but instead create a new one
2255 * in /var/lib/machine. */
2256
605405c6 2257 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2258 if (!arg_directory)
2259 return log_oom();
2260 }
2261
ec16945e 2262 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2263 if (arg_machine) {
2264 _cleanup_(image_unrefp) Image *i = NULL;
2265
2266 r = image_find(arg_machine, &i);
2267 if (r < 0)
2268 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2269 if (r == 0) {
35bca925 2270 log_error("No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2271 return -ENOENT;
2272 }
2273
eb38edce 2274 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2275 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2276 else
0f03c2a4 2277 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2278 if (r < 0)
0f3be6ca 2279 return log_oom();
1b9cebf6 2280
aee327b8
LP
2281 if (!arg_ephemeral)
2282 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2283 } else {
2284 r = safe_getcwd(&arg_directory);
2285 if (r < 0)
2286 return log_error_errno(r, "Failed to determine current directory: %m");
2287 }
ec16945e 2288
0f3be6ca 2289 if (!arg_directory && !arg_image) {
1b9cebf6 2290 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2291 return -EINVAL;
2292 }
2293 }
2294
2295 if (!arg_machine) {
4827ab48 2296
b9ba4dab
LP
2297 if (arg_directory && path_equal(arg_directory, "/"))
2298 arg_machine = gethostname_malloc();
4827ab48
LP
2299 else {
2300 if (arg_image) {
2301 char *e;
2302
2303 arg_machine = strdup(basename(arg_image));
2304
2305 /* Truncate suffix if there is one */
2306 e = endswith(arg_machine, ".raw");
2307 if (e)
2308 *e = 0;
2309 } else
2310 arg_machine = strdup(basename(arg_directory));
2311 }
ec16945e
LP
2312 if (!arg_machine)
2313 return log_oom();
2314
ae691c1d 2315 hostname_cleanup(arg_machine);
ec16945e
LP
2316 if (!machine_name_is_valid(arg_machine)) {
2317 log_error("Failed to determine machine name automatically, please use -M.");
2318 return -EINVAL;
2319 }
b9ba4dab
LP
2320
2321 if (arg_ephemeral) {
2322 char *b;
2323
2324 /* Add a random suffix when this is an
2325 * ephemeral machine, so that we can run many
2326 * instances at once without manually having
2327 * to specify -M each time. */
2328
2329 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2330 return log_oom();
2331
2332 free(arg_machine);
2333 arg_machine = b;
2334 }
ec16945e
LP
2335 }
2336
2337 return 0;
2338}
2339
/* Resolves the path stored in *p with chase_symlinks() and replaces *p by the resolved path.
 *
 * p:     pointer to the path string to update; must not be NULL, but *p may be, in which case nothing is done.
 * flags: passed through to chase_symlinks().
 *
 * Returns 0 if *p is NULL, the logged error if resolving fails, and otherwise the (non-negative) return value
 * of chase_symlinks(). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free_and_replace(*p, resolved);
                return r; /* r might be an fd here in case we ever use CHASE_OPEN in flags */
        }

        return 0;
}
2356
03cfe0d5 2357static int determine_uid_shift(const char *directory) {
6dac160c
LP
2358 int r;
2359
0de7acce 2360 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2361 arg_uid_shift = 0;
6dac160c 2362 return 0;
03cfe0d5 2363 }
6dac160c
LP
2364
2365 if (arg_uid_shift == UID_INVALID) {
2366 struct stat st;
2367
03cfe0d5 2368 r = stat(directory, &st);
6dac160c 2369 if (r < 0)
03cfe0d5 2370 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2371
2372 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2373
2374 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2375 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2376 return -EINVAL;
2377 }
2378
2379 arg_uid_range = UINT32_C(0x10000);
2380 }
2381
2382 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2383 log_error("UID base too high for UID range.");
2384 return -EINVAL;
2385 }
2386
6dac160c
LP
2387 return 0;
2388}
2389
03cfe0d5
LP
/* Final setup step before the container payload is executed: synchronizes with the parent through 'barrier'
 * (sync points #1 to #5 below), sets up mounts, namespaces, /proc/kmsg, resource and security settings,
 * assembles the environment block and finally executes the payload selected by arg_start_mode and
 * arg_parameters.
 *
 * barrier:     barrier shared with the parent process.
 * directory:   only asserted to be non-NULL here.
 * secondary:   if true and no personality was configured explicitly, PER_LINUX32 is selected.
 * kmsg_socket: socket passed to setup_kmsg(); closed afterwards.
 * rtnl_socket: socket passed to expose_port_send_rtnl() if arg_expose_ports is set; closed afterwards.
 * fds:         file descriptors to keep open for the payload; if non-empty, $LISTEN_FDS and $LISTEN_PID are
 *              set.
 *
 * Only returns on failure, with a negative errno-style value; on success one of the exec calls at the end
 * replaces the process. */
static int inner_child(
                Barrier *barrier,
                const char *directory,
                bool secondary,
                int kmsg_socket,
                int rtnl_socket,
                FDSet *fds) {

        _cleanup_free_ char *home = NULL;
        char as_uuid[37];
        size_t n_env = 1; /* envp[0] is statically initialized with $PATH, further entries are appended below */
        const char *envp[] = {
                "PATH=" DEFAULT_PATH_COMPAT,
                NULL, /* container */
                NULL, /* TERM */
                NULL, /* HOME */
                NULL, /* USER */
                NULL, /* LOGNAME */
                NULL, /* container_uuid */
                NULL, /* LISTEN_FDS */
                NULL, /* LISTEN_PID */
                NULL, /* NOTIFY_SOCKET */
                NULL
        };
        const char *exec_target;
        _cleanup_strv_free_ char **env_use = NULL;
        int r;

        assert(barrier);
        assert(directory);
        assert(kmsg_socket >= 0);

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Tell the parent, that it now can write the UID map. */
                (void) barrier_place(barrier); /* #1 */

                /* Wait until the parent wrote the UID map */
                if (!barrier_place_and_sync(barrier)) { /* #2 */
                        log_error("Parent died too early");
                        return -ESRCH;
                }
        }

        r = reset_uid_gid();
        if (r < 0)
                return log_error_errno(r, "Couldn't become new root: %m");

        r = mount_all(NULL,
                      arg_mount_settings | MOUNT_IN_USERNS,
                      arg_uid_shift,
                      arg_uid_range,
                      arg_selinux_apifs_context);
        if (r < 0)
                return r;

        if (!arg_network_namespace_path && arg_private_network) {
                r = unshare(CLONE_NEWNET);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare network namespace: %m");

                /* Tell the parent that it can setup network interfaces. */
                (void) barrier_place(barrier); /* #3 */
        }

        r = mount_sysfs(NULL, arg_mount_settings);
        if (r < 0)
                return r;

        /* Wait until we are cgroup-ified, so that we
         * can mount the right cgroup path writable */
        if (!barrier_place_and_sync(barrier)) { /* #4 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        if (arg_use_cgns && cg_ns_supported()) {
                r = unshare(CLONE_NEWCGROUP);
                if (r < 0)
                        return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
                r = mount_cgroups(
                                "",
                                arg_unified_cgroup_hierarchy,
                                arg_userns_mode != USER_NAMESPACE_NO,
                                arg_uid_shift,
                                arg_uid_range,
                                arg_selinux_apifs_context,
                                true);
                if (r < 0)
                        return r;
        } else {
                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
                if (r < 0)
                        return r;
        }

        r = setup_boot_id();
        if (r < 0)
                return r;

        r = setup_kmsg(kmsg_socket);
        if (r < 0)
                return r;
        kmsg_socket = safe_close(kmsg_socket);

        umask(0022);

        if (setsid() < 0)
                return log_error_errno(errno, "setsid() failed: %m");

        /* The result of loopback_setup() is not checked */
        if (arg_private_network)
                loopback_setup();

        if (arg_expose_ports) {
                r = expose_port_send_rtnl(rtnl_socket);
                if (r < 0)
                        return r;
                rtnl_socket = safe_close(rtnl_socket);
        }

        if (arg_oom_score_adjust_set) {
                r = set_oom_score_adjust(arg_oom_score_adjust);
                if (r < 0)
                        return log_error_errno(r, "Failed to adjust OOM score: %m");
        }

        if (arg_cpuset)
                if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
                        return log_error_errno(errno, "Failed to set CPU affinity: %m");

        r = drop_capabilities();
        if (r < 0)
                return log_error_errno(r, "drop_capabilities() failed: %m");

        /* Failure to set the hostname is not fatal, setup_hostname() logs about it */
        (void) setup_hostname();

        if (arg_personality != PERSONALITY_INVALID) {
                r = safe_personality(arg_personality);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        } else if (secondary) {
                r = safe_personality(PER_LINUX32);
                if (r < 0)
                        return log_error_errno(r, "personality() failed: %m");
        }

#if HAVE_SELINUX
        if (arg_selinux_context)
                if (setexeccon(arg_selinux_context) < 0)
                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif

        r = change_uid_gid(arg_user, &home);
        if (r < 0)
                return r;

        if (arg_no_new_privileges)
                if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
                        return log_error_errno(errno, "Failed to disable new privileges: %m");

        /* LXC sets container=lxc, so follow the scheme here */
        envp[n_env++] = strjoina("container=", arg_container_service_name);

        /* $TERM is passed on only if it is set in our own environment; otherwise the slot is reused by the
         * next entry */
        envp[n_env] = strv_find_prefix(environ, "TERM=");
        if (envp[n_env])
                n_env++;

        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
                return log_oom();

        assert(!sd_id128_is_null(arg_uuid));

        if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
                return log_oom();

        if (fdset_size(fds) > 0) {
                r = fdset_cloexec(fds, false);
                if (r < 0)
                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");

                if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
                    (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
                        return log_oom();
        }
        if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
                return log_oom();

        /* Combine our own environment block with the variables in arg_setenv */
        env_use = strv_env_merge(2, envp, arg_setenv);
        if (!env_use)
                return log_oom();

        /* Let the parent know that we are ready and
         * wait until the parent is ready with the
         * setup, too... */
        if (!barrier_place_and_sync(barrier)) { /* #5 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        if (arg_chdir)
                if (chdir(arg_chdir) < 0)
                        return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);

        if (arg_start_mode == START_PID2) {
                r = stub_pid1(arg_uuid);
                if (r < 0)
                        return r;
        }

        /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
         * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
         * it again. Note that the other fds closed here are at least the locking and barrier fds. */
        log_close();
        log_set_open_when_needed(true);

        (void) fdset_close_others(fds);

        if (arg_start_mode == START_BOOT) {
                char **a;
                size_t m;

                /* Automatically search for the init system */

                /* Build an argument vector with one free slot at the front for the binary path, followed by
                 * arg_parameters and the terminating NULL */
                m = strv_length(arg_parameters);
                a = newa(char*, m + 2);
                memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
                a[1 + m] = NULL;

                /* Each execve() only returns on failure, in which case the next candidate is tried */
                a[0] = (char*) "/usr/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/sbin/init";
                execve(a[0], a, env_use);

                exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
        } else if (!strv_isempty(arg_parameters)) {
                exec_target = arg_parameters[0];
                execvpe(arg_parameters[0], arg_parameters, env_use);
        } else {
                if (!arg_chdir)
                        /* If we cannot change the directory, we'll end up in /, that is expected. */
                        (void) chdir(home ?: "/root");

                /* No command specified: start a login shell, preferring bash over sh */
                execle("/bin/bash", "-bash", NULL, env_use);
                execle("/bin/sh", "-sh", NULL, env_use);

                exec_target = "/bin/bash, /bin/sh";
        }

        /* We only get here if every exec attempt above failed; errno is the one of the last attempt */
        return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
}
2645
9c1e04d0
AP
2646static int setup_sd_notify_child(void) {
2647 static const int one = 1;
2648 int fd = -1;
2649 union sockaddr_union sa = {
2650 .sa.sa_family = AF_UNIX,
2651 };
2652 int r;
2653
2654 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2655 if (fd < 0)
2656 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2657
2658 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2659 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2660
2661 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2662 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2663 if (r < 0) {
2664 safe_close(fd);
2665 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2666 }
2667
adc7d9f0
EV
2668 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2669 if (r < 0) {
2670 safe_close(fd);
2671 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2672 }
2673
9c1e04d0
AP
2674 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2675 if (r < 0) {
2676 safe_close(fd);
2677 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2678 }
2679
2680 return fd;
2681}
2682
03cfe0d5
LP
2683static int outer_child(
2684 Barrier *barrier,
2685 const char *directory,
2686 const char *console,
2d845785 2687 DissectedImage *dissected_image,
03cfe0d5
LP
2688 bool interactive,
2689 bool secondary,
2690 int pid_socket,
e01ff70a 2691 int uuid_socket,
9c1e04d0 2692 int notify_socket,
03cfe0d5
LP
2693 int kmsg_socket,
2694 int rtnl_socket,
825d5287 2695 int uid_shift_socket,
8199d554 2696 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
2697 FDSet *fds,
2698 int netns_fd) {
03cfe0d5 2699
bf428efb
LP
2700 _cleanup_close_ int fd = -1;
2701 int r, which_failed;
03cfe0d5
LP
2702 pid_t pid;
2703 ssize_t l;
03cfe0d5
LP
2704
2705 assert(barrier);
2706 assert(directory);
2707 assert(console);
2708 assert(pid_socket >= 0);
e01ff70a 2709 assert(uuid_socket >= 0);
9c1e04d0 2710 assert(notify_socket >= 0);
03cfe0d5
LP
2711 assert(kmsg_socket >= 0);
2712
2713 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2714 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2715
2716 if (interactive) {
2b33ab09 2717 int terminal;
03cfe0d5 2718
2b33ab09
LP
2719 terminal = open_terminal(console, O_RDWR);
2720 if (terminal < 0)
2721 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 2722
2b33ab09
LP
2723 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2724 if (r < 0)
2725 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
2726 }
2727
2728 r = reset_audit_loginuid();
2729 if (r < 0)
2730 return r;
2731
2732 /* Mark everything as slave, so that we still
2733 * receive mounts from the real root, but don't
2734 * propagate mounts to the real root. */
60e76d48
ZJS
2735 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2736 if (r < 0)
2737 return r;
03cfe0d5 2738
2d845785 2739 if (dissected_image) {
2d3a5a73
LP
2740 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2741 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2742 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2743 * makes sure ESP partitions and userns are compatible. */
2744
2745 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2746 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2747 if (r < 0)
2748 return r;
2749 }
03cfe0d5 2750
391567f4
LP
2751 r = determine_uid_shift(directory);
2752 if (r < 0)
2753 return r;
2754
0de7acce 2755 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2756 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2757 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2758 if (l < 0)
2759 return log_error_errno(errno, "Failed to send UID shift: %m");
2760 if (l != sizeof(arg_uid_shift)) {
2761 log_error("Short write while sending UID shift.");
2762 return -EIO;
2763 }
0e7ac751 2764
0de7acce 2765 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2766 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2767 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2768 * not it will pick a different one, and send it back to us. */
2769
2770 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2771 if (l < 0)
2772 return log_error_errno(errno, "Failed to recv UID shift: %m");
2773 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2774 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2775 return -EIO;
2776 }
2777 }
2778
2779 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2780 }
2781
2d3a5a73
LP
2782 if (dissected_image) {
2783 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2784 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2785 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2786 if (r < 0)
2787 return r;
2788 }
2789
8199d554
LP
2790 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2791 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2792
2793 r = detect_unified_cgroup_hierarchy_from_image(directory);
2794 if (r < 0)
2795 return r;
2796
2797 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2798 if (l < 0)
2799 return log_error_errno(errno, "Failed to send cgroup mode: %m");
2800 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
2801 log_error("Short write while sending cgroup mode: %m");
2802 return -EIO;
2803 }
2804
2805 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2806 }
2807
03cfe0d5 2808 /* Turn directory into bind mount */
60e76d48
ZJS
2809 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2810 if (r < 0)
2811 return r;
03cfe0d5 2812
b53ede69
PW
2813 r = setup_pivot_root(
2814 directory,
2815 arg_pivot_root_new,
2816 arg_pivot_root_old);
2817 if (r < 0)
2818 return r;
2819
0de7acce
LP
2820 r = setup_volatile(
2821 directory,
2822 arg_volatile_mode,
2823 arg_userns_mode != USER_NAMESPACE_NO,
2824 arg_uid_shift,
2825 arg_uid_range,
2826 arg_selinux_context);
03cfe0d5
LP
2827 if (r < 0)
2828 return r;
2829
0de7acce
LP
2830 r = setup_volatile_state(
2831 directory,
2832 arg_volatile_mode,
2833 arg_userns_mode != USER_NAMESPACE_NO,
2834 arg_uid_shift,
2835 arg_uid_range,
2836 arg_selinux_context);
03cfe0d5
LP
2837 if (r < 0)
2838 return r;
2839
4ad14eff
LP
2840 /* Mark everything as shared so our mounts get propagated down. This is
2841 * required to make new bind mounts available in systemd services
2842 * inside the containter that create a new mount namespace.
2843 * See https://github.com/systemd/systemd/issues/3860
2844 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2845 * shared propagation mode. */
4ad14eff
LP
2846 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2847 if (r < 0)
2848 return r;
2849
2850 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2851 if (r < 0)
2852 return r;
2853
03cfe0d5
LP
2854 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2855 if (r < 0)
2856 return r;
2857
03cfe0d5 2858 if (arg_read_only) {
6b7c9f8b 2859 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2860 if (r < 0)
2861 return log_error_errno(r, "Failed to make tree read-only: %m");
2862 }
2863
0de7acce 2864 r = mount_all(directory,
4f086aab 2865 arg_mount_settings,
0de7acce
LP
2866 arg_uid_shift,
2867 arg_uid_range,
2868 arg_selinux_apifs_context);
03cfe0d5
LP
2869 if (r < 0)
2870 return r;
2871
07fa00f9
LP
2872 r = copy_devnodes(directory);
2873 if (r < 0)
03cfe0d5
LP
2874 return r;
2875
2876 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2877
07fa00f9
LP
2878 r = setup_pts(directory);
2879 if (r < 0)
03cfe0d5
LP
2880 return r;
2881
2882 r = setup_propagate(directory);
2883 if (r < 0)
2884 return r;
2885
2886 r = setup_dev_console(directory, console);
2887 if (r < 0)
2888 return r;
2889
8e5430c4
LP
2890 r = setup_keyring();
2891 if (r < 0)
2892 return r;
2893
960e4569 2894 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
2895 if (r < 0)
2896 return r;
2897
2898 r = setup_timezone(directory);
2899 if (r < 0)
2900 return r;
2901
2902 r = setup_resolv_conf(directory);
2903 if (r < 0)
2904 return r;
2905
e01ff70a
MS
2906 r = setup_machine_id(directory);
2907 if (r < 0)
2908 return r;
2909
03cfe0d5
LP
2910 r = setup_journal(directory);
2911 if (r < 0)
2912 return r;
2913
0de7acce
LP
2914 r = mount_custom(
2915 directory,
2916 arg_custom_mounts,
2917 arg_n_custom_mounts,
2918 arg_userns_mode != USER_NAMESPACE_NO,
2919 arg_uid_shift,
2920 arg_uid_range,
2921 arg_selinux_apifs_context);
03cfe0d5
LP
2922 if (r < 0)
2923 return r;
2924
5a8ff0e6 2925 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2926 r = mount_cgroups(
2927 directory,
2928 arg_unified_cgroup_hierarchy,
2929 arg_userns_mode != USER_NAMESPACE_NO,
2930 arg_uid_shift,
2931 arg_uid_range,
5a8ff0e6 2932 arg_selinux_apifs_context,
ada54120 2933 false);
0996ef00
CB
2934 if (r < 0)
2935 return r;
2936 }
03cfe0d5
LP
2937
2938 r = mount_move_root(directory);
2939 if (r < 0)
2940 return log_error_errno(r, "Failed to move root directory: %m");
2941
9c1e04d0
AP
2942 fd = setup_sd_notify_child();
2943 if (fd < 0)
2944 return fd;
2945
bf428efb
LP
2946 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
2947 if (r < 0)
2948 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
2949
03cfe0d5 2950 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2951 arg_clone_ns_flags |
8869a0b4 2952 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2953 if (pid < 0)
2954 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2955 if (pid == 0) {
2956 pid_socket = safe_close(pid_socket);
e01ff70a 2957 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2958 notify_socket = safe_close(notify_socket);
825d5287 2959 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2960
2961 /* The inner child has all namespaces that are
2962 * requested, so that we all are owned by the user if
2963 * user namespaces are turned on. */
2964
d7bea6b6
DP
2965 if (arg_network_namespace_path) {
2966 r = namespace_enter(-1, -1, netns_fd, -1, -1);
2967 if (r < 0)
2968 return r;
2969 }
2970
f757855e 2971 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2972 if (r < 0)
2973 _exit(EXIT_FAILURE);
2974
2975 _exit(EXIT_SUCCESS);
2976 }
2977
2978 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2979 if (l < 0)
2980 return log_error_errno(errno, "Failed to send PID: %m");
2981 if (l != sizeof(pid)) {
2982 log_error("Short write while sending PID.");
2983 return -EIO;
2984 }
2985
e01ff70a
MS
2986 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2987 if (l < 0)
2988 return log_error_errno(errno, "Failed to send machine ID: %m");
2989 if (l != sizeof(arg_uuid)) {
2990 log_error("Short write while sending machine ID.");
2991 return -EIO;
2992 }
2993
9c1e04d0
AP
2994 l = send_one_fd(notify_socket, fd, 0);
2995 if (l < 0)
2996 return log_error_errno(errno, "Failed to send notify fd: %m");
2997
03cfe0d5 2998 pid_socket = safe_close(pid_socket);
e01ff70a 2999 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3000 notify_socket = safe_close(notify_socket);
327e26d6
KN
3001 kmsg_socket = safe_close(kmsg_socket);
3002 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3003 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3004
3005 return 0;
3006}
3007
0e7ac751 3008static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3009 bool tried_hashed = false;
0e7ac751
LP
3010 unsigned n_tries = 100;
3011 uid_t candidate;
3012 int r;
3013
3014 assert(shift);
3015 assert(ret_lock_file);
0de7acce 3016 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3017 assert(arg_uid_range == 0x10000U);
3018
3019 candidate = *shift;
3020
3021 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3022
3023 for (;;) {
fbd0b64f 3024 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3025 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3026
3027 if (--n_tries <= 0)
3028 return -EBUSY;
3029
87d5e4f2 3030 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3031 goto next;
3032 if ((candidate & UINT32_C(0xFFFF)) != 0)
3033 goto next;
3034
3035 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3036 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3037 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3038 goto next;
3039 if (r < 0)
3040 return r;
3041
3042 /* Make some superficial checks whether the range is currently known in the user database */
3043 if (getpwuid(candidate))
3044 goto next;
3045 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3046 goto next;
3047 if (getgrgid(candidate))
3048 goto next;
3049 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3050 goto next;
3051
3052 *ret_lock_file = lf;
3053 lf = (struct LockFile) LOCK_FILE_INIT;
3054 *shift = candidate;
3055 return 0;
3056
3057 next:
d381c8a6
LP
3058 if (arg_machine && !tried_hashed) {
3059 /* Try to hash the base from the container name */
3060
3061 static const uint8_t hash_key[] = {
3062 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3063 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3064 };
3065
3066 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3067
3068 tried_hashed = true;
3069 } else
3070 random_bytes(&candidate, sizeof(candidate));
3071
87d5e4f2 3072 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3073 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3074 }
3075}
3076
03cfe0d5 3077static int setup_uid_map(pid_t pid) {
fbd0b64f 3078 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3079 int r;
3080
3081 assert(pid > 1);
3082
3083 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3084 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3085 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3086 if (r < 0)
3087 return log_error_errno(r, "Failed to write UID map: %m");
3088
3089 /* We always assign the same UID and GID ranges */
3090 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3091 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3092 if (r < 0)
3093 return log_error_errno(r, "Failed to write GID map: %m");
3094
3095 return 0;
3096}
3097
9c1e04d0 3098static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3099 char buf[NOTIFY_BUFFER_MAX+1];
3100 char *p = NULL;
3101 struct iovec iovec = {
3102 .iov_base = buf,
3103 .iov_len = sizeof(buf)-1,
3104 };
3105 union {
3106 struct cmsghdr cmsghdr;
3107 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3108 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3109 } control = {};
3110 struct msghdr msghdr = {
3111 .msg_iov = &iovec,
3112 .msg_iovlen = 1,
3113 .msg_control = &control,
3114 .msg_controllen = sizeof(control),
3115 };
3116 struct cmsghdr *cmsg;
3117 struct ucred *ucred = NULL;
3118 ssize_t n;
3119 pid_t inner_child_pid;
3120 _cleanup_strv_free_ char **tags = NULL;
3121
3122 assert(userdata);
3123
3124 inner_child_pid = PTR_TO_PID(userdata);
3125
3126 if (revents != EPOLLIN) {
3127 log_warning("Got unexpected poll event for notify fd.");
3128 return 0;
3129 }
3130
3131 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3132 if (n < 0) {
3742095b 3133 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3134 return 0;
3135
3136 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3137 }
3138 cmsg_close_all(&msghdr);
3139
3140 CMSG_FOREACH(cmsg, &msghdr) {
3141 if (cmsg->cmsg_level == SOL_SOCKET &&
3142 cmsg->cmsg_type == SCM_CREDENTIALS &&
3143 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3144
3145 ucred = (struct ucred*) CMSG_DATA(cmsg);
3146 }
3147 }
3148
3149 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3150 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3151 return 0;
3152 }
3153
3154 if ((size_t) n >= sizeof(buf)) {
3155 log_warning("Received notify message exceeded maximum size. Ignoring.");
3156 return 0;
3157 }
3158
3159 buf[n] = 0;
3160 tags = strv_split(buf, "\n\r");
3161 if (!tags)
3162 return log_oom();
3163
3164 if (strv_find(tags, "READY=1"))
3165 sd_notifyf(false, "READY=1\n");
3166
3167 p = strv_find_startswith(tags, "STATUS=");
3168 if (p)
3169 sd_notifyf(false, "STATUS=Container running: %s", p);
3170
3171 return 0;
3172}
3173
5773024d 3174static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3175 int r;
9c1e04d0 3176
5773024d 3177 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3178 if (r < 0)
3179 return log_error_errno(r, "Failed to allocate notify event source: %m");
3180
5773024d 3181 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3182
3183 return 0;
3184}
3185
5d961407
LP
3186static int merge_settings(Settings *settings, const char *path) {
3187 int rl;
f757855e 3188
5d961407
LP
3189 assert(settings);
3190 assert(path);
f757855e 3191
5d961407
LP
3192 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3193 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3194
7732f92b
LP
3195 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3196 settings->start_mode >= 0) {
3197 arg_start_mode = settings->start_mode;
130d3d22 3198 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3199 }
3200
b53ede69
PW
3201 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3202 settings->pivot_root_new) {
3203 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3204 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3205 }
3206
5f932eb9 3207 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3208 settings->working_directory)
3209 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3210
f757855e 3211 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3212 settings->environment)
3213 strv_free_and_replace(arg_setenv, settings->environment);
f757855e
LP
3214
3215 if ((arg_settings_mask & SETTING_USER) == 0 &&
1cc6c93a
YW
3216 settings->user)
3217 free_and_replace(arg_user, settings->user);
f757855e
LP
3218
3219 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3220 uint64_t plus;
f757855e 3221
0e265674
LP
3222 plus = settings->capability;
3223 if (settings_private_network(settings))
3224 plus |= (1ULL << CAP_NET_ADMIN);
3225
3226 if (!arg_settings_trusted && plus != 0) {
3227 if (settings->capability != 0)
5d961407 3228 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3229 } else
520e0d54 3230 arg_caps_retain |= plus;
f757855e 3231
520e0d54 3232 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3233 }
3234
3235 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3236 settings->kill_signal > 0)
3237 arg_kill_signal = settings->kill_signal;
3238
3239 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3240 settings->personality != PERSONALITY_INVALID)
3241 arg_personality = settings->personality;
3242
3243 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3244 !sd_id128_is_null(settings->machine_id)) {
3245
3246 if (!arg_settings_trusted)
5d961407 3247 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3248 else
3249 arg_uuid = settings->machine_id;
3250 }
3251
3252 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3253 settings->read_only >= 0)
3254 arg_read_only = settings->read_only;
3255
3256 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3257 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3258 arg_volatile_mode = settings->volatile_mode;
3259
3260 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3261 settings->n_custom_mounts > 0) {
3262
3263 if (!arg_settings_trusted)
5d961407 3264 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3265 else {
3266 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3267 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3268 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3269 settings->n_custom_mounts = 0;
3270 }
3271 }
3272
3273 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3274 (settings->private_network >= 0 ||
3275 settings->network_veth >= 0 ||
3276 settings->network_bridge ||
22b28dfd 3277 settings->network_zone ||
f757855e
LP
3278 settings->network_interfaces ||
3279 settings->network_macvlan ||
f6d6bad1
LP
3280 settings->network_ipvlan ||
3281 settings->network_veth_extra)) {
f757855e
LP
3282
3283 if (!arg_settings_trusted)
5d961407 3284 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3285 else {
f6d6bad1 3286 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3287 arg_private_network = settings_private_network(settings);
3288
130d3d22
YW
3289 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3290 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3291 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3292 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3293
1cc6c93a
YW
3294 free_and_replace(arg_network_bridge, settings->network_bridge);
3295 free_and_replace(arg_network_zone, settings->network_zone);
f757855e
LP
3296 }
3297 }
3298
3299 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3300 settings->expose_ports) {
3301
3302 if (!arg_settings_trusted)
5d961407 3303 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3304 else {
3305 expose_port_free_all(arg_expose_ports);
1cc6c93a 3306 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3307 }
3308 }
3309
0de7acce
LP
3310 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3311 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3312
3313 if (!arg_settings_trusted)
5d961407 3314 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3315 else {
3316 arg_userns_mode = settings->userns_mode;
3317 arg_uid_shift = settings->uid_shift;
3318 arg_uid_range = settings->uid_range;
3319 arg_userns_chown = settings->userns_chown;
3320 }
3321 }
3322
9c1e04d0
AP
3323 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3324 arg_notify_ready = settings->notify_ready;
3325
960e4569
LP
3326 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3327
3328 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
5d961407 3329 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3330 else {
130d3d22
YW
3331 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3332 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569
LP
3333 }
3334 }
3335
bf428efb
LP
3336 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3337 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3338 continue;
3339
3340 if (!settings->rlimit[rl])
3341 continue;
3342
3343 if (!arg_settings_trusted) {
5d961407 3344 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3345 continue;
3346 }
3347
3348 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3349 }
3350
3a9530e5
LP
3351 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3352 settings->hostname)
3353 free_and_replace(arg_hostname, settings->hostname);
3354
66edd963
LP
3355 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3356 settings->no_new_privileges >= 0)
3357 arg_no_new_privileges = settings->no_new_privileges;
3358
81f345df
LP
3359 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3360 settings->oom_score_adjust_set) {
3361
3362 if (!arg_settings_trusted)
5d961407 3363 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3364 else {
3365 arg_oom_score_adjust = settings->oom_score_adjust;
3366 arg_oom_score_adjust_set = true;
3367 }
3368 }
3369
d107bb7d
LP
3370 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3371 settings->cpuset) {
3372
3373 if (!arg_settings_trusted)
5d961407 3374 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3375 else {
3376 if (arg_cpuset)
3377 CPU_FREE(arg_cpuset);
3378 arg_cpuset = TAKE_PTR(settings->cpuset);
3379 arg_cpuset_ncpus = settings->cpuset_ncpus;
3380 }
3381 }
3382
f757855e
LP
3383 return 0;
3384}
3385
5d961407
LP
3386static int load_settings(void) {
3387 _cleanup_(settings_freep) Settings *settings = NULL;
3388 _cleanup_fclose_ FILE *f = NULL;
3389 _cleanup_free_ char *p = NULL;
3390 const char *fn, *i;
3391 int r;
3392
3393 /* If all settings are masked, there's no point in looking for
3394 * the settings file */
3395 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3396 return 0;
3397
3398 fn = strjoina(arg_machine, ".nspawn");
3399
3400 /* We first look in the admin's directories in /etc and /run */
3401 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3402 _cleanup_free_ char *j = NULL;
3403
3404 j = strjoin(i, "/", fn);
3405 if (!j)
3406 return log_oom();
3407
3408 f = fopen(j, "re");
3409 if (f) {
3410 p = TAKE_PTR(j);
3411
3412 /* By default, we trust configuration from /etc and /run */
3413 if (arg_settings_trusted < 0)
3414 arg_settings_trusted = true;
3415
3416 break;
3417 }
3418
3419 if (errno != ENOENT)
3420 return log_error_errno(errno, "Failed to open %s: %m", j);
3421 }
3422
3423 if (!f) {
3424 /* After that, let's look for a file next to the
3425 * actual image we shall boot. */
3426
3427 if (arg_image) {
3428 p = file_in_same_dir(arg_image, fn);
3429 if (!p)
3430 return log_oom();
3431 } else if (arg_directory) {
3432 p = file_in_same_dir(arg_directory, fn);
3433 if (!p)
3434 return log_oom();
3435 }
3436
3437 if (p) {
3438 f = fopen(p, "re");
3439 if (!f && errno != ENOENT)
3440 return log_error_errno(errno, "Failed to open %s: %m", p);
3441
3442 /* By default, we do not trust configuration from /var/lib/machines */
3443 if (arg_settings_trusted < 0)
3444 arg_settings_trusted = false;
3445 }
3446 }
3447
3448 if (!f)
3449 return 0;
3450
3451 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3452
3453 r = settings_load(f, p, &settings);
3454 if (r < 0)
3455 return r;
3456
3457 return merge_settings(settings, p);
3458}
3459
b0067625
ZJS
3460static int run(int master,
3461 const char* console,
2d845785 3462 DissectedImage *dissected_image,
b0067625
ZJS
3463 bool interactive,
3464 bool secondary,
3465 FDSet *fds,
3466 char veth_name[IFNAMSIZ], bool *veth_created,
3467 union in_addr_union *exposed,
3468 pid_t *pid, int *ret) {
3469
3470 static const struct sigaction sa = {
3471 .sa_handler = nop_signal_handler,
e28c7cd0 3472 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3473 };
3474
8e766630 3475 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
3476 _cleanup_close_ int etc_passwd_lock = -1;
3477 _cleanup_close_pair_ int
3478 kmsg_socket_pair[2] = { -1, -1 },
3479 rtnl_socket_pair[2] = { -1, -1 },
3480 pid_socket_pair[2] = { -1, -1 },
3481 uuid_socket_pair[2] = { -1, -1 },
3482 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
3483 uid_shift_socket_pair[2] = { -1, -1 },
3484 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3485
b0067625
ZJS
3486 _cleanup_close_ int notify_socket= -1;
3487 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3488 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3489 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3490 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3491 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 3492 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625
ZJS
3493 ContainerStatus container_status = 0;
3494 char last_char = 0;
3495 int ifi = 0, r;
3496 ssize_t l;
3497 sigset_t mask_chld;
d7bea6b6 3498 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
3499
3500 assert_se(sigemptyset(&mask_chld) == 0);
3501 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3502
3503 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3504 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3505 * check with getpwuid() if the specific user already exists. Note that /etc might be
3506 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3507 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3508 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3509 * really ours. */
3510
3511 etc_passwd_lock = take_etc_passwd_lock(NULL);
3512 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3513 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3514 }
3515
3516 r = barrier_create(&barrier);
3517 if (r < 0)
3518 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3519
3520 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3521 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3522
3523 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3524 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3525
3526 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3527 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3528
3529 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3530 return log_error_errno(errno, "Failed to create id socket pair: %m");
3531
3532 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3533 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3534
3535 if (arg_userns_mode != USER_NAMESPACE_NO)
3536 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3537 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3538
8199d554
LP
3539 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3540 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3541 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3542
b0067625
ZJS
3543 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3544 * parent's blocking calls and give it a chance to call wait() and terminate. */
3545 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3546 if (r < 0)
3547 return log_error_errno(errno, "Failed to change the signal mask: %m");
3548
3549 r = sigaction(SIGCHLD, &sa, NULL);
3550 if (r < 0)
3551 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3552
d7bea6b6
DP
3553 if (arg_network_namespace_path) {
3554 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3555 if (netns_fd < 0)
3556 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3557
3558 r = fd_is_network_ns(netns_fd);
3559 if (r < 0 && r != -ENOTTY)
3560 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
3561 if (r == 0) {
3562 log_error("Path %s doesn't refer to a network namespace", arg_network_namespace_path);
3563 return -EINVAL;
3564 }
3565 }
3566
b0067625
ZJS
3567 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3568 if (*pid < 0)
3569 return log_error_errno(errno, "clone() failed%s: %m",
3570 errno == EINVAL ?
3571 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3572
3573 if (*pid == 0) {
3574 /* The outer child only has a file system namespace. */
3575 barrier_set_role(&barrier, BARRIER_CHILD);
3576
3577 master = safe_close(master);
3578
3579 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3580 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3581 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3582 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3583 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3584 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 3585 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
3586
3587 (void) reset_all_signal_handlers();
3588 (void) reset_signal_mask();
3589
3590 r = outer_child(&barrier,
3591 arg_directory,
3592 console,
2d845785 3593 dissected_image,
b0067625
ZJS
3594 interactive,
3595 secondary,
3596 pid_socket_pair[1],
3597 uuid_socket_pair[1],
3598 notify_socket_pair[1],
3599 kmsg_socket_pair[1],
3600 rtnl_socket_pair[1],
3601 uid_shift_socket_pair[1],
8199d554 3602 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
3603 fds,
3604 netns_fd);
b0067625
ZJS
3605 if (r < 0)
3606 _exit(EXIT_FAILURE);
3607
3608 _exit(EXIT_SUCCESS);
3609 }
3610
3611 barrier_set_role(&barrier, BARRIER_PARENT);
3612
3613 fds = fdset_free(fds);
3614
3615 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3616 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3617 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3618 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3619 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3620 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 3621 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
3622
3623 if (arg_userns_mode != USER_NAMESPACE_NO) {
3624 /* The child just let us know the UID shift it might have read from the image. */
3625 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3626 if (l < 0)
3627 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3628 if (l != sizeof arg_uid_shift) {
3629 log_error("Short read while reading UID shift.");
3630 return -EIO;
3631 }
3632
3633 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3634 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3635 * image, but if that's already in use, pick a new one, and report back to the child,
3636 * which one we now picked. */
3637
3638 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3639 if (r < 0)
3640 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3641
3642 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3643 if (l < 0)
3644 return log_error_errno(errno, "Failed to send UID shift: %m");
3645 if (l != sizeof arg_uid_shift) {
3646 log_error("Short write while writing UID shift.");
3647 return -EIO;
3648 }
3649 }
3650 }
3651
8199d554
LP
3652 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3653 /* The child let us know the support cgroup mode it might have read from the image. */
3654 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3655 if (l < 0)
3656 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3657 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
3658 log_error("Short read while reading cgroup mode.");
3659 return -EIO;
3660 }
3661 }
3662
b0067625 3663 /* Wait for the outer child. */
d2e0ac3d
LP
3664 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3665 if (r < 0)
3666 return r;
3667 if (r != EXIT_SUCCESS)
3668 return -EIO;
b0067625
ZJS
3669
3670 /* And now retrieve the PID of the inner child. */
3671 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3672 if (l < 0)
3673 return log_error_errno(errno, "Failed to read inner child PID: %m");
3674 if (l != sizeof *pid) {
3675 log_error("Short read while reading inner child PID.");
3676 return -EIO;
3677 }
3678
3679 /* We also retrieve container UUID in case it was generated by outer child */
3680 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3681 if (l < 0)
3682 return log_error_errno(errno, "Failed to read container machine ID: %m");
3683 if (l != sizeof(arg_uuid)) {
3684 log_error("Short read while reading container machined ID.");
3685 return -EIO;
3686 }
3687
3688 /* We also retrieve the socket used for notifications generated by outer child */
3689 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3690 if (notify_socket < 0)
3691 return log_error_errno(notify_socket,
3692 "Failed to receive notification socket from the outer child: %m");
3693
3694 log_debug("Init process invoked as PID "PID_FMT, *pid);
3695
3696 if (arg_userns_mode != USER_NAMESPACE_NO) {
3697 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3698 log_error("Child died too early.");
3699 return -ESRCH;
3700 }
3701
3702 r = setup_uid_map(*pid);
3703 if (r < 0)
3704 return r;
3705
3706 (void) barrier_place(&barrier); /* #2 */
3707 }
3708
3709 if (arg_private_network) {
3710
75116558
PS
3711 if (!arg_network_namespace_path) {
3712 /* Wait until the child has unshared its network namespace. */
3713 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3714 log_error("Child died too early");
3715 return -ESRCH;
3716 }
3717 }
3718
b0067625
ZJS
3719 r = move_network_interfaces(*pid, arg_network_interfaces);
3720 if (r < 0)
3721 return r;
3722
3723 if (arg_network_veth) {
3724 r = setup_veth(arg_machine, *pid, veth_name,
3725 arg_network_bridge || arg_network_zone);
3726 if (r < 0)
3727 return r;
3728 else if (r > 0)
3729 ifi = r;
3730
3731 if (arg_network_bridge) {
3732 /* Add the interface to a bridge */
3733 r = setup_bridge(veth_name, arg_network_bridge, false);
3734 if (r < 0)
3735 return r;
3736 if (r > 0)
3737 ifi = r;
3738 } else if (arg_network_zone) {
3739 /* Add the interface to a bridge, possibly creating it */
3740 r = setup_bridge(veth_name, arg_network_zone, true);
3741 if (r < 0)
3742 return r;
3743 if (r > 0)
3744 ifi = r;
3745 }
3746 }
3747
3748 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3749 if (r < 0)
3750 return r;
3751
3752 /* We created the primary and extra veth links now; let's remember this, so that we know to
3753 remove them later on. Note that we don't bother with removing veth links that were created
3754 here when their setup failed half-way, because in that case the kernel should be able to
3755 remove them on its own, since they cannot be referenced by anything yet. */
3756 *veth_created = true;
3757
3758 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3759 if (r < 0)
3760 return r;
3761
3762 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3763 if (r < 0)
3764 return r;
3765 }
3766
abdb9b08
LP
3767 if (arg_register || !arg_keep_unit) {
3768 r = sd_bus_default_system(&bus);
3769 if (r < 0)
3770 return log_error_errno(r, "Failed to open system bus: %m");
3771 }
3772
3773 if (!arg_keep_unit) {
3774 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3775 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3776 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3777
75152a4d
LP
3778 r = sd_bus_match_signal_async(
3779 bus,
3780 NULL,
3781 "org.freedesktop.systemd1",
3782 NULL,
3783 "org.freedesktop.systemd1.Scope",
3784 "RequestStop",
3785 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 3786 if (r < 0)
75152a4d 3787 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
3788 }
3789
b0067625 3790 if (arg_register) {
abdb9b08 3791
b0067625 3792 r = register_machine(
abdb9b08 3793 bus,
b0067625
ZJS
3794 arg_machine,
3795 *pid,
3796 arg_directory,
3797 arg_uuid,
3798 ifi,
3799 arg_slice,
3800 arg_custom_mounts, arg_n_custom_mounts,
3801 arg_kill_signal,
3802 arg_property,
3803 arg_keep_unit,
3804 arg_container_service_name);
3805 if (r < 0)
3806 return r;
abdb9b08 3807
cd2dfc6f 3808 } else if (!arg_keep_unit) {
abdb9b08 3809
cd2dfc6f 3810 r = allocate_scope(
abdb9b08 3811 bus,
cd2dfc6f
LP
3812 arg_machine,
3813 *pid,
3814 arg_slice,
3815 arg_custom_mounts, arg_n_custom_mounts,
3816 arg_kill_signal,
3817 arg_property);
3818 if (r < 0)
3819 return r;
3820
3821 } else if (arg_slice || arg_property)
3822 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3823
f0bef277 3824 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3825 if (r < 0)
3826 return r;
3827
720f0a2f
LP
3828 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
3829 if (r < 0)
3830 return r;
b0067625 3831
de54e02d 3832 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3833 if (r < 0)
3834 return r;
3835
3836 /* Notify the child that the parent is ready with all
3837 * its setup (including cgroup-ification), and that
3838 * the child can now hand over control to the code to
3839 * run inside the container. */
75116558 3840 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
3841
3842 /* Block SIGCHLD here, before notifying child.
3843 * process_pty() will handle it with the other signals. */
3844 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3845
3846 /* Reset signal to default */
3847 r = default_signals(SIGCHLD, -1);
3848 if (r < 0)
3849 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3850
3851 r = sd_event_new(&event);
3852 if (r < 0)
3853 return log_error_errno(r, "Failed to get default event source: %m");
3854
8fd010bb
LP
3855 (void) sd_event_set_watchdog(event, true);
3856
abdb9b08
LP
3857 if (bus) {
3858 r = sd_bus_attach_event(bus, event, 0);
3859 if (r < 0)
3860 return log_error_errno(r, "Failed to attach bus to event loop: %m");
3861 }
3862
5773024d 3863 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3864 if (r < 0)
3865 return r;
3866
3867 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 3868 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
3869 log_error("Child died too early.");
3870 return -ESRCH;
3871 }
3872
3873 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3874 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3875 etc_passwd_lock = safe_close(etc_passwd_lock);
3876
3877 sd_notifyf(false,
3878 "STATUS=Container running.\n"
3879 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3880 if (!arg_notify_ready)
919f5ae0 3881 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
3882
3883 if (arg_kill_signal > 0) {
3884 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
3885 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3886 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
3887 } else {
3888 /* Immediately exit */
919f5ae0
LP
3889 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3890 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
3891 }
3892
6916b164 3893 /* Exit when the child exits */
919f5ae0 3894 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3895
3896 if (arg_expose_ports) {
3897 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3898 if (r < 0)
3899 return r;
3900
3901 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3902 }
3903
3904 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3905
3906 r = pty_forward_new(event, master,
3907 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3908 &forward);
3909 if (r < 0)
3910 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3911
3912 r = sd_event_loop(event);
3913 if (r < 0)
3914 return log_error_errno(r, "Failed to run event loop: %m");
3915
3916 pty_forward_get_last_char(forward, &last_char);
3917
3918 forward = pty_forward_free(forward);
3919
3920 if (!arg_quiet && last_char != '\n')
3921 putc('\n', stdout);
3922
3923 /* Kill if it is not dead yet anyway */
abdb9b08
LP
3924 if (arg_register && !arg_keep_unit && bus)
3925 terminate_machine(bus, *pid);
b0067625
ZJS
3926
3927 /* Normally redundant, but better safe than sorry */
c67b0082 3928 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3929
3930 r = wait_for_container(*pid, &container_status);
3931 *pid = 0;
3932
3933 if (r < 0)
3934 /* We failed to wait for the container, or the container exited abnormally. */
3935 return r;
3936 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3937 /* r > 0 → The container exited with a non-zero status.
3938 * As a special case, we need to replace 133 with a different value,
3939 * because 133 is special-cased in the service file to reboot the container.
3940 * otherwise → The container exited with zero status and a reboot was not requested.
3941 */
2a49b612 3942 if (r == EXIT_FORCE_RESTART)
27e29a1e 3943 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3944 *ret = r;
b0067625
ZJS
3945 return 0; /* finito */
3946 }
3947
3948 /* CONTAINER_REBOOTED, loop again */
3949
3950 if (arg_keep_unit) {
3951 /* Special handling if we are running as a service: instead of simply
3952 * restarting the machine we want to restart the entire service, so let's
3953 * inform systemd about this with the special exit code 133. The service
3954 * file uses RestartForceExitStatus=133 so that this results in a full
3955 * nspawn restart. This is necessary since we might have cgroup parameters
3956 * set we want to have flushed out. */
2a49b612
ZJS
3957 *ret = EXIT_FORCE_RESTART;
3958 return 0; /* finito */
b0067625
ZJS
3959 }
3960
3961 expose_port_flush(arg_expose_ports, exposed);
3962
3963 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3964 *veth_created = false;
3965 return 1; /* loop again */
3966}
3967
bf428efb
LP
3968static int initialize_rlimits(void) {
3969
3970 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
3971 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
3972 * container execution environments. */
3973
3974 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
3975 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
3976 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
3977 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
3978 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
3979 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
3980 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
3981 [RLIMIT_MEMLOCK] = { 65536, 65536 },
3982 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
3983 [RLIMIT_NICE] = { 0, 0 },
3984 [RLIMIT_NOFILE] = { 1024, 4096 },
3985 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
3986 [RLIMIT_RTPRIO] = { 0, 0 },
3987 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
3988 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
3989
3990 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
3991 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
3992 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
3993 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
3994 * that PID 1 changes a number of other resource limits during early initialization which is why we
3995 * don't read the other limits from PID 1 but prefer the static table above. */
3996 };
3997
3998 int rl;
3999
4000 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
4001
4002 /* Let's only fill in what the user hasn't explicitly configured anyway */
4003 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4004 const struct rlimit *v;
4005 struct rlimit buffer;
4006
4007 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4008 /* For these two let's read the limits off PID 1. See above for an explanation. */
4009
4010 if (prlimit(1, rl, NULL, &buffer) < 0)
4011 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4012
4013 v = &buffer;
4014 } else
4015 v = kernel_defaults + rl;
4016
4017 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4018 if (!arg_rlimit[rl])
4019 return log_oom();
4020 }
4021
4022 if (DEBUG_LOGGING) {
4023 _cleanup_free_ char *k = NULL;
4024
4025 (void) rlimit_format(arg_rlimit[rl], &k);
4026 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4027 }
4028 }
4029
4030 return 0;
4031}
4032
03cfe0d5
LP
4033int main(int argc, char *argv[]) {
4034
2d845785
LP
4035 _cleanup_free_ char *console = NULL;
4036 _cleanup_close_ int master = -1;
03cfe0d5 4037 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4038 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4039 char veth_name[IFNAMSIZ] = "";
17cbb288 4040 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4041 pid_t pid = 0;
03cfe0d5 4042 union in_addr_union exposed = {};
8e766630 4043 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4044 bool interactive, veth_created = false, remove_tmprootdir = false;
4045 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4046 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4047 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4048 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4049
4050 log_parse_environment();
4051 log_open();
415fc41c 4052
7732f92b
LP
4053 /* Make sure rename_process() in the stub init process can work */
4054 saved_argv = argv;
4055 saved_argc = argc;
4056
03cfe0d5
LP
4057 r = parse_argv(argc, argv);
4058 if (r <= 0)
4059 goto finish;
4060
fba868fa
LP
4061 r = must_be_root();
4062 if (r < 0)
03cfe0d5 4063 goto finish;
fba868fa 4064
bf428efb
LP
4065 r = initialize_rlimits();
4066 if (r < 0)
4067 goto finish;
4068
f757855e
LP
4069 r = determine_names();
4070 if (r < 0)
4071 goto finish;
4072
4073 r = load_settings();
4074 if (r < 0)
4075 goto finish;
4076
4077 r = verify_arguments();
4078 if (r < 0)
4079 goto finish;
03cfe0d5 4080
8199d554
LP
4081 r = detect_unified_cgroup_hierarchy_from_environment();
4082 if (r < 0)
4083 goto finish;
4084
03cfe0d5
LP
4085 n_fd_passed = sd_listen_fds(false);
4086 if (n_fd_passed > 0) {
4087 r = fdset_new_listen_fds(&fds, false);
4088 if (r < 0) {
4089 log_error_errno(r, "Failed to collect file descriptors: %m");
4090 goto finish;
4091 }
4092 }
4093
4094 if (arg_directory) {
4095 assert(!arg_image);
4096
4097 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4098 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4099 r = -EINVAL;
4100 goto finish;
4101 }
4102
4103 if (arg_ephemeral) {
4104 _cleanup_free_ char *np = NULL;
4105
8d4aa2bb 4106 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4107 if (r < 0)
4108 goto finish;
4109
03cfe0d5
LP
4110 /* If the specified path is a mount point we
4111 * generate the new snapshot immediately
4112 * inside it under a random name. However if
4113 * the specified is not a mount point we
4114 * create the new snapshot in the parent
4115 * directory, just next to it. */
e1873695 4116 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4117 if (r < 0) {
4118 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4119 goto finish;
4120 }
4121 if (r > 0)
770b5ce4 4122 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4123 else
770b5ce4 4124 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4125 if (r < 0) {
0f3be6ca 4126 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4127 goto finish;
4128 }
4129
4130 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4131 if (r < 0) {
4132 log_error_errno(r, "Failed to lock %s: %m", np);
4133 goto finish;
4134 }
4135
17cbb288
LP
4136 r = btrfs_subvol_snapshot(arg_directory, np,
4137 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4138 BTRFS_SNAPSHOT_FALLBACK_COPY |
4139 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4140 BTRFS_SNAPSHOT_RECURSIVE |
4141 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4142 if (r < 0) {
4143 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4144 goto finish;
ec16945e
LP
4145 }
4146
1cc6c93a 4147 free_and_replace(arg_directory, np);
ec16945e 4148
17cbb288 4149 remove_directory = true;
30535c16
LP
4150
4151 } else {
cb638b5e 4152 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4153 if (r < 0)
4154 goto finish;
4155
30535c16
LP
4156 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4157 if (r == -EBUSY) {
4158 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4159 goto finish;
4160 }
4161 if (r < 0) {
4162 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4163 goto finish;
30535c16
LP
4164 }
4165
4166 if (arg_template) {
8d4aa2bb 4167 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4168 if (r < 0)
4169 goto finish;
4170
17cbb288
LP
4171 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4172 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4173 BTRFS_SNAPSHOT_FALLBACK_COPY |
4174 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4175 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4176 BTRFS_SNAPSHOT_RECURSIVE |
4177 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4178 if (r == -EEXIST) {
4179 if (!arg_quiet)
4180 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4181 } else if (r < 0) {
83521414 4182 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4183 goto finish;
4184 } else {
4185 if (!arg_quiet)
4186 log_info("Populated %s from template %s.", arg_directory, arg_template);
4187 }
4188 }
ec16945e
LP
4189 }
4190
7732f92b 4191 if (arg_start_mode == START_BOOT) {
c9fe05e0
AR
4192 const char *p;
4193
4194 if (arg_pivot_root_new)
4195 p = prefix_roota(arg_directory, arg_pivot_root_new);
4196 else
4197 p = arg_directory;
4198
4199 if (path_is_os_tree(p) <= 0) {
4200 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4201 r = -EINVAL;
1b9e5b12
LP
4202 goto finish;
4203 }
4204 } else {
c9fe05e0
AR
4205 const char *p, *q;
4206
4207 if (arg_pivot_root_new)
4208 p = prefix_roota(arg_directory, arg_pivot_root_new);
4209 else
4210 p = arg_directory;
4211
4212 q = strjoina(p, "/usr/");
1b9e5b12 4213
c9fe05e0
AR
4214 if (laccess(q, F_OK) < 0) {
4215 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4216 r = -EINVAL;
1b9e5b12 4217 goto finish;
1b9e5b12
LP
4218 }
4219 }
ec16945e 4220
6b9132a9 4221 } else {
ec16945e
LP
4222 assert(arg_image);
4223 assert(!arg_template);
4224
8d4aa2bb 4225 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4226 if (r < 0)
4227 goto finish;
4228
0f3be6ca
LP
4229 if (arg_ephemeral) {
4230 _cleanup_free_ char *np = NULL;
4231
4232 r = tempfn_random(arg_image, "machine.", &np);
4233 if (r < 0) {
4234 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4235 goto finish;
4236 }
4237
4238 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4239 if (r < 0) {
4240 r = log_error_errno(r, "Failed to create image lock: %m");
4241 goto finish;
4242 }
4243
1c876927 4244 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
4245 if (r < 0) {
4246 r = log_error_errno(r, "Failed to copy image file: %m");
4247 goto finish;
4248 }
4249
1cc6c93a 4250 free_and_replace(arg_image, np);
0f3be6ca
LP
4251
4252 remove_image = true;
4253 } else {
4254 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4255 if (r == -EBUSY) {
4256 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4257 goto finish;
4258 }
4259 if (r < 0) {
4260 r = log_error_errno(r, "Failed to create image lock: %m");
4261 goto finish;
4262 }
4623e8e6 4263
78ebe980
LP
4264 if (!arg_root_hash) {
4265 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4266 if (r < 0) {
4267 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4268 goto finish;
4269 }
4270 }
30535c16
LP
4271 }
4272
c67b0082 4273 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4274 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4275 goto finish;
1b9e5b12 4276 }
6b9132a9 4277
c67b0082
LP
4278 remove_tmprootdir = true;
4279
4280 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4281 if (!arg_directory) {
4282 r = log_oom();
4283 goto finish;
6b9132a9 4284 }
88213476 4285
2d845785
LP
4286 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4287 if (r < 0) {
4288 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4289 goto finish;
4290 }
1b9e5b12 4291
4526113f 4292 r = dissect_image_and_warn(
e0f9e7bd 4293 loop->fd,
4526113f 4294 arg_image,
e0f9e7bd
LP
4295 arg_root_hash, arg_root_hash_size,
4296 DISSECT_IMAGE_REQUIRE_ROOT,
4297 &dissected_image);
2d845785 4298 if (r == -ENOPKG) {
4526113f 4299 /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
2d845785
LP
4300 log_notice("Note that the disk image needs to\n"
4301 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4302 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4303 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4304 " d) or contain a file system without a partition table\n"
4305 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4306 goto finish;
2d845785 4307 }
4526113f 4308 if (r < 0)
842f3b0f 4309 goto finish;
1b9e5b12 4310
4623e8e6
LP
4311 if (!arg_root_hash && dissected_image->can_verity)
4312 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4313
4314 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4315 if (r < 0)
4316 goto finish;
0f3be6ca
LP
4317
4318 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4319 if (remove_image && unlink(arg_image) >= 0)
4320 remove_image = false;
842f3b0f 4321 }
842f3b0f 4322
86c0dd4a 4323 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4324 if (r < 0)
4325 goto finish;
4326
03cfe0d5
LP
4327 interactive =
4328 isatty(STDIN_FILENO) > 0 &&
4329 isatty(STDOUT_FILENO) > 0;
9c857b9d 4330
db7feb7e
LP
4331 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4332 if (master < 0) {
ec16945e 4333 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4334 goto finish;
4335 }
4336
611b312b
LP
4337 r = ptsname_malloc(master, &console);
4338 if (r < 0) {
4339 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4340 goto finish;
68b02049
DW
4341 }
4342
4343 if (arg_selinux_apifs_context) {
4344 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4345 if (r < 0)
4346 goto finish;
a258bf26
LP
4347 }
4348
a258bf26 4349 if (unlockpt(master) < 0) {
ec16945e 4350 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4351 goto finish;
4352 }
4353
9c857b9d
LP
4354 if (!arg_quiet)
4355 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4356 arg_machine, arg_image ?: arg_directory);
4357
72c0a2c2 4358 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4359
66edd963 4360 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4361 r = log_error_errno(errno, "Failed to become subreaper: %m");
4362 goto finish;
4363 }
4364
d87be9b0 4365 for (;;) {
b0067625
ZJS
4366 r = run(master,
4367 console,
2d845785 4368 dissected_image,
b0067625
ZJS
4369 interactive, secondary,
4370 fds,
4371 veth_name, &veth_created,
4372 &exposed,
4373 &pid, &ret);
4374 if (r <= 0)
d87be9b0 4375 break;
d87be9b0 4376 }
88213476
LP
4377
4378finish:
af4ec430 4379 sd_notify(false,
2a49b612
ZJS
4380 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4381 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4382
9444b1f2 4383 if (pid > 0)
c67b0082 4384 (void) kill(pid, SIGKILL);
88213476 4385
503546da 4386 /* Try to flush whatever is still queued in the pty */
6a0f896b 4387 if (master >= 0) {
1c876927 4388 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4389 master = safe_close(master);
4390 }
4391
4392 if (pid > 0)
4393 (void) wait_for_terminate(pid, NULL);
503546da 4394
50ebcf6c
LP
4395 pager_close();
4396
17cbb288 4397 if (remove_directory && arg_directory) {
ec16945e
LP
4398 int k;
4399
17cbb288 4400 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4401 if (k < 0)
17cbb288 4402 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4403 }
4404
0f3be6ca
LP
4405 if (remove_image && arg_image) {
4406 if (unlink(arg_image) < 0)
4407 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4408 }
4409
c67b0082
LP
4410 if (remove_tmprootdir) {
4411 if (rmdir(tmprootdir) < 0)
4412 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4413 }
4414
785890ac
LP
4415 if (arg_machine) {
4416 const char *p;
4417
63c372cb 4418 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4419 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4420 }
4421
7a8f6325 4422 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4423
4424 if (veth_created)
4425 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4426 (void) remove_bridge(arg_network_zone);
f757855e 4427
04d391da 4428 free(arg_directory);
ec16945e
LP
4429 free(arg_template);
4430 free(arg_image);
7027ff61 4431 free(arg_machine);
3a9530e5 4432 free(arg_hostname);
c74e630d 4433 free(arg_user);
b53ede69
PW
4434 free(arg_pivot_root_new);
4435 free(arg_pivot_root_old);
5f932eb9 4436 free(arg_chdir);
c74e630d 4437 strv_free(arg_setenv);
f757855e 4438 free(arg_network_bridge);
c74e630d
LP
4439 strv_free(arg_network_interfaces);
4440 strv_free(arg_network_macvlan);
4bbfe7ad 4441 strv_free(arg_network_ipvlan);
f6d6bad1 4442 strv_free(arg_network_veth_extra);
f757855e
LP
4443 strv_free(arg_parameters);
4444 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4445 expose_port_free_all(arg_expose_ports);
4623e8e6 4446 free(arg_root_hash);
bf428efb 4447 rlimit_free_all(arg_rlimit);
d107bb7d 4448 arg_cpuset = cpu_set_mfree(arg_cpuset);
6d0b55c2 4449
ec16945e 4450 return r < 0 ? EXIT_FAILURE : ret;
88213476 4451}