]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Split out part of mount-util.c into mountpoint-util.c
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476 2
349cc4a5 3#if HAVE_BLKID
6b5cf3ea 4#include <blkid.h>
8fe0087e 5#endif
88213476 6#include <errno.h>
88213476 7#include <getopt.h>
0e7ac751 8#include <grp.h>
1b9e5b12 9#include <linux/loop.h>
0e7ac751 10#include <pwd.h>
8fe0087e 11#include <sched.h>
349cc4a5 12#if HAVE_SELINUX
8fe0087e 13#include <selinux/selinux.h>
1b9e5b12 14#endif
8fe0087e
LP
15#include <signal.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19#include <sys/file.h>
20#include <sys/mount.h>
21#include <sys/personality.h>
22#include <sys/prctl.h>
23#include <sys/types.h>
6916b164 24#include <sys/wait.h>
8fe0087e 25#include <unistd.h>
1b9e5b12 26
b053cd5f 27#include "sd-bus.h"
1f0cd86b 28#include "sd-daemon.h"
1f0cd86b 29#include "sd-id128.h"
8fe0087e 30
b5efdb8a 31#include "alloc-util.h"
8fe0087e
LP
32#include "barrier.h"
33#include "base-filesystem.h"
34#include "blkid-util.h"
35#include "btrfs-util.h"
b8ea7a6e 36#include "bus-error.h"
b053cd5f 37#include "bus-util.h"
8fe0087e 38#include "cap-list.h"
430f0182 39#include "capability-util.h"
04d391da 40#include "cgroup-util.h"
8fe0087e 41#include "copy.h"
d107bb7d 42#include "cpu-set-util.h"
4fc9982c 43#include "dev-setup.h"
2d845785 44#include "dissect-image.h"
8fe0087e 45#include "env-util.h"
3ffd4af2 46#include "fd-util.h"
842f3b0f 47#include "fdset.h"
a5c32cff 48#include "fileio.h"
f97b34a6 49#include "format-util.h"
f4f15635 50#include "fs-util.h"
1b9e5b12 51#include "gpt.h"
4623e8e6 52#include "hexdecoct.h"
8fe0087e 53#include "hostname-util.h"
910fd145 54#include "id128-util.h"
8fe0087e 55#include "log.h"
2d845785 56#include "loop-util.h"
8fe0087e 57#include "loopback-setup.h"
1b9cebf6 58#include "machine-image.h"
8fe0087e
LP
59#include "macro.h"
60#include "missing.h"
61#include "mkdir.h"
4349cd7c 62#include "mount-util.h"
049af8ad 63#include "mountpoint-util.h"
8fe0087e 64#include "netlink-util.h"
07630cea 65#include "nspawn-cgroup.h"
3603efde 66#include "nspawn-def.h"
07630cea
LP
67#include "nspawn-expose-ports.h"
68#include "nspawn-mount.h"
69#include "nspawn-network.h"
7336138e 70#include "nspawn-patch-uid.h"
07630cea 71#include "nspawn-register.h"
910fd145 72#include "nspawn-seccomp.h"
07630cea
LP
73#include "nspawn-settings.h"
74#include "nspawn-setuid.h"
7732f92b 75#include "nspawn-stub-pid1.h"
d58ad743 76#include "os-util.h"
50ebcf6c 77#include "pager.h"
6bedfcbb 78#include "parse-util.h"
8fe0087e 79#include "path-util.h"
294bf0c3 80#include "pretty-print.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
8869a0b4 84#include "raw-clone.h"
bf428efb 85#include "rlimit-util.h"
8fe0087e 86#include "rm-rf.h"
68b02049 87#include "selinux-util.h"
8fe0087e 88#include "signal-util.h"
2583fbea 89#include "socket-util.h"
8fcde012 90#include "stat-util.h"
15a5e950 91#include "stdio-util.h"
5c828e66 92#include "string-table.h"
07630cea 93#include "string-util.h"
8fe0087e
LP
94#include "strv.h"
95#include "terminal-util.h"
affb60b1 96#include "umask-util.h"
b1d4f8e1 97#include "user-util.h"
8fe0087e 98#include "util.h"
e9642be2 99
62b1e758
YW
100#if HAVE_SPLIT_USR
101#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf"
102#else
103#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf"
104#endif
105
9c1e04d0
AP
106/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH.
107 * The path is relative to the container.
108 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
109#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 110
2a49b612
ZJS
111#define EXIT_FORCE_RESTART 133
112
113cea80
DH
/* How a container run ended, as far as nspawn distinguishes it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* the container terminated */
        CONTAINER_REBOOTED,     /* the container rebooted */
} ContainerStatus;
117
88213476 118static char *arg_directory = NULL;
ec16945e 119static char *arg_template = NULL;
5f932eb9 120static char *arg_chdir = NULL;
b53ede69
PW
121static char *arg_pivot_root_new = NULL;
122static char *arg_pivot_root_old = NULL;
687d0825 123static char *arg_user = NULL;
9444b1f2 124static sd_id128_t arg_uuid = {};
3a9530e5
LP
125static char *arg_machine = NULL; /* The name used by the host to refer to this */
126static char *arg_hostname = NULL; /* The name the payload sees by default */
c74e630d
LP
127static const char *arg_selinux_context = NULL;
128static const char *arg_selinux_apifs_context = NULL;
9444b1f2 129static const char *arg_slice = NULL;
ff01d048 130static bool arg_private_network = false;
bc2f673e 131static bool arg_read_only = false;
7732f92b 132static StartMode arg_start_mode = START_PID1;
ec16945e 133static bool arg_ephemeral = false;
57fb9fb5 134static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 135static bool arg_link_journal_try = false;
520e0d54 136static uint64_t arg_caps_retain =
50b52222
LP
137 (1ULL << CAP_AUDIT_CONTROL) |
138 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
139 (1ULL << CAP_CHOWN) |
140 (1ULL << CAP_DAC_OVERRIDE) |
141 (1ULL << CAP_DAC_READ_SEARCH) |
142 (1ULL << CAP_FOWNER) |
143 (1ULL << CAP_FSETID) |
144 (1ULL << CAP_IPC_OWNER) |
145 (1ULL << CAP_KILL) |
146 (1ULL << CAP_LEASE) |
147 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 148 (1ULL << CAP_MKNOD) |
5076f0cc
LP
149 (1ULL << CAP_NET_BIND_SERVICE) |
150 (1ULL << CAP_NET_BROADCAST) |
151 (1ULL << CAP_NET_RAW) |
5076f0cc 152 (1ULL << CAP_SETFCAP) |
50b52222 153 (1ULL << CAP_SETGID) |
5076f0cc
LP
154 (1ULL << CAP_SETPCAP) |
155 (1ULL << CAP_SETUID) |
156 (1ULL << CAP_SYS_ADMIN) |
50b52222 157 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
158 (1ULL << CAP_SYS_CHROOT) |
159 (1ULL << CAP_SYS_NICE) |
160 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 161 (1ULL << CAP_SYS_RESOURCE) |
50b52222 162 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538 163static CustomMount *arg_custom_mounts = NULL;
88614c8a 164static size_t arg_n_custom_mounts = 0;
f4889f65 165static char **arg_setenv = NULL;
284c0b91 166static bool arg_quiet = false;
eb91eb18 167static bool arg_register = true;
89f7c846 168static bool arg_keep_unit = false;
aa28aefe 169static char **arg_network_interfaces = NULL;
c74e630d 170static char **arg_network_macvlan = NULL;
4bbfe7ad 171static char **arg_network_ipvlan = NULL;
69c79d3c 172static bool arg_network_veth = false;
f6d6bad1 173static char **arg_network_veth_extra = NULL;
f757855e 174static char *arg_network_bridge = NULL;
22b28dfd 175static char *arg_network_zone = NULL;
d7bea6b6 176static char *arg_network_namespace_path = NULL;
050f7277 177static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 178static char *arg_image = NULL;
f757855e 179static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 180static ExposePort *arg_expose_ports = NULL;
f36933fe 181static char **arg_property = NULL;
0de7acce 182static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 183static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 184static bool arg_userns_chown = false;
c6c8f6e2 185static int arg_kill_signal = 0;
5da38d07 186static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
187static SettingsMask arg_settings_mask = 0;
188static int arg_settings_trusted = -1;
189static char **arg_parameters = NULL;
6aadfa4c 190static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 191static bool arg_notify_ready = false;
5a8ff0e6 192static bool arg_use_cgns = true;
0c582db0 193static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
1099ceeb 194static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
4623e8e6
LP
195static void *arg_root_hash = NULL;
196static size_t arg_root_hash_size = 0;
960e4569
LP
197static char **arg_syscall_whitelist = NULL;
198static char **arg_syscall_blacklist = NULL;
bf428efb 199static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
66edd963 200static bool arg_no_new_privileges = false;
81f345df
LP
201static int arg_oom_score_adjust = 0;
202static bool arg_oom_score_adjust_set = false;
d107bb7d
LP
203static cpu_set_t *arg_cpuset = NULL;
204static unsigned arg_cpuset_ncpus = 0;
09d423e9 205static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
1688841f 206static TimezoneMode arg_timezone = TIMEZONE_AUTO;
88213476 207
37ec0fdd
LP
208static int help(void) {
209 _cleanup_free_ char *link = NULL;
210 int r;
211
0221d68a 212 (void) pager_open(false);
50ebcf6c 213
37ec0fdd
LP
214 r = terminal_urlify_man("systemd-nspawn", "1", &link);
215 if (r < 0)
216 return log_oom();
217
88213476 218 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
a7e2e50d 219 "Spawn a command or OS in a light-weight container.\n\n"
a8828ed9
DW
220 " -h --help Show this help\n"
221 " --version Print version string\n"
69c79d3c 222 " -q --quiet Do not show status information\n"
1b9e5b12 223 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
224 " --template=PATH Initialize root directory from template directory,\n"
225 " if missing\n"
226 " -x --ephemeral Run container with snapshot of root directory, and\n"
227 " remove it after exit\n"
228 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 229 " --root-hash=HASH Specify verity root hash\n"
7732f92b 230 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 231 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 232 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
233 " --pivot-root=PATH[:PATH]\n"
234 " Pivot root to given directory in the container\n"
a8828ed9 235 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 236 " -M --machine=NAME Set the machine name for the container\n"
3a9530e5 237 " --hostname=NAME Override the hostname for the container\n"
69c79d3c 238 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 239 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 240 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 241 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 242 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 243 " Similar, but with user configured UID/GID range\n"
24597ee0 244 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
245 " --private-network Disable network in container\n"
246 " --network-interface=INTERFACE\n"
247 " Assign an existing network interface to the\n"
248 " container\n"
c74e630d
LP
249 " --network-macvlan=INTERFACE\n"
250 " Create a macvlan network interface based on an\n"
251 " existing network interface to the container\n"
4bbfe7ad
TG
252 " --network-ipvlan=INTERFACE\n"
253 " Create a ipvlan network interface based on an\n"
254 " existing network interface to the container\n"
a8eaaee7 255 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 256 " and container\n"
f6d6bad1
LP
257 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
258 " Add an additional virtual Ethernet link between\n"
259 " host and container\n"
ab046dde 260 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
261 " Add a virtual Ethernet connection to the container\n"
262 " and attach it to an existing bridge on the host\n"
263 " --network-zone=NAME Similar, but attach the new interface to an\n"
264 " an automatically managed bridge interface\n"
d7bea6b6
DP
265 " --network-namespace-path=PATH\n"
266 " Set network namespace to the one represented by\n"
267 " the specified kernel namespace file node\n"
6d0b55c2 268 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 269 " Expose a container IP port on the host\n"
82adf6af
LP
270 " -Z --selinux-context=SECLABEL\n"
271 " Set the SELinux security context to be used by\n"
272 " processes in the container\n"
273 " -L --selinux-apifs-context=SECLABEL\n"
274 " Set the SELinux security context to be used by\n"
275 " API/tmpfs file systems in the container\n"
a8828ed9
DW
276 " --capability=CAP In addition to the default, retain specified\n"
277 " capability\n"
278 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
279 " --system-call-filter=LIST|~LIST\n"
280 " Permit/prohibit specific system calls\n"
bf428efb 281 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
81f345df
LP
282 " --oom-score-adjust=VALUE\n"
283 " Adjust the OOM score value for the payload\n"
d107bb7d 284 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
c6c8f6e2 285 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
286 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
287 " host, try-guest, try-host\n"
574edc90 288 " -j Equivalent to --link-journal=try-guest\n"
09d423e9 289 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
1688841f 290 " --timezone=MODE Select mode of /etc/localtime initialization\n"
69c79d3c 291 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
292 " --bind=PATH[:PATH[:OPTIONS]]\n"
293 " Bind mount a file or directory from the host into\n"
a8828ed9 294 " the container\n"
5e5bfa6e
EY
295 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
296 " Similar, but creates a read-only bind mount\n"
06c17c39 297 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
298 " --overlay=PATH[:PATH...]:PATH\n"
299 " Create an overlay mount from the host to \n"
300 " the container\n"
301 " --overlay-ro=PATH[:PATH...]:PATH\n"
302 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 303 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 304 " --register=BOOLEAN Register container as machine\n"
89f7c846 305 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 306 " the service unit nspawn is running in\n"
6d0b55c2 307 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 308 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 309 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
37ec0fdd
LP
310 "\nSee the %s for details.\n"
311 , program_invocation_short_name
312 , link
313 );
314
315 return 0;
88213476
LP
316}
317
86c0dd4a 318static int custom_mount_check_all(void) {
88614c8a 319 size_t i;
5a8af538 320
5a8af538
LP
321 for (i = 0; i < arg_n_custom_mounts; i++) {
322 CustomMount *m = &arg_custom_mounts[i];
323
0de7acce 324 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
baaa35ad
ZJS
325 if (arg_userns_chown)
326 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
327 "--private-users-chown may not be combined with custom root mounts.");
328 else if (arg_uid_shift == UID_INVALID)
329 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
330 "--private-users with automatic UID shift may not be combined with custom root mounts.");
825d5287 331 }
5a8af538
LP
332 }
333
334 return 0;
335}
336
8199d554 337static int detect_unified_cgroup_hierarchy_from_environment(void) {
efdb0237 338 const char *e;
415fc41c 339 int r;
5da38d07 340
efdb0237
LP
341 /* Allow the user to control whether the unified hierarchy is used */
342 e = getenv("UNIFIED_CGROUP_HIERARCHY");
343 if (e) {
344 r = parse_boolean(e);
345 if (r < 0)
346 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
347 if (r > 0)
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
349 else
350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237
LP
351 }
352
8199d554
LP
353 return 0;
354}
355
356static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
357 int r;
358
359 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the
360 * image actually supports. */
b4cccbc1
LP
361 r = cg_all_unified();
362 if (r < 0)
363 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
364 if (r > 0) {
a8725a06
ZJS
365 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
366 * routine only detects 231, so we'll have a false negative here for 230. */
367 r = systemd_installation_has_version(directory, 230);
368 if (r < 0)
369 return log_error_errno(r, "Failed to determine systemd version in container: %m");
370 if (r > 0)
371 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
372 else
373 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 374 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
375 /* Mixed cgroup hierarchy support was added in 233 */
376 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
377 if (r < 0)
378 return log_error_errno(r, "Failed to determine systemd version in container: %m");
379 if (r > 0)
380 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
381 else
382 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
383 } else
5da38d07 384 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 385
8199d554
LP
386 log_debug("Using %s hierarchy for container.",
387 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
388 arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
389
efdb0237
LP
390 return 0;
391}
392
0c582db0
LB
393static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
394 int r;
395
396 r = getenv_bool(name);
397 if (r == -ENXIO)
398 return;
399 if (r < 0)
400 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
401 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
402}
403
4f086aab 404static void parse_mount_settings_env(void) {
4f086aab 405 const char *e;
1099ceeb
LP
406 int r;
407
408 r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
409 if (r >= 0)
410 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
411 else if (r != -ENXIO)
412 log_warning_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP, ignoring: %m");
4f086aab
SU
413
414 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
415 if (!e)
416 return;
417
418 if (streq(e, "network")) {
419 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
420 return;
421 }
422
423 r = parse_boolean(e);
424 if (r < 0) {
425 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
426 return;
ab8ee0f2 427 }
4f086aab 428
ab8ee0f2
ZJS
429 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
430 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
431}
432
88213476 433static int parse_argv(int argc, char *argv[]) {
a41fe3a2 434 enum {
acbeb427
ZJS
435 ARG_VERSION = 0x100,
436 ARG_PRIVATE_NETWORK,
bc2f673e 437 ARG_UUID,
5076f0cc 438 ARG_READ_ONLY,
57fb9fb5 439 ARG_CAPABILITY,
420c7379 440 ARG_DROP_CAPABILITY,
17fe0523
LP
441 ARG_LINK_JOURNAL,
442 ARG_BIND,
f4889f65 443 ARG_BIND_RO,
06c17c39 444 ARG_TMPFS,
5a8af538
LP
445 ARG_OVERLAY,
446 ARG_OVERLAY_RO,
eb91eb18 447 ARG_SHARE_SYSTEM,
89f7c846 448 ARG_REGISTER,
aa28aefe 449 ARG_KEEP_UNIT,
69c79d3c 450 ARG_NETWORK_INTERFACE,
c74e630d 451 ARG_NETWORK_MACVLAN,
4bbfe7ad 452 ARG_NETWORK_IPVLAN,
ab046dde 453 ARG_NETWORK_BRIDGE,
22b28dfd 454 ARG_NETWORK_ZONE,
f6d6bad1 455 ARG_NETWORK_VETH_EXTRA,
d7bea6b6 456 ARG_NETWORK_NAMESPACE_PATH,
6afc95b7 457 ARG_PERSONALITY,
4d9f07b4 458 ARG_VOLATILE,
ec16945e 459 ARG_TEMPLATE,
f36933fe 460 ARG_PROPERTY,
6dac160c 461 ARG_PRIVATE_USERS,
c6c8f6e2 462 ARG_KILL_SIGNAL,
f757855e 463 ARG_SETTINGS,
5f932eb9 464 ARG_CHDIR,
b53ede69 465 ARG_PIVOT_ROOT,
7336138e 466 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 467 ARG_NOTIFY_READY,
4623e8e6 468 ARG_ROOT_HASH,
960e4569 469 ARG_SYSTEM_CALL_FILTER,
bf428efb 470 ARG_RLIMIT,
3a9530e5 471 ARG_HOSTNAME,
66edd963 472 ARG_NO_NEW_PRIVILEGES,
81f345df 473 ARG_OOM_SCORE_ADJUST,
d107bb7d 474 ARG_CPU_AFFINITY,
09d423e9 475 ARG_RESOLV_CONF,
1688841f 476 ARG_TIMEZONE,
a41fe3a2
LP
477 };
478
88213476 479 static const struct option options[] = {
d7bea6b6
DP
480 { "help", no_argument, NULL, 'h' },
481 { "version", no_argument, NULL, ARG_VERSION },
482 { "directory", required_argument, NULL, 'D' },
483 { "template", required_argument, NULL, ARG_TEMPLATE },
484 { "ephemeral", no_argument, NULL, 'x' },
485 { "user", required_argument, NULL, 'u' },
486 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
487 { "as-pid2", no_argument, NULL, 'a' },
488 { "boot", no_argument, NULL, 'b' },
489 { "uuid", required_argument, NULL, ARG_UUID },
490 { "read-only", no_argument, NULL, ARG_READ_ONLY },
491 { "capability", required_argument, NULL, ARG_CAPABILITY },
492 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
66edd963 493 { "no-new-privileges", required_argument, NULL, ARG_NO_NEW_PRIVILEGES },
d7bea6b6
DP
494 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
495 { "bind", required_argument, NULL, ARG_BIND },
496 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
497 { "tmpfs", required_argument, NULL, ARG_TMPFS },
498 { "overlay", required_argument, NULL, ARG_OVERLAY },
499 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
500 { "machine", required_argument, NULL, 'M' },
3a9530e5 501 { "hostname", required_argument, NULL, ARG_HOSTNAME },
d7bea6b6
DP
502 { "slice", required_argument, NULL, 'S' },
503 { "setenv", required_argument, NULL, 'E' },
504 { "selinux-context", required_argument, NULL, 'Z' },
505 { "selinux-apifs-context", required_argument, NULL, 'L' },
506 { "quiet", no_argument, NULL, 'q' },
507 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
508 { "register", required_argument, NULL, ARG_REGISTER },
509 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
510 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
511 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
512 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
513 { "network-veth", no_argument, NULL, 'n' },
514 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
515 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
516 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
517 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
518 { "personality", required_argument, NULL, ARG_PERSONALITY },
519 { "image", required_argument, NULL, 'i' },
520 { "volatile", optional_argument, NULL, ARG_VOLATILE },
521 { "port", required_argument, NULL, 'p' },
522 { "property", required_argument, NULL, ARG_PROPERTY },
523 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
524 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
525 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
526 { "settings", required_argument, NULL, ARG_SETTINGS },
527 { "chdir", required_argument, NULL, ARG_CHDIR },
528 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
529 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
530 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
531 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
bf428efb 532 { "rlimit", required_argument, NULL, ARG_RLIMIT },
81f345df 533 { "oom-score-adjust", required_argument, NULL, ARG_OOM_SCORE_ADJUST },
d107bb7d 534 { "cpu-affinity", required_argument, NULL, ARG_CPU_AFFINITY },
09d423e9 535 { "resolv-conf", required_argument, NULL, ARG_RESOLV_CONF },
1688841f 536 { "timezone", required_argument, NULL, ARG_TIMEZONE },
eb9da376 537 {}
88213476
LP
538 };
539
9444b1f2 540 int c, r;
6aadfa4c 541 const char *p, *e;
a42c8b54 542 uint64_t plus = 0, minus = 0;
f757855e 543 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
544
545 assert(argc >= 0);
546 assert(argv);
547
2e1f244e 548 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
549 switch (c) {
550
551 case 'h':
37ec0fdd 552 return help();
88213476 553
acbeb427 554 case ARG_VERSION:
3f6fd1ba 555 return version();
acbeb427 556
88213476 557 case 'D':
0f03c2a4 558 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 559 if (r < 0)
0f03c2a4 560 return r;
ec16945e
LP
561 break;
562
563 case ARG_TEMPLATE:
0f03c2a4 564 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 565 if (r < 0)
0f03c2a4 566 return r;
88213476
LP
567 break;
568
1b9e5b12 569 case 'i':
0f03c2a4 570 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 571 if (r < 0)
0f03c2a4 572 return r;
ec16945e
LP
573 break;
574
575 case 'x':
576 arg_ephemeral = true;
a2f577fc 577 arg_settings_mask |= SETTING_EPHEMERAL;
1b9e5b12
LP
578 break;
579
687d0825 580 case 'u':
2fc09a9c
DM
581 r = free_and_strdup(&arg_user, optarg);
582 if (r < 0)
7027ff61 583 return log_oom();
687d0825 584
f757855e 585 arg_settings_mask |= SETTING_USER;
687d0825
MV
586 break;
587
22b28dfd
LP
588 case ARG_NETWORK_ZONE: {
589 char *j;
590
591 j = strappend("vz-", optarg);
592 if (!j)
593 return log_oom();
594
595 if (!ifname_valid(j)) {
596 log_error("Network zone name not valid: %s", j);
597 free(j);
598 return -EINVAL;
599 }
600
df1fac6d 601 free_and_replace(arg_network_zone, j);
22b28dfd
LP
602
603 arg_network_veth = true;
604 arg_private_network = true;
605 arg_settings_mask |= SETTING_NETWORK;
606 break;
607 }
608
ab046dde 609 case ARG_NETWORK_BRIDGE:
ef76dff2 610
baaa35ad
ZJS
611 if (!ifname_valid(optarg))
612 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
613 "Bridge interface name not valid: %s", optarg);
ef76dff2 614
f757855e
LP
615 r = free_and_strdup(&arg_network_bridge, optarg);
616 if (r < 0)
617 return log_oom();
ab046dde 618
4831981d 619 _fallthrough_;
0dfaa006 620 case 'n':
69c79d3c
LP
621 arg_network_veth = true;
622 arg_private_network = true;
f757855e 623 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
624 break;
625
f6d6bad1
LP
626 case ARG_NETWORK_VETH_EXTRA:
627 r = veth_extra_parse(&arg_network_veth_extra, optarg);
628 if (r < 0)
629 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
630
631 arg_private_network = true;
632 arg_settings_mask |= SETTING_NETWORK;
633 break;
634
aa28aefe 635 case ARG_NETWORK_INTERFACE:
baaa35ad
ZJS
636 if (!ifname_valid(optarg))
637 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
638 "Network interface name not valid: %s", optarg);
ef76dff2 639
c74e630d
LP
640 if (strv_extend(&arg_network_interfaces, optarg) < 0)
641 return log_oom();
642
643 arg_private_network = true;
f757855e 644 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
645 break;
646
647 case ARG_NETWORK_MACVLAN:
ef76dff2 648
baaa35ad
ZJS
649 if (!ifname_valid(optarg))
650 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
651 "MACVLAN network interface name not valid: %s", optarg);
ef76dff2 652
c74e630d 653 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
654 return log_oom();
655
4bbfe7ad 656 arg_private_network = true;
f757855e 657 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
658 break;
659
660 case ARG_NETWORK_IPVLAN:
ef76dff2 661
baaa35ad
ZJS
662 if (!ifname_valid(optarg))
663 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
664 "IPVLAN network interface name not valid: %s", optarg);
ef76dff2 665
4bbfe7ad
TG
666 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
667 return log_oom();
668
4831981d 669 _fallthrough_;
ff01d048
LP
670 case ARG_PRIVATE_NETWORK:
671 arg_private_network = true;
f757855e 672 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
673 break;
674
d7bea6b6
DP
675 case ARG_NETWORK_NAMESPACE_PATH:
676 r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path);
677 if (r < 0)
678 return r;
679
680 break;
681
0f0dbc46 682 case 'b':
baaa35ad
ZJS
683 if (arg_start_mode == START_PID2)
684 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
685 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
686
687 arg_start_mode = START_BOOT;
688 arg_settings_mask |= SETTING_START_MODE;
689 break;
690
691 case 'a':
baaa35ad
ZJS
692 if (arg_start_mode == START_BOOT)
693 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
694 "--boot and --as-pid2 may not be combined.");
7732f92b
LP
695
696 arg_start_mode = START_PID2;
697 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
698 break;
699
144f0fc0 700 case ARG_UUID:
9444b1f2 701 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
702 if (r < 0)
703 return log_error_errno(r, "Invalid UUID: %s", optarg);
704
baaa35ad
ZJS
705 if (sd_id128_is_null(arg_uuid))
706 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
707 "Machine UUID may not be all zeroes.");
f757855e
LP
708
709 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 710 break;
aa96c6cb 711
9444b1f2 712 case 'S':
c74e630d 713 arg_slice = optarg;
144f0fc0
LP
714 break;
715
7027ff61 716 case 'M':
c1521918 717 if (isempty(optarg))
97b11eed 718 arg_machine = mfree(arg_machine);
c1521918 719 else {
baaa35ad
ZJS
720 if (!machine_name_is_valid(optarg))
721 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
722 "Invalid machine name: %s", optarg);
7027ff61 723
0c3c4284
LP
724 r = free_and_strdup(&arg_machine, optarg);
725 if (r < 0)
eb91eb18 726 return log_oom();
eb91eb18 727 }
9ce6d1b3 728 break;
7027ff61 729
3a9530e5
LP
730 case ARG_HOSTNAME:
731 if (isempty(optarg))
732 arg_hostname = mfree(arg_hostname);
733 else {
baaa35ad
ZJS
734 if (!hostname_is_valid(optarg, false))
735 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
736 "Invalid hostname: %s", optarg);
3a9530e5
LP
737
738 r = free_and_strdup(&arg_hostname, optarg);
739 if (r < 0)
740 return log_oom();
741 }
742
743 arg_settings_mask |= SETTING_HOSTNAME;
744 break;
745
82adf6af
LP
746 case 'Z':
747 arg_selinux_context = optarg;
a8828ed9
DW
748 break;
749
82adf6af
LP
750 case 'L':
751 arg_selinux_apifs_context = optarg;
a8828ed9
DW
752 break;
753
bc2f673e
LP
754 case ARG_READ_ONLY:
755 arg_read_only = true;
f757855e 756 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
757 break;
758
420c7379
LP
759 case ARG_CAPABILITY:
760 case ARG_DROP_CAPABILITY: {
6cbe4ed1 761 p = optarg;
9ed794a3 762 for (;;) {
6cbe4ed1 763 _cleanup_free_ char *t = NULL;
5076f0cc 764
6cbe4ed1
SS
765 r = extract_first_word(&p, &t, ",", 0);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 768
6cbe4ed1
SS
769 if (r == 0)
770 break;
5076f0cc 771
39ed67d1
LP
772 if (streq(t, "all")) {
773 if (c == ARG_CAPABILITY)
a42c8b54 774 plus = (uint64_t) -1;
39ed67d1 775 else
a42c8b54 776 minus = (uint64_t) -1;
39ed67d1 777 } else {
acf4d158
YW
778 r = capability_from_name(t);
779 if (r < 0)
780 return log_error_errno(r, "Failed to parse capability %s.", t);
39ed67d1
LP
781
782 if (c == ARG_CAPABILITY)
acf4d158 783 plus |= 1ULL << r;
39ed67d1 784 else
acf4d158 785 minus |= 1ULL << r;
5076f0cc 786 }
5076f0cc
LP
787 }
788
f757855e 789 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
790 break;
791 }
792
66edd963
LP
793 case ARG_NO_NEW_PRIVILEGES:
794 r = parse_boolean(optarg);
795 if (r < 0)
796 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
797
798 arg_no_new_privileges = r;
799 arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
800 break;
801
57fb9fb5
LP
802 case 'j':
803 arg_link_journal = LINK_GUEST;
574edc90 804 arg_link_journal_try = true;
4e1d6aa9 805 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
806 break;
807
808 case ARG_LINK_JOURNAL:
4e1d6aa9
LP
809 r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
810 if (r < 0) {
811 log_error_errno(r, "Failed to parse link journal mode %s", optarg);
57fb9fb5
LP
812 return -EINVAL;
813 }
814
4e1d6aa9 815 arg_settings_mask |= SETTING_LINK_JOURNAL;
57fb9fb5
LP
816 break;
817
17fe0523 818 case ARG_BIND:
f757855e
LP
819 case ARG_BIND_RO:
820 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
821 if (r < 0)
822 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 823
f757855e 824 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 825 break;
06c17c39 826
f757855e
LP
827 case ARG_TMPFS:
828 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
829 if (r < 0)
830 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 831
f757855e 832 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 833 break;
5a8af538
LP
834
835 case ARG_OVERLAY:
ad85779a
LP
836 case ARG_OVERLAY_RO:
837 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
838 if (r == -EADDRNOTAVAIL)
839 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
840 if (r < 0)
841 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 842
f757855e 843 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 844 break;
06c17c39 845
a5f1cb3b 846 case 'E': {
f4889f65
LP
847 char **n;
848
baaa35ad
ZJS
849 if (!env_assignment_is_valid(optarg))
850 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
851 "Environment variable assignment '%s' is not valid.", optarg);
f4889f65
LP
852
853 n = strv_env_set(arg_setenv, optarg);
854 if (!n)
855 return log_oom();
856
130d3d22 857 strv_free_and_replace(arg_setenv, n);
f757855e 858 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
859 break;
860 }
861
284c0b91
LP
862 case 'q':
863 arg_quiet = true;
864 break;
865
8a96d94e 866 case ARG_SHARE_SYSTEM:
a6b5216c 867 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0 868 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
63d1c29f 869 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
0c582db0 870 arg_clone_ns_flags = 0;
8a96d94e
LP
871 break;
872
eb91eb18
LP
873 case ARG_REGISTER:
874 r = parse_boolean(optarg);
875 if (r < 0) {
876 log_error("Failed to parse --register= argument: %s", optarg);
877 return r;
878 }
879
880 arg_register = r;
881 break;
882
89f7c846
LP
883 case ARG_KEEP_UNIT:
884 arg_keep_unit = true;
885 break;
886
6afc95b7
LP
887 case ARG_PERSONALITY:
888
ac45f971 889 arg_personality = personality_from_string(optarg);
baaa35ad
ZJS
890 if (arg_personality == PERSONALITY_INVALID)
891 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
892 "Unknown or unsupported personality '%s'.", optarg);
6afc95b7 893
f757855e 894 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
895 break;
896
4d9f07b4
LP
897 case ARG_VOLATILE:
898
899 if (!optarg)
f757855e 900 arg_volatile_mode = VOLATILE_YES;
5c828e66
LP
901 else if (streq(optarg, "help")) {
902 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
903 return 0;
904 } else {
f757855e 905 VolatileMode m;
4d9f07b4 906
f757855e 907 m = volatile_mode_from_string(optarg);
baaa35ad
ZJS
908 if (m < 0)
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
910 "Failed to parse --volatile= argument: %s", optarg);
911 else
f757855e 912 arg_volatile_mode = m;
6d0b55c2
LP
913 }
914
f757855e
LP
915 arg_settings_mask |= SETTING_VOLATILE_MODE;
916 break;
6d0b55c2 917
f757855e
LP
918 case 'p':
919 r = expose_port_parse(&arg_expose_ports, optarg);
920 if (r == -EEXIST)
921 return log_error_errno(r, "Duplicate port specification: %s", optarg);
922 if (r < 0)
923 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 924
f757855e 925 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 926 break;
6d0b55c2 927
f36933fe
LP
928 case ARG_PROPERTY:
929 if (strv_extend(&arg_property, optarg) < 0)
930 return log_oom();
931
932 break;
933
ae209204
ZJS
934 case ARG_PRIVATE_USERS: {
935 int boolean = -1;
0de7acce 936
ae209204
ZJS
937 if (!optarg)
938 boolean = true;
939 else if (!in_charset(optarg, DIGITS))
940 /* do *not* parse numbers as booleans */
941 boolean = parse_boolean(optarg);
942
943 if (boolean == false) {
0de7acce
LP
944 /* no: User namespacing off */
945 arg_userns_mode = USER_NAMESPACE_NO;
946 arg_uid_shift = UID_INVALID;
947 arg_uid_range = UINT32_C(0x10000);
ae209204 948 } else if (boolean == true) {
0de7acce
LP
949 /* yes: User namespacing on, UID range is read from root dir */
950 arg_userns_mode = USER_NAMESPACE_FIXED;
951 arg_uid_shift = UID_INVALID;
952 arg_uid_range = UINT32_C(0x10000);
953 } else if (streq(optarg, "pick")) {
954 /* pick: User namespacing on, UID range is picked randomly */
955 arg_userns_mode = USER_NAMESPACE_PICK;
956 arg_uid_shift = UID_INVALID;
957 arg_uid_range = UINT32_C(0x10000);
958 } else {
6c2058b3 959 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
960 const char *range, *shift;
961
0de7acce
LP
962 /* anything else: User namespacing on, UID range is explicitly configured */
963
6dac160c
LP
964 range = strchr(optarg, ':');
965 if (range) {
6c2058b3
ZJS
966 buffer = strndup(optarg, range - optarg);
967 if (!buffer)
968 return log_oom();
969 shift = buffer;
6dac160c
LP
970
971 range++;
bfd292ec
ZJS
972 r = safe_atou32(range, &arg_uid_range);
973 if (r < 0)
be715731 974 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
975 } else
976 shift = optarg;
977
be715731
ZJS
978 r = parse_uid(shift, &arg_uid_shift);
979 if (r < 0)
980 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
981
982 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
983 }
984
baaa35ad
ZJS
985 if (arg_uid_range <= 0)
986 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
987 "UID range cannot be 0.");
be715731 988
0de7acce 989 arg_settings_mask |= SETTING_USERNS;
6dac160c 990 break;
ae209204 991 }
6dac160c 992
0de7acce 993 case 'U':
ccabee0d
LP
994 if (userns_supported()) {
995 arg_userns_mode = USER_NAMESPACE_PICK;
996 arg_uid_shift = UID_INVALID;
997 arg_uid_range = UINT32_C(0x10000);
998
999 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1000 }
1001
7336138e
LP
1002 break;
1003
0de7acce 1004 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1005 arg_userns_chown = true;
0de7acce
LP
1006
1007 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1008 break;
1009
c6c8f6e2 1010 case ARG_KILL_SIGNAL:
5c828e66
LP
1011 if (streq(optarg, "help")) {
1012 DUMP_STRING_TABLE(signal, int, _NSIG);
1013 return 0;
1014 }
1015
29a3db75 1016 arg_kill_signal = signal_from_string(optarg);
baaa35ad
ZJS
1017 if (arg_kill_signal < 0)
1018 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019 "Cannot parse signal: %s", optarg);
c6c8f6e2 1020
f757855e
LP
1021 arg_settings_mask |= SETTING_KILL_SIGNAL;
1022 break;
1023
1024 case ARG_SETTINGS:
1025
1026 /* no → do not read files
1027 * yes → read files, do not override cmdline, trust only subset
1028 * override → read files, override cmdline, trust only subset
1029 * trusted → read files, do not override cmdline, trust all
1030 */
1031
1032 r = parse_boolean(optarg);
1033 if (r < 0) {
1034 if (streq(optarg, "trusted")) {
1035 mask_all_settings = false;
1036 mask_no_settings = false;
1037 arg_settings_trusted = true;
1038
1039 } else if (streq(optarg, "override")) {
1040 mask_all_settings = false;
1041 mask_no_settings = true;
1042 arg_settings_trusted = -1;
1043 } else
1044 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1045 } else if (r > 0) {
1046 /* yes */
1047 mask_all_settings = false;
1048 mask_no_settings = false;
1049 arg_settings_trusted = -1;
1050 } else {
1051 /* no */
1052 mask_all_settings = true;
1053 mask_no_settings = false;
1054 arg_settings_trusted = false;
1055 }
1056
c6c8f6e2
LP
1057 break;
1058
5f932eb9 1059 case ARG_CHDIR:
baaa35ad
ZJS
1060 if (!path_is_absolute(optarg))
1061 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1062 "Working directory %s is not an absolute path.", optarg);
5f932eb9
LP
1063
1064 r = free_and_strdup(&arg_chdir, optarg);
1065 if (r < 0)
1066 return log_oom();
1067
1068 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1069 break;
1070
b53ede69
PW
1071 case ARG_PIVOT_ROOT:
1072 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1073 if (r < 0)
1074 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1075
1076 arg_settings_mask |= SETTING_PIVOT_ROOT;
1077 break;
1078
9c1e04d0
AP
1079 case ARG_NOTIFY_READY:
1080 r = parse_boolean(optarg);
baaa35ad
ZJS
1081 if (r < 0)
1082 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1083 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
9c1e04d0
AP
1084 arg_notify_ready = r;
1085 arg_settings_mask |= SETTING_NOTIFY_READY;
1086 break;
1087
4623e8e6
LP
1088 case ARG_ROOT_HASH: {
1089 void *k;
1090 size_t l;
1091
1092 r = unhexmem(optarg, strlen(optarg), &k, &l);
1093 if (r < 0)
1094 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1095 if (l < sizeof(sd_id128_t)) {
1096 log_error("Root hash must be at least 128bit long: %s", optarg);
1097 free(k);
1098 return -EINVAL;
1099 }
1100
1101 free(arg_root_hash);
1102 arg_root_hash = k;
1103 arg_root_hash_size = l;
1104 break;
1105 }
1106
960e4569
LP
1107 case ARG_SYSTEM_CALL_FILTER: {
1108 bool negative;
1109 const char *items;
1110
1111 negative = optarg[0] == '~';
1112 items = negative ? optarg + 1 : optarg;
1113
1114 for (;;) {
1115 _cleanup_free_ char *word = NULL;
1116
1117 r = extract_first_word(&items, &word, NULL, 0);
1118 if (r == 0)
1119 break;
1120 if (r == -ENOMEM)
1121 return log_oom();
1122 if (r < 0)
1123 return log_error_errno(r, "Failed to parse system call filter: %m");
1124
1125 if (negative)
1126 r = strv_extend(&arg_syscall_blacklist, word);
1127 else
1128 r = strv_extend(&arg_syscall_whitelist, word);
1129 if (r < 0)
1130 return log_oom();
1131 }
1132
1133 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1134 break;
1135 }
1136
bf428efb
LP
1137 case ARG_RLIMIT: {
1138 const char *eq;
1139 char *name;
1140 int rl;
1141
5c828e66
LP
1142 if (streq(optarg, "help")) {
1143 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1144 return 0;
1145 }
1146
bf428efb 1147 eq = strchr(optarg, '=');
baaa35ad
ZJS
1148 if (!eq)
1149 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1150 "--rlimit= expects an '=' assignment.");
bf428efb
LP
1151
1152 name = strndup(optarg, eq - optarg);
1153 if (!name)
1154 return log_oom();
1155
1156 rl = rlimit_from_string_harder(name);
baaa35ad
ZJS
1157 if (rl < 0)
1158 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1159 "Unknown resource limit: %s", name);
bf428efb
LP
1160
1161 if (!arg_rlimit[rl]) {
1162 arg_rlimit[rl] = new0(struct rlimit, 1);
1163 if (!arg_rlimit[rl])
1164 return log_oom();
1165 }
1166
1167 r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1168 if (r < 0)
1169 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1170
1171 arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1172 break;
1173 }
1174
81f345df
LP
1175 case ARG_OOM_SCORE_ADJUST:
1176 r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1177 if (r < 0)
1178 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1179
1180 arg_oom_score_adjust_set = true;
1181 arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1182 break;
1183
d107bb7d
LP
1184 case ARG_CPU_AFFINITY: {
1185 _cleanup_cpu_free_ cpu_set_t *cpuset = NULL;
1186
1187 r = parse_cpu_set(optarg, &cpuset);
1188 if (r < 0)
1189 return log_error_errno(r, "Failed to parse CPU affinity mask: %s", optarg);
1190
1191 if (arg_cpuset)
1192 CPU_FREE(arg_cpuset);
1193
1194 arg_cpuset = TAKE_PTR(cpuset);
1195 arg_cpuset_ncpus = r;
1196 arg_settings_mask |= SETTING_CPU_AFFINITY;
1197 break;
1198 }
1199
09d423e9
LP
1200 case ARG_RESOLV_CONF:
1201 if (streq(optarg, "help")) {
1202 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1203 return 0;
1204 }
1205
1206 arg_resolv_conf = resolv_conf_mode_from_string(optarg);
baaa35ad
ZJS
1207 if (arg_resolv_conf < 0)
1208 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1209 "Failed to parse /etc/resolv.conf mode: %s", optarg);
09d423e9
LP
1210
1211 arg_settings_mask |= SETTING_RESOLV_CONF;
1212 break;
1213
1688841f
LP
1214 case ARG_TIMEZONE:
1215 if (streq(optarg, "help")) {
1216 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1217 return 0;
1218 }
1219
1220 arg_timezone = timezone_mode_from_string(optarg);
baaa35ad
ZJS
1221 if (arg_timezone < 0)
1222 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1223 "Failed to parse /etc/localtime mode: %s", optarg);
1688841f
LP
1224
1225 arg_settings_mask |= SETTING_TIMEZONE;
1226 break;
1227
88213476
LP
1228 case '?':
1229 return -EINVAL;
1230
1231 default:
eb9da376 1232 assert_not_reached("Unhandled option");
88213476 1233 }
88213476 1234
d7bea6b6
DP
1235 /* If --network-namespace-path is given with any other network-related option,
1236 * we need to error out, to avoid conflicts between different network options. */
1237 if (arg_network_namespace_path &&
1238 (arg_network_interfaces || arg_network_macvlan ||
1239 arg_network_ipvlan || arg_network_veth_extra ||
1240 arg_network_bridge || arg_network_zone ||
baaa35ad
ZJS
1241 arg_network_veth || arg_private_network))
1242 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1243 "--network-namespace-path cannot be combined with other network options.");
d7bea6b6 1244
0c582db0
LB
1245 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1246 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1247 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1248 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1249
4f086aab
SU
1250 if (arg_userns_mode != USER_NAMESPACE_NO)
1251 arg_mount_settings |= MOUNT_USE_USERNS;
1252
1253 if (arg_private_network)
1254 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1255
1256 parse_mount_settings_env();
1257
48a8d337
LB
1258 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1259 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1260 arg_register = false;
baaa35ad
ZJS
1261 if (arg_start_mode != START_PID1)
1262 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1263 "--boot cannot be used without namespacing.");
0c582db0 1264 }
eb91eb18 1265
0de7acce 1266 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1267 arg_userns_chown = true;
1268
baaa35ad 1269 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
8d9c2bca
AJ
1270 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1271 * The latter is not technically a user session, but we don't need to labour the point. */
baaa35ad
ZJS
1272 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1273 "--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846 1274
baaa35ad
ZJS
1275 if (arg_directory && arg_image)
1276 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1277 "--directory= and --image= may not be combined.");
1b9e5b12 1278
baaa35ad
ZJS
1279 if (arg_template && arg_image)
1280 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1281 "--template= and --image= may not be combined.");
ec16945e 1282
8cd328d8
LP
1283 if (arg_ephemeral && arg_template && !arg_directory) {
1284 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1285 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1286 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1287 * --directory=". */
1288
ae2a15bc 1289 arg_directory = TAKE_PTR(arg_template);
8cd328d8
LP
1290 }
1291
baaa35ad
ZJS
1292 if (arg_template && !(arg_directory || arg_machine))
1293 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1294 "--template= needs --directory= or --machine=.");
ec16945e 1295
baaa35ad
ZJS
1296 if (arg_ephemeral && arg_template)
1297 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1298 "--ephemeral and --template= may not be combined.");
ec16945e 1299
baaa35ad
ZJS
1300 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1301 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1302 "--ephemeral and --link-journal= may not be combined.");
df9a75e4 1303
baaa35ad
ZJS
1304 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1305 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
1306 "--private-users= is not supported, kernel compiled without user namespace support.");
7336138e 1307
baaa35ad
ZJS
1308 if (arg_userns_chown && arg_read_only)
1309 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1310 "--read-only and --private-users-chown may not be combined.");
f757855e 1311
baaa35ad
ZJS
1312 if (arg_network_bridge && arg_network_zone)
1313 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1314 "--network-bridge= and --network-zone= may not be combined.");
22b28dfd 1315
f757855e
LP
1316 if (argc > optind) {
1317 arg_parameters = strv_copy(argv + optind);
1318 if (!arg_parameters)
1319 return log_oom();
1320
7732f92b 1321 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1322 }
1323
1324 /* Load all settings from .nspawn files */
1325 if (mask_no_settings)
1326 arg_settings_mask = 0;
1327
1328 /* Don't load any settings from .nspawn files */
1329 if (mask_all_settings)
1330 arg_settings_mask = _SETTINGS_MASK_ALL;
1331
520e0d54 1332 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1333
399e391f
ZJS
1334 r = cg_unified_flush();
1335 if (r < 0)
1336 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1337
6aadfa4c
ILG
1338 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1339 if (e)
1340 arg_container_service_name = e;
1341
5a8ff0e6
CB
1342 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1343 if (r < 0)
1344 arg_use_cgns = cg_ns_supported();
1345 else
1346 arg_use_cgns = r;
1347
86c0dd4a
LP
1348 r = custom_mount_check_all();
1349 if (r < 0)
1350 return r;
1351
f757855e
LP
1352 return 1;
1353}
1354
1355static int verify_arguments(void) {
baaa35ad
ZJS
1356 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1357 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1358 "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
4f086aab 1359
baaa35ad
ZJS
1360 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1361 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1362 "Cannot combine --private-users with read-write mounts.");
f757855e 1363
baaa35ad
ZJS
1364 if (arg_volatile_mode != VOLATILE_NO && arg_read_only)
1365 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1366 "Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
4d9f07b4 1367
baaa35ad
ZJS
1368 if (arg_expose_ports && !arg_private_network)
1369 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1370 "Cannot use --port= without private networking.");
6d0b55c2 1371
349cc4a5 1372#if ! HAVE_LIBIPTC
baaa35ad
ZJS
1373 if (arg_expose_ports)
1374 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
1375 "--port= is not supported, compiled without libiptc support.");
1c1ea217
EV
1376#endif
1377
7732f92b 1378 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1379 arg_kill_signal = SIGRTMIN+3;
1380
f757855e 1381 return 0;
88213476
LP
1382}
1383
03cfe0d5
LP
1384static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1385 assert(p);
1386
0de7acce 1387 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1388 return 0;
1389
1390 if (uid == UID_INVALID && gid == GID_INVALID)
1391 return 0;
1392
1393 if (uid != UID_INVALID) {
1394 uid += arg_uid_shift;
1395
1396 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1397 return -EOVERFLOW;
1398 }
1399
1400 if (gid != GID_INVALID) {
1401 gid += (gid_t) arg_uid_shift;
1402
1403 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1404 return -EOVERFLOW;
1405 }
1406
1407 if (lchown(p, uid, gid) < 0)
1408 return -errno;
b12afc8c
LP
1409
1410 return 0;
1411}
1412
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and then hands it to userns_lchown() with
 * the given uid/gid. If the directory already exists this is treated as success and ownership is left
 * untouched. Any other mkdir failure is returned as-is. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = mkdir_errno_wrapper(full, mode);
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
1426
/* Matches 'path' against the two accepted zoneinfo directory prefixes (relative and absolute form) and
 * returns whatever PATH_STARTSWITH_SET() yields for it. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1433
e58a1277 1434static int setup_timezone(const char *dest) {
1688841f
LP
1435 _cleanup_free_ char *p = NULL, *etc = NULL;
1436 const char *where, *check;
1437 TimezoneMode m;
d4036145 1438 int r;
f8440af5 1439
e58a1277
LP
1440 assert(dest);
1441
1688841f 1442 if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
1688841f
LP
1443 r = readlink_malloc("/etc/localtime", &p);
1444 if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
1445 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE;
1446 else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
1447 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY;
1448 else if (r < 0) {
1449 log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1450 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1451 * file.
1452 *
1453 * Example:
1454 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1455 */
1456 return 0;
1457 } else if (arg_timezone == TIMEZONE_AUTO)
1458 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK;
1459 else
1460 m = arg_timezone;
1461 } else
1462 m = arg_timezone;
1463
1464 if (m == TIMEZONE_OFF)
1465 return 0;
1466
1467 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
d4036145 1468 if (r < 0) {
1688841f 1469 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
d4036145
LP
1470 return 0;
1471 }
1472
1688841f
LP
1473 where = strjoina(etc, "/localtime");
1474
1475 switch (m) {
1476
1477 case TIMEZONE_DELETE:
1478 if (unlink(where) < 0)
1479 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1480
d4036145 1481 return 0;
d4036145 1482
1688841f
LP
1483 case TIMEZONE_SYMLINK: {
1484 _cleanup_free_ char *q = NULL;
1485 const char *z, *what;
4d1c38b8 1486
1688841f
LP
1487 z = timezone_from_path(p);
1488 if (!z) {
1489 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
d4036145 1490 return 0;
1688841f 1491 }
d4036145 1492
1688841f
LP
1493 r = readlink_malloc(where, &q);
1494 if (r >= 0 && streq_ptr(timezone_from_path(q), z))
1495 return 0; /* Already pointing to the right place? Then do nothing .. */
1496
1497 check = strjoina(dest, "/usr/share/zoneinfo/", z);
1498 r = chase_symlinks(check, dest, 0, NULL);
1499 if (r < 0)
1500 log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
1501 else {
1502 if (unlink(where) < 0 && errno != ENOENT) {
1503 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1504 errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
1505 return 0;
1506 }
1507
1508 what = strjoina("../usr/share/zoneinfo/", z);
1509 if (symlink(what, where) < 0) {
1510 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1511 errno, "Failed to correct timezone of container, ignoring: %m");
1512 return 0;
1513 }
1514
1515 break;
1516 }
1517
1518 _fallthrough_;
d4036145 1519 }
68fb0892 1520
1688841f
LP
1521 case TIMEZONE_BIND: {
1522 _cleanup_free_ char *resolved = NULL;
1523 int found;
1524
1525 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1526 if (found < 0) {
1527 log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
1528 return 0;
1529 }
1530
1531 if (found == 0) /* missing? */
1532 (void) touch(resolved);
1533
1534 r = mount_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
1535 if (r >= 0)
1536 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1537
1538 _fallthrough_;
79d80fc1 1539 }
4d9f07b4 1540
1688841f
LP
1541 case TIMEZONE_COPY:
1542 /* If mounting failed, try to copy */
1543 r = copy_file_atomic("/etc/localtime", where, 0644, 0, COPY_REFLINK|COPY_REPLACE);
1544 if (r < 0) {
1545 log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
1546 "Failed to copy /etc/localtime to %s, ignoring: %m", where);
1547 return 0;
1548 }
1549
1550 break;
1551
1552 default:
1553 assert_not_reached("unexpected mode");
d4036145 1554 }
e58a1277 1555
1688841f 1556 /* Fix permissions of the symlink or file copy we just created */
03cfe0d5
LP
1557 r = userns_lchown(where, 0, 0);
1558 if (r < 0)
1688841f 1559 log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");
03cfe0d5 1560
e58a1277 1561 return 0;
88213476
LP
1562}
1563
09d423e9
LP
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and
 * the result of log_debug_errno() for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
1576
7357272e 1577static int resolved_listening(void) {
b8ea7a6e 1578 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
b053cd5f 1579 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1580 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1581 int r;
1582
7357272e 1583 /* Check if resolved is listening */
b053cd5f
LP
1584
1585 r = sd_bus_open_system(&bus);
1586 if (r < 0)
b8ea7a6e 1587 return log_debug_errno(r, "Failed to open system bus: %m");
b053cd5f 1588
7357272e 1589 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
b8ea7a6e
LP
1590 if (r < 0)
1591 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
1592 if (r == 0)
1593 return 0;
7357272e
DM
1594
1595 r = sd_bus_get_property_string(bus,
1596 "org.freedesktop.resolve1",
1597 "/org/freedesktop/resolve1",
1598 "org.freedesktop.resolve1.Manager",
1599 "DNSStubListener",
b8ea7a6e 1600 &error,
7357272e
DM
1601 &dns_stub_listener_mode);
1602 if (r < 0)
b8ea7a6e 1603 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
7357272e
DM
1604
1605 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1606}
1607
2547bb41 1608static int setup_resolv_conf(const char *dest) {
09d423e9
LP
1609 _cleanup_free_ char *etc = NULL;
1610 const char *where, *what;
1611 ResolvConfMode m;
1612 int r;
2547bb41
LP
1613
1614 assert(dest);
1615
09d423e9
LP
1616 if (arg_resolv_conf == RESOLV_CONF_AUTO) {
1617 if (arg_private_network)
1618 m = RESOLV_CONF_OFF;
1619 else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
27b620b7 1620 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC;
09d423e9
LP
1621 else if (have_resolv_conf("/etc/resolv.conf") > 0)
1622 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
1623 else
1624 m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
1625 } else
1626 m = arg_resolv_conf;
1627
1628 if (m == RESOLV_CONF_OFF)
2547bb41
LP
1629 return 0;
1630
87447ae4
LP
1631 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1632 if (r < 0) {
1633 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1634 return 0;
1635 }
1636
1637 where = strjoina(etc, "/resolv.conf");
09d423e9
LP
1638
1639 if (m == RESOLV_CONF_DELETE) {
1640 if (unlink(where) < 0)
1641 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
1642
87447ae4
LP
1643 return 0;
1644 }
79d80fc1 1645
09d423e9
LP
1646 if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_COPY_STATIC))
1647 what = STATIC_RESOLV_CONF;
1648 else
1649 what = "/etc/resolv.conf";
87447ae4 1650
09d423e9
LP
1651 if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC)) {
1652 _cleanup_free_ char *resolved = NULL;
1653 int found;
1654
1655 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1656 if (found < 0) {
1657 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1658 return 0;
1659 }
3539724c 1660
87447ae4
LP
1661 if (found == 0) /* missing? */
1662 (void) touch(resolved);
5367354d 1663
09d423e9 1664 r = mount_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
60e76d48 1665 if (r >= 0)
87447ae4 1666 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1667 }
1668
1669 /* If that didn't work, let's copy the file */
09d423e9 1670 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1671 if (r < 0) {
3539724c
LP
1672 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1673 * resolved or something similar runs inside and the symlink points there.
68a313c5 1674 *
3539724c 1675 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1676 */
09d423e9 1677 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC) && IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1678 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1679 return 0;
1680 }
2547bb41 1681
03cfe0d5
LP
1682 r = userns_lchown(where, 0, 0);
1683 if (r < 0)
3539724c 1684 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1685
2547bb41
LP
1686 return 0;
1687}
1688
1e4f1671 1689static int setup_boot_id(void) {
cdde6ba6
LP
1690 _cleanup_(unlink_and_freep) char *from = NULL;
1691 _cleanup_free_ char *path = NULL;
3bbaff3e 1692 sd_id128_t rnd = SD_ID128_NULL;
cdde6ba6 1693 const char *to;
04bc4a3f
LP
1694 int r;
1695
04bc4a3f
LP
1696 /* Generate a new randomized boot ID, so that each boot-up of
1697 * the container gets a new one */
1698
cdde6ba6
LP
1699 r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
1700 if (r < 0)
1701 return log_error_errno(r, "Failed to generate random boot ID path: %m");
04bc4a3f
LP
1702
1703 r = sd_id128_randomize(&rnd);
f647962d
MS
1704 if (r < 0)
1705 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1706
cdde6ba6 1707 r = id128_write(path, ID128_UUID, rnd, false);
f647962d
MS
1708 if (r < 0)
1709 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1710
cdde6ba6
LP
1711 from = TAKE_PTR(path);
1712 to = "/proc/sys/kernel/random/boot_id";
1713
60e76d48 1714 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
cdde6ba6
LP
1715 if (r < 0)
1716 return r;
04bc4a3f 1717
cdde6ba6 1718 return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
04bc4a3f
LP
1719}
1720
e58a1277 1721static int copy_devnodes(const char *dest) {
88213476
LP
1722 static const char devnodes[] =
1723 "null\0"
1724 "zero\0"
1725 "full\0"
1726 "random\0"
1727 "urandom\0"
85614d66
TG
1728 "tty\0"
1729 "net/tun\0";
88213476
LP
1730
1731 const char *d;
e58a1277 1732 int r = 0;
7fd1b19b 1733 _cleanup_umask_ mode_t u;
a258bf26
LP
1734
1735 assert(dest);
124640f1
LP
1736
1737 u = umask(0000);
88213476 1738
03cfe0d5
LP
1739 /* Create /dev/net, so that we can create /dev/net/tun in it */
1740 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1741 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1742
88213476 1743 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1744 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1745 struct stat st;
88213476 1746
7f112f50 1747 from = strappend("/dev/", d);
8967f291
LP
1748 if (!from)
1749 return log_oom();
1750
03cfe0d5 1751 to = prefix_root(dest, from);
8967f291
LP
1752 if (!to)
1753 return log_oom();
88213476
LP
1754
1755 if (stat(from, &st) < 0) {
1756
4a62c710
MS
1757 if (errno != ENOENT)
1758 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1759
baaa35ad
ZJS
1760 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
1761 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1762 "%s is not a char or block device, cannot copy.", from);
1763 else {
8dfce114
LP
1764 _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
1765
81f5049b 1766 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1767 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1768 if (errno == EEXIST)
8dbf71ec 1769 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1770 if (errno != EPERM)
1771 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1772
8dfce114 1773 /* Some systems abusively restrict mknod but allow bind mounts. */
81f5049b
AC
1774 r = touch(to);
1775 if (r < 0)
1776 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1777 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1778 if (r < 0)
1779 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1780 }
6278cf60 1781
03cfe0d5
LP
1782 r = userns_lchown(to, 0, 0);
1783 if (r < 0)
1784 return log_error_errno(r, "chown() of device node %s failed: %m", to);
8dfce114
LP
1785
1786 dn = strjoin("/dev/", S_ISCHR(st.st_mode) ? "char" : "block");
1787 if (!dn)
1788 return log_oom();
1789
1790 r = userns_mkdir(dest, dn, 0755, 0, 0);
1791 if (r < 0)
1792 return log_error_errno(r, "Failed to create '%s': %m", dn);
1793
1794 if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
1795 return log_oom();
1796
1797 prefixed = prefix_root(dest, sl);
1798 if (!prefixed)
1799 return log_oom();
1800
1801 t = strjoin("../", d);
1802 if (!t)
1803 return log_oom();
1804
1805 if (symlink(t, prefixed) < 0)
1806 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
88213476 1807 }
88213476
LP
1808 }
1809
e58a1277
LP
1810 return r;
1811}
88213476 1812
03cfe0d5
LP
1813static int setup_pts(const char *dest) {
1814 _cleanup_free_ char *options = NULL;
1815 const char *p;
709f6e46 1816 int r;
03cfe0d5 1817
349cc4a5 1818#if HAVE_SELINUX
03cfe0d5
LP
1819 if (arg_selinux_apifs_context)
1820 (void) asprintf(&options,
3dce8915 1821 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1822 arg_uid_shift + TTY_GID,
1823 arg_selinux_apifs_context);
1824 else
1825#endif
1826 (void) asprintf(&options,
3dce8915 1827 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1828 arg_uid_shift + TTY_GID);
f2d88580 1829
03cfe0d5 1830 if (!options)
f2d88580
LP
1831 return log_oom();
1832
03cfe0d5 1833 /* Mount /dev/pts itself */
cc9fce65 1834 p = prefix_roota(dest, "/dev/pts");
dae8b82e
ZJS
1835 r = mkdir_errno_wrapper(p, 0755);
1836 if (r < 0)
1837 return log_error_errno(r, "Failed to create /dev/pts: %m");
1838
60e76d48
ZJS
1839 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1840 if (r < 0)
1841 return r;
709f6e46
MS
1842 r = userns_lchown(p, 0, 0);
1843 if (r < 0)
1844 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1845
1846 /* Create /dev/ptmx symlink */
1847 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1848 if (symlink("pts/ptmx", p) < 0)
1849 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1850 r = userns_lchown(p, 0, 0);
1851 if (r < 0)
1852 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1853
03cfe0d5
LP
1854 /* And fix /dev/pts/ptmx ownership */
1855 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1856 r = userns_lchown(p, 0, 0);
1857 if (r < 0)
1858 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1859
f2d88580
LP
1860 return 0;
1861}
1862
e58a1277 1863static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1864 _cleanup_umask_ mode_t u;
1865 const char *to;
e58a1277 1866 int r;
e58a1277
LP
1867
1868 assert(dest);
1869 assert(console);
1870
1871 u = umask(0000);
1872
03cfe0d5 1873 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1874 if (r < 0)
1875 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1876
a258bf26
LP
1877 /* We need to bind mount the right tty to /dev/console since
1878 * ptys can only exist on pts file systems. To have something
81f5049b 1879 * to bind mount things on we create a empty regular file. */
a258bf26 1880
03cfe0d5 1881 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1882 r = touch(to);
1883 if (r < 0)
1884 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1885
60e76d48 1886 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1887}
1888
8e5430c4
LP
1889static int setup_keyring(void) {
1890 key_serial_t keyring;
1891
1892 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1893 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1894 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1895 * these system calls let's make sure we don't leak anything into the container. */
1896
1897 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1898 if (keyring == -1) {
1899 if (errno == ENOSYS)
1900 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1901 else if (IN_SET(errno, EACCES, EPERM))
1902 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1903 else
1904 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1905 }
1906
1907 return 0;
1908}
1909
1e4f1671 1910static int setup_kmsg(int kmsg_socket) {
9ec5a93c
LP
1911 _cleanup_(unlink_and_freep) char *from = NULL;
1912 _cleanup_free_ char *fifo = NULL;
1913 _cleanup_close_ int fd = -1;
7fd1b19b 1914 _cleanup_umask_ mode_t u;
9ec5a93c
LP
1915 const char *to;
1916 int r;
e58a1277 1917
e58a1277 1918 assert(kmsg_socket >= 0);
a258bf26 1919
e58a1277 1920 u = umask(0000);
a258bf26 1921
9ec5a93c
LP
1922 /* We create the kmsg FIFO as as temporary file in /tmp, but immediately delete it after bind mounting it to
1923 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
1924 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
1925 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
1926
1927 r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
1928 if (r < 0)
1929 return log_error_errno(r, "Failed to generate kmsg path: %m");
e58a1277 1930
9ec5a93c 1931 if (mkfifo(fifo, 0600) < 0)
03cfe0d5 1932 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
9ec5a93c
LP
1933
1934 from = TAKE_PTR(fifo);
1935 to = "/proc/kmsg";
1936
60e76d48
ZJS
1937 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1938 if (r < 0)
1939 return r;
e58a1277 1940
669fc4e5 1941 fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
4a62c710
MS
1942 if (fd < 0)
1943 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1944
9ec5a93c 1945 /* Store away the fd in the socket, so that it stays open as long as we run the child */
3ee897d6 1946 r = send_one_fd(kmsg_socket, fd, 0);
d9603714
DH
1947 if (r < 0)
1948 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1949
25ea79fe 1950 return 0;
88213476
LP
1951}
1952
1c4baffc 1953static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1954 union in_addr_union *exposed = userdata;
1955
1956 assert(rtnl);
1957 assert(m);
1958 assert(exposed);
1959
7a8f6325 1960 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1961 return 0;
1962}
1963
3a74cea5 1964static int setup_hostname(void) {
c818eef1 1965 int r;
3a74cea5 1966
0c582db0 1967 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1968 return 0;
1969
c818eef1
LP
1970 r = sethostname_idempotent(arg_hostname ?: arg_machine);
1971 if (r < 0)
1972 return log_error_errno(r, "Failed to set hostname: %m");
3a74cea5 1973
7027ff61 1974 return 0;
3a74cea5
LP
1975}
1976
57fb9fb5 1977static int setup_journal(const char *directory) {
e01ff70a 1978 sd_id128_t this_id;
0f5e1382 1979 _cleanup_free_ char *d = NULL;
e01ff70a 1980 const char *p, *q;
8054d749 1981 bool try;
369ca6da 1982 char id[33], *dirname;
57fb9fb5
LP
1983 int r;
1984
df9a75e4
LP
1985 /* Don't link journals in ephemeral mode */
1986 if (arg_ephemeral)
1987 return 0;
1988
8054d749
LP
1989 if (arg_link_journal == LINK_NO)
1990 return 0;
1991
1992 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1993
4d680aee 1994 r = sd_id128_get_machine(&this_id);
f647962d
MS
1995 if (r < 0)
1996 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1997
e01ff70a 1998 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1999 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 2000 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 2001 if (try)
4d680aee 2002 return 0;
df9a75e4 2003 return -EEXIST;
4d680aee
ZJS
2004 }
2005
369ca6da
ZJS
2006 FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2007 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2008 if (r < 0) {
2009 bool ignore = r == -EROFS && try;
2010 log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2011 "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2012 return ignore ? 0 : r;
2013 }
2014 }
03cfe0d5 2015
e01ff70a
MS
2016 (void) sd_id128_to_string(arg_uuid, id);
2017
03cfe0d5
LP
2018 p = strjoina("/var/log/journal/", id);
2019 q = prefix_roota(directory, p);
27407a01 2020
e1873695 2021 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
2022 if (try)
2023 return 0;
27407a01 2024
baaa35ad
ZJS
2025 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2026 "%s: already a mount point, refusing to use for journal", p);
57fb9fb5
LP
2027 }
2028
e1873695 2029 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
2030 if (try)
2031 return 0;
57fb9fb5 2032
baaa35ad
ZJS
2033 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2034 "%s: already a mount point, refusing to use for journal", q);
57fb9fb5
LP
2035 }
2036
2037 r = readlink_and_make_absolute(p, &d);
2038 if (r >= 0) {
3742095b 2039 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
2040 path_equal(d, q)) {
2041
03cfe0d5 2042 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2043 if (r < 0)
709f6e46 2044 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2045 return 0;
57fb9fb5
LP
2046 }
2047
4a62c710
MS
2048 if (unlink(p) < 0)
2049 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2050 } else if (r == -EINVAL) {
2051
2052 if (arg_link_journal == LINK_GUEST &&
2053 rmdir(p) < 0) {
2054
27407a01
ZJS
2055 if (errno == ENOTDIR) {
2056 log_error("%s already exists and is neither a symlink nor a directory", p);
2057 return r;
4314d33f
MS
2058 } else
2059 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 2060 }
4314d33f
MS
2061 } else if (r != -ENOENT)
2062 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
2063
2064 if (arg_link_journal == LINK_GUEST) {
2065
2066 if (symlink(q, p) < 0) {
8054d749 2067 if (try) {
56f64d95 2068 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 2069 return 0;
4314d33f
MS
2070 } else
2071 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
2072 }
2073
03cfe0d5 2074 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2075 if (r < 0)
709f6e46 2076 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 2077 return 0;
57fb9fb5
LP
2078 }
2079
2080 if (arg_link_journal == LINK_HOST) {
ccddd104 2081 /* don't create parents here — if the host doesn't have
574edc90 2082 * permanent journal set up, don't force it here */
ba8e6c4d 2083
dae8b82e
ZJS
2084 r = mkdir_errno_wrapper(p, 0755);
2085 if (r < 0 && r != -EEXIST) {
8054d749 2086 if (try) {
dae8b82e 2087 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
574edc90 2088 return 0;
4314d33f 2089 } else
dae8b82e 2090 return log_error_errno(r, "Failed to create %s: %m", p);
57fb9fb5
LP
2091 }
2092
27407a01
ZJS
2093 } else if (access(p, F_OK) < 0)
2094 return 0;
57fb9fb5 2095
cdb2b9d0
LP
2096 if (dir_is_empty(q) == 0)
2097 log_warning("%s is not empty, proceeding anyway.", q);
2098
03cfe0d5 2099 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
2100 if (r < 0)
2101 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 2102
60e76d48
ZJS
2103 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2104 if (r < 0)
4a62c710 2105 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2106
27407a01 2107 return 0;
57fb9fb5
LP
2108}
2109
88213476 2110static int drop_capabilities(void) {
520e0d54 2111 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
2112}
2113
db999e0f
LP
2114static int reset_audit_loginuid(void) {
2115 _cleanup_free_ char *p = NULL;
2116 int r;
2117
0c582db0 2118 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
2119 return 0;
2120
2121 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2122 if (r == -ENOENT)
db999e0f 2123 return 0;
f647962d
MS
2124 if (r < 0)
2125 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2126
2127 /* Already reset? */
2128 if (streq(p, "4294967295"))
2129 return 0;
2130
57512c89 2131 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
db999e0f 2132 if (r < 0) {
10a87006
LP
2133 log_error_errno(r,
2134 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2135 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2138 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2139
db999e0f 2140 sleep(5);
77b6e194 2141 }
db999e0f
LP
2142
2143 return 0;
77b6e194
LP
2144}
2145
785890ac
LP
2146static int setup_propagate(const char *root) {
2147 const char *p, *q;
709f6e46 2148 int r;
785890ac
LP
2149
2150 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2151 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2152 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2153 (void) mkdir_p(p, 0600);
2154
709f6e46
MS
2155 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 2158
709f6e46
MS
2159 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 2162
709f6e46
MS
2163 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2166
03cfe0d5 2167 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
2168 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2169 if (r < 0)
2170 return r;
785890ac 2171
60e76d48
ZJS
2172 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2173 if (r < 0)
2174 return r;
785890ac 2175
19caffac
AC
2176 /* machined will MS_MOVE into that directory, and that's only
2177 * supported for non-shared mounts. */
60e76d48 2178 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
2179}
2180
317feb4d 2181static int setup_machine_id(const char *directory) {
691675ba
LP
2182 const char *etc_machine_id;
2183 sd_id128_t id;
3bbaff3e 2184 int r;
e01ff70a 2185
317feb4d
LP
2186 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2187 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2188 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2189 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2190 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2191 * container behaves nicely). */
2192
e01ff70a
MS
2193 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2194
691675ba 2195 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2196 if (r < 0) {
2197 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2198 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2199
317feb4d
LP
2200 if (sd_id128_is_null(arg_uuid)) {
2201 r = sd_id128_randomize(&arg_uuid);
2202 if (r < 0)
2203 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2204 }
2205 } else {
baaa35ad
ZJS
2206 if (sd_id128_is_null(id))
2207 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2208 "Machine ID in container image is zero, refusing.");
e01ff70a 2209
317feb4d
LP
2210 arg_uuid = id;
2211 }
691675ba 2212
e01ff70a
MS
2213 return 0;
2214}
2215
7336138e
LP
2216static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2217 int r;
2218
2219 assert(directory);
2220
0de7acce 2221 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2222 return 0;
2223
2224 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2225 if (r == -EOPNOTSUPP)
2226 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2227 if (r == -EBADE)
2228 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2229 if (r < 0)
2230 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2231 if (r == 0)
2232 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2233 else
2234 log_debug("Patched directory tree to match UID/GID range.");
2235
2236 return r;
2237}
2238
113cea80 2239/*
6d416b9c
LS
2240 * Return values:
2241 * < 0 : wait_for_terminate() failed to get the state of the
2242 * container, the container was terminated by a signal, or
2243 * failed for an unknown reason. No change is made to the
2244 * container argument.
2245 * > 0 : The program executed in the container terminated with an
2246 * error. The exit code of the program executed in the
919699ec
LP
2247 * container is returned. The container argument has been set
2248 * to CONTAINER_TERMINATED.
6d416b9c
LS
2249 * 0 : The container is being rebooted, has been shut down or exited
2250 * successfully. The container argument has been set to either
2251 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2252 *
6d416b9c
LS
2253 * That is, success is indicated by a return value of zero, and an
2254 * error is indicated by a non-zero value.
113cea80
DH
2255 */
2256static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2257 siginfo_t status;
919699ec 2258 int r;
113cea80
DH
2259
2260 r = wait_for_terminate(pid, &status);
f647962d
MS
2261 if (r < 0)
2262 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2263
2264 switch (status.si_code) {
fddbb89c 2265
113cea80 2266 case CLD_EXITED:
b5a2179b 2267 if (status.si_status == 0)
919699ec 2268 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2269 else
919699ec 2270 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2271
919699ec
LP
2272 *container = CONTAINER_TERMINATED;
2273 return status.si_status;
113cea80
DH
2274
2275 case CLD_KILLED:
2276 if (status.si_status == SIGINT) {
919699ec 2277 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2278 *container = CONTAINER_TERMINATED;
919699ec
LP
2279 return 0;
2280
113cea80 2281 } else if (status.si_status == SIGHUP) {
919699ec 2282 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2283 *container = CONTAINER_REBOOTED;
919699ec 2284 return 0;
113cea80 2285 }
919699ec 2286
4831981d 2287 _fallthrough_;
113cea80 2288 case CLD_DUMPED:
baaa35ad
ZJS
2289 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2290 "Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
113cea80
DH
2291
2292 default:
baaa35ad
ZJS
2293 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2294 "Container %s failed due to unknown reason.", arg_machine);
113cea80 2295 }
113cea80
DH
2296}
2297
023fb90b
LP
2298static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2299 pid_t pid;
2300
4a0b58c4 2301 pid = PTR_TO_PID(userdata);
023fb90b 2302 if (pid > 0) {
c6c8f6e2 2303 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2304 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2305 sd_event_source_set_userdata(s, NULL);
2306 return 0;
2307 }
2308 }
2309
2310 sd_event_exit(sd_event_source_get_event(s), 0);
2311 return 0;
2312}
2313
6916b164 2314static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
abdb9b08
LP
2315 pid_t pid;
2316
2317 assert(s);
2318 assert(ssi);
2319
2320 pid = PTR_TO_PID(userdata);
2321
6916b164
AU
2322 for (;;) {
2323 siginfo_t si = {};
abdb9b08 2324
6916b164
AU
2325 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2326 return log_error_errno(errno, "Failed to waitid(): %m");
2327 if (si.si_pid == 0) /* No pending children. */
2328 break;
abdb9b08 2329 if (si.si_pid == pid) {
6916b164
AU
2330 /* The main process we care for has exited. Return from
2331 * signal handler but leave the zombie. */
2332 sd_event_exit(sd_event_source_get_event(s), 0);
2333 break;
2334 }
abdb9b08 2335
6916b164
AU
2336 /* Reap all other children. */
2337 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2338 }
2339
2340 return 0;
2341}
2342
abdb9b08
LP
2343static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2344 pid_t pid;
2345
2346 assert(m);
2347
2348 pid = PTR_TO_PID(userdata);
2349
2350 if (arg_kill_signal > 0) {
2351 log_info("Container termination requested. Attempting to halt container.");
2352 (void) kill(pid, arg_kill_signal);
2353 } else {
2354 log_info("Container termination requested. Exiting.");
2355 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2356 }
2357
2358 return 0;
2359}
2360
ec16945e 2361static int determine_names(void) {
1b9cebf6 2362 int r;
ec16945e 2363
c1521918
LP
2364 if (arg_template && !arg_directory && arg_machine) {
2365
2366 /* If --template= was specified then we should not
2367 * search for a machine, but instead create a new one
2368 * in /var/lib/machine. */
2369
605405c6 2370 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2371 if (!arg_directory)
2372 return log_oom();
2373 }
2374
ec16945e 2375 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2376 if (arg_machine) {
2377 _cleanup_(image_unrefp) Image *i = NULL;
2378
5ef46e5f 2379 r = image_find(IMAGE_MACHINE, arg_machine, &i);
3a6ce860
LP
2380 if (r == -ENOENT)
2381 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1b9cebf6 2384
eb38edce 2385 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2386 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2387 else
0f03c2a4 2388 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2389 if (r < 0)
0f3be6ca 2390 return log_oom();
1b9cebf6 2391
aee327b8
LP
2392 if (!arg_ephemeral)
2393 arg_read_only = arg_read_only || i->read_only;
d7249575
LP
2394 } else {
2395 r = safe_getcwd(&arg_directory);
2396 if (r < 0)
2397 return log_error_errno(r, "Failed to determine current directory: %m");
2398 }
ec16945e 2399
0f3be6ca 2400 if (!arg_directory && !arg_image) {
1b9cebf6 2401 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2402 return -EINVAL;
2403 }
2404 }
2405
2406 if (!arg_machine) {
b9ba4dab
LP
2407 if (arg_directory && path_equal(arg_directory, "/"))
2408 arg_machine = gethostname_malloc();
4827ab48
LP
2409 else {
2410 if (arg_image) {
2411 char *e;
2412
2413 arg_machine = strdup(basename(arg_image));
2414
2415 /* Truncate suffix if there is one */
2416 e = endswith(arg_machine, ".raw");
2417 if (e)
2418 *e = 0;
2419 } else
2420 arg_machine = strdup(basename(arg_directory));
2421 }
ec16945e
LP
2422 if (!arg_machine)
2423 return log_oom();
2424
ae691c1d 2425 hostname_cleanup(arg_machine);
ec16945e
LP
2426 if (!machine_name_is_valid(arg_machine)) {
2427 log_error("Failed to determine machine name automatically, please use -M.");
2428 return -EINVAL;
2429 }
b9ba4dab
LP
2430
2431 if (arg_ephemeral) {
2432 char *b;
2433
2434 /* Add a random suffix when this is an
2435 * ephemeral machine, so that we can run many
2436 * instances at once without manually having
2437 * to specify -M each time. */
2438
2439 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2440 return log_oom();
2441
2442 free(arg_machine);
2443 arg_machine = b;
2444 }
ec16945e
LP
2445 }
2446
2447 return 0;
2448}
2449
/* Resolves the path stored in *p with chase_symlinks() and replaces *p with the resolved path. A NULL
 * *p is left alone. Returns 0 if there was nothing to do, a negative errno-style value on failure, and
 * otherwise whatever chase_symlinks() returned. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free_and_replace(*p, resolved);

                /* r might be an fd here in case we ever use CHASE_OPEN in flags */
                return r;
        }

        return 0;
}
2466
03cfe0d5 2467static int determine_uid_shift(const char *directory) {
6dac160c
LP
2468 int r;
2469
0de7acce 2470 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2471 arg_uid_shift = 0;
6dac160c 2472 return 0;
03cfe0d5 2473 }
6dac160c
LP
2474
2475 if (arg_uid_shift == UID_INVALID) {
2476 struct stat st;
2477
03cfe0d5 2478 r = stat(directory, &st);
6dac160c 2479 if (r < 0)
03cfe0d5 2480 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2481
2482 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2483
baaa35ad
ZJS
2484 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
2485 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2486 "UID and GID base of %s don't match.", directory);
6dac160c
LP
2487
2488 arg_uid_range = UINT32_C(0x10000);
2489 }
2490
baaa35ad
ZJS
2491 if (arg_uid_shift > (uid_t) -1 - arg_uid_range)
2492 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2493 "UID base too high for UID range.");
6dac160c 2494
6dac160c
LP
2495 return 0;
2496}
2497
03cfe0d5
LP
2498static int inner_child(
2499 Barrier *barrier,
2500 const char *directory,
2501 bool secondary,
2502 int kmsg_socket,
2503 int rtnl_socket,
f757855e 2504 FDSet *fds) {
69c79d3c 2505
03cfe0d5 2506 _cleanup_free_ char *home = NULL;
e01ff70a 2507 char as_uuid[37];
88614c8a 2508 size_t n_env = 1;
03cfe0d5 2509 const char *envp[] = {
0c300adf 2510 "PATH=" DEFAULT_PATH_COMPAT,
6aadfa4c 2511 NULL, /* container */
03cfe0d5
LP
2512 NULL, /* TERM */
2513 NULL, /* HOME */
2514 NULL, /* USER */
2515 NULL, /* LOGNAME */
2516 NULL, /* container_uuid */
2517 NULL, /* LISTEN_FDS */
2518 NULL, /* LISTEN_PID */
9c1e04d0 2519 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2520 NULL
2521 };
1a68e1e5 2522 const char *exec_target;
2371271c 2523 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2524 int r;
88213476 2525
b37469d7
LP
2526 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
2527 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
2528 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
2529 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
2530 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
2531 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
2532 * namespace.
2533 *
2534 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
2535 * unshare(). See below. */
2536
03cfe0d5
LP
2537 assert(barrier);
2538 assert(directory);
2539 assert(kmsg_socket >= 0);
88213476 2540
0de7acce 2541 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2542 /* Tell the parent, that it now can write the UID map. */
2543 (void) barrier_place(barrier); /* #1 */
7027ff61 2544
03cfe0d5 2545 /* Wait until the parent wrote the UID map */
baaa35ad
ZJS
2546 if (!barrier_place_and_sync(barrier)) /* #2 */
2547 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2548 "Parent died too early");
88213476
LP
2549 }
2550
6d66bd3b
EV
2551 r = reset_uid_gid();
2552 if (r < 0)
2553 return log_error_errno(r, "Couldn't become new root: %m");
2554
0de7acce 2555 r = mount_all(NULL,
4f086aab 2556 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce 2557 arg_uid_shift,
0de7acce 2558 arg_selinux_apifs_context);
03cfe0d5
LP
2559 if (r < 0)
2560 return r;
2561
04413780
ZJS
2562 if (!arg_network_namespace_path && arg_private_network) {
2563 r = unshare(CLONE_NEWNET);
2564 if (r < 0)
2565 return log_error_errno(errno, "Failed to unshare network namespace: %m");
75116558
PS
2566
2567 /* Tell the parent that it can setup network interfaces. */
2568 (void) barrier_place(barrier); /* #3 */
04413780
ZJS
2569 }
2570
4f086aab 2571 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2572 if (r < 0)
2573 return r;
2574
03cfe0d5
LP
2575 /* Wait until we are cgroup-ified, so that we
2576 * can mount the right cgroup path writable */
baaa35ad
ZJS
2577 if (!barrier_place_and_sync(barrier)) /* #4 */
2578 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2579 "Parent died too early");
88213476 2580
5a8ff0e6 2581 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2582 r = unshare(CLONE_NEWCGROUP);
2583 if (r < 0)
04413780 2584 return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
0996ef00
CB
2585 r = mount_cgroups(
2586 "",
2587 arg_unified_cgroup_hierarchy,
2588 arg_userns_mode != USER_NAMESPACE_NO,
2589 arg_uid_shift,
2590 arg_uid_range,
5a8ff0e6 2591 arg_selinux_apifs_context,
ada54120 2592 true);
0996ef00
CB
2593 if (r < 0)
2594 return r;
2595 } else {
2596 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2597 if (r < 0)
2598 return r;
2599 }
ec16945e 2600
1e4f1671 2601 r = setup_boot_id();
03cfe0d5
LP
2602 if (r < 0)
2603 return r;
ec16945e 2604
1e4f1671 2605 r = setup_kmsg(kmsg_socket);
03cfe0d5
LP
2606 if (r < 0)
2607 return r;
2608 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2609
03cfe0d5
LP
2610 if (setsid() < 0)
2611 return log_error_errno(errno, "setsid() failed: %m");
2612
2613 if (arg_private_network)
2614 loopback_setup();
2615
7a8f6325
LP
2616 if (arg_expose_ports) {
2617 r = expose_port_send_rtnl(rtnl_socket);
2618 if (r < 0)
2619 return r;
2620 rtnl_socket = safe_close(rtnl_socket);
2621 }
03cfe0d5 2622
81f345df
LP
2623 if (arg_oom_score_adjust_set) {
2624 r = set_oom_score_adjust(arg_oom_score_adjust);
2625 if (r < 0)
2626 return log_error_errno(r, "Failed to adjust OOM score: %m");
2627 }
2628
d107bb7d
LP
2629 if (arg_cpuset)
2630 if (sched_setaffinity(0, CPU_ALLOC_SIZE(arg_cpuset_ncpus), arg_cpuset) < 0)
2631 return log_error_errno(errno, "Failed to set CPU affinity: %m");
2632
709f6e46
MS
2633 r = drop_capabilities();
2634 if (r < 0)
2635 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5 2636
c818eef1 2637 (void) setup_hostname();
03cfe0d5 2638
050f7277 2639 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2640 r = safe_personality(arg_personality);
2641 if (r < 0)
2642 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2643 } else if (secondary) {
21022b9d
LP
2644 r = safe_personality(PER_LINUX32);
2645 if (r < 0)
2646 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2647 }
2648
349cc4a5 2649#if HAVE_SELINUX
03cfe0d5 2650 if (arg_selinux_context)
2ed96880 2651 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2652 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2653#endif
2654
ee645080 2655 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2656 if (r < 0)
2657 return r;
2658
66edd963
LP
2659 if (arg_no_new_privileges)
2660 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
2661 return log_error_errno(errno, "Failed to disable new privileges: %m");
2662
6aadfa4c
ILG
2663 /* LXC sets container=lxc, so follow the scheme here */
2664 envp[n_env++] = strjoina("container=", arg_container_service_name);
2665
03cfe0d5
LP
2666 envp[n_env] = strv_find_prefix(environ, "TERM=");
2667 if (envp[n_env])
313cefa1 2668 n_env++;
03cfe0d5
LP
2669
2670 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2671 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2672 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2673 return log_oom();
2674
3bbaff3e 2675 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2676
691675ba 2677 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2678 return log_oom();
03cfe0d5
LP
2679
2680 if (fdset_size(fds) > 0) {
2681 r = fdset_cloexec(fds, false);
2682 if (r < 0)
2683 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2684
2685 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2686 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2687 return log_oom();
2688 }
9c1e04d0
AP
2689 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2690 return log_oom();
03cfe0d5 2691
2371271c
TG
2692 env_use = strv_env_merge(2, envp, arg_setenv);
2693 if (!env_use)
2694 return log_oom();
03cfe0d5
LP
2695
2696 /* Let the parent know that we are ready and
2697 * wait until the parent is ready with the
2698 * setup, too... */
baaa35ad
ZJS
2699 if (!barrier_place_and_sync(barrier)) /* #5 */
2700 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
2701 "Parent died too early");
03cfe0d5 2702
5f932eb9
LP
2703 if (arg_chdir)
2704 if (chdir(arg_chdir) < 0)
2705 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2706
7732f92b 2707 if (arg_start_mode == START_PID2) {
75bf701f 2708 r = stub_pid1(arg_uuid);
7732f92b
LP
2709 if (r < 0)
2710 return r;
2711 }
2712
8ca082b4
LP
2713 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
2714 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
2715 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
03cfe0d5 2716 log_close();
8ca082b4
LP
2717 log_set_open_when_needed(true);
2718
03cfe0d5
LP
2719 (void) fdset_close_others(fds);
2720
7732f92b 2721 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2722 char **a;
2723 size_t m;
2724
2725 /* Automatically search for the init system */
2726
75f32f04
ZJS
2727 m = strv_length(arg_parameters);
2728 a = newa(char*, m + 2);
2729 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2730 a[1 + m] = NULL;
03cfe0d5 2731
ced58da7 2732 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2733 execve(a[0], a, env_use);
2734
ced58da7 2735 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2736 execve(a[0], a, env_use);
2737
ced58da7 2738 a[0] = (char*) "/sbin/init";
03cfe0d5 2739 execve(a[0], a, env_use);
ced58da7
LP
2740
2741 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5 2742 } else if (!strv_isempty(arg_parameters)) {
b6b180b7
LP
2743 const char *dollar_path;
2744
1a68e1e5 2745 exec_target = arg_parameters[0];
b6b180b7
LP
2746
2747 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
2748 * binary. */
2749 dollar_path = strv_env_get(env_use, "PATH");
2750 if (dollar_path) {
2751 if (putenv((char*) dollar_path) != 0)
2752 return log_error_errno(errno, "Failed to update $PATH: %m");
2753 }
2754
f757855e 2755 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2756 } else {
5f932eb9 2757 if (!arg_chdir)
d929b0f9
ZJS
2758 /* If we cannot change the directory, we'll end up in /, that is expected. */
2759 (void) chdir(home ?: "/root");
5f932eb9 2760
03cfe0d5
LP
2761 execle("/bin/bash", "-bash", NULL, env_use);
2762 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2763
2764 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2765 }
2766
8ca082b4 2767 return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2768}
2769
9c1e04d0 2770static int setup_sd_notify_child(void) {
271f518f 2771 _cleanup_close_ int fd = -1;
9c1e04d0 2772 union sockaddr_union sa = {
44ed5214
LP
2773 .un.sun_family = AF_UNIX,
2774 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
9c1e04d0
AP
2775 };
2776 int r;
2777
2778 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2779 if (fd < 0)
2780 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2781
2782 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
fbda85b0 2783 (void) sockaddr_un_unlink(&sa.un);
9c1e04d0 2784
9c1e04d0 2785 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
271f518f 2786 if (r < 0)
44ed5214 2787 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
9c1e04d0 2788
adc7d9f0 2789 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
271f518f 2790 if (r < 0)
adc7d9f0 2791 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
adc7d9f0 2792
2ff48e98 2793 r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
271f518f 2794 if (r < 0)
2ff48e98 2795 return log_error_errno(r, "SO_PASSCRED failed: %m");
9c1e04d0 2796
271f518f 2797 return TAKE_FD(fd);
9c1e04d0
AP
2798}
2799
03cfe0d5
LP
2800static int outer_child(
2801 Barrier *barrier,
2802 const char *directory,
2803 const char *console,
2d845785 2804 DissectedImage *dissected_image,
03cfe0d5
LP
2805 bool interactive,
2806 bool secondary,
2807 int pid_socket,
e01ff70a 2808 int uuid_socket,
9c1e04d0 2809 int notify_socket,
03cfe0d5
LP
2810 int kmsg_socket,
2811 int rtnl_socket,
825d5287 2812 int uid_shift_socket,
8199d554 2813 int unified_cgroup_hierarchy_socket,
d7bea6b6
DP
2814 FDSet *fds,
2815 int netns_fd) {
03cfe0d5 2816
bf428efb
LP
2817 _cleanup_close_ int fd = -1;
2818 int r, which_failed;
03cfe0d5
LP
2819 pid_t pid;
2820 ssize_t l;
03cfe0d5 2821
b37469d7
LP
2822 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
2823 * its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
2824 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
2825 * initializations a second child (the "inner" one) is forked off it, and it exits. */
2826
03cfe0d5
LP
2827 assert(barrier);
2828 assert(directory);
2829 assert(console);
2830 assert(pid_socket >= 0);
e01ff70a 2831 assert(uuid_socket >= 0);
9c1e04d0 2832 assert(notify_socket >= 0);
03cfe0d5
LP
2833 assert(kmsg_socket >= 0);
2834
2835 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2836 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2837
2838 if (interactive) {
2b33ab09 2839 int terminal;
03cfe0d5 2840
2b33ab09
LP
2841 terminal = open_terminal(console, O_RDWR);
2842 if (terminal < 0)
2843 return log_error_errno(terminal, "Failed to open console: %m");
03cfe0d5 2844
17cac366
LP
2845 /* Make sure we can continue logging to the original stderr, even if stderr points elsewhere now */
2846 r = log_dup_console();
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to duplicate stderr: %m");
2849
2b33ab09
LP
2850 r = rearrange_stdio(terminal, terminal, terminal); /* invalidates 'terminal' on success and failure */
2851 if (r < 0)
2852 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
03cfe0d5
LP
2853 }
2854
2855 r = reset_audit_loginuid();
2856 if (r < 0)
2857 return r;
2858
2859 /* Mark everything as slave, so that we still
2860 * receive mounts from the real root, but don't
2861 * propagate mounts to the real root. */
60e76d48
ZJS
2862 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2863 if (r < 0)
2864 return r;
03cfe0d5 2865
2d845785 2866 if (dissected_image) {
2d3a5a73
LP
2867 /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
2868 * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
2869 * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
2870 * makes sure ESP partitions and userns are compatible. */
2871
2872 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
03bcb6d4
LP
2873 DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|
2874 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)|
2875 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
2d845785
LP
2876 if (r < 0)
2877 return r;
2878 }
03cfe0d5 2879
391567f4
LP
2880 r = determine_uid_shift(directory);
2881 if (r < 0)
2882 return r;
2883
0de7acce 2884 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2885 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2886 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2887 if (l < 0)
2888 return log_error_errno(errno, "Failed to send UID shift: %m");
baaa35ad
ZJS
2889 if (l != sizeof(arg_uid_shift))
2890 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2891 "Short write while sending UID shift.");
0e7ac751 2892
0de7acce 2893 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2894 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2895 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2896 * not it will pick a different one, and send it back to us. */
2897
2898 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2899 if (l < 0)
2900 return log_error_errno(errno, "Failed to recv UID shift: %m");
baaa35ad
ZJS
2901 if (l != sizeof(arg_uid_shift))
2902 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2903 "Short read while receiving UID shift.");
0e7ac751
LP
2904 }
2905
ff6c6cc1
LP
2906 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
2907 "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2908 }
2909
2d3a5a73
LP
2910 if (dissected_image) {
2911 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
2912 r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
2913 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2914 if (r < 0)
2915 return r;
2916 }
2917
8199d554
LP
2918 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
2919 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
2920
2921 r = detect_unified_cgroup_hierarchy_from_image(directory);
2922 if (r < 0)
2923 return r;
2924
2925 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
2926 if (l < 0)
2927 return log_error_errno(errno, "Failed to send cgroup mode: %m");
baaa35ad
ZJS
2928 if (l != sizeof(arg_unified_cgroup_hierarchy))
2929 return log_error_errno(SYNTHETIC_ERRNO(EIO),
2930 "Short write while sending cgroup mode.");
8199d554
LP
2931
2932 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
2933 }
2934
03cfe0d5 2935 /* Turn directory into bind mount */
60e76d48
ZJS
2936 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2937 if (r < 0)
2938 return r;
03cfe0d5 2939
b53ede69
PW
2940 r = setup_pivot_root(
2941 directory,
2942 arg_pivot_root_new,
2943 arg_pivot_root_old);
2944 if (r < 0)
2945 return r;
2946
0de7acce
LP
2947 r = setup_volatile(
2948 directory,
2949 arg_volatile_mode,
2950 arg_userns_mode != USER_NAMESPACE_NO,
2951 arg_uid_shift,
2952 arg_uid_range,
2953 arg_selinux_context);
03cfe0d5
LP
2954 if (r < 0)
2955 return r;
2956
0de7acce
LP
2957 r = setup_volatile_state(
2958 directory,
2959 arg_volatile_mode,
2960 arg_userns_mode != USER_NAMESPACE_NO,
2961 arg_uid_shift,
2962 arg_uid_range,
2963 arg_selinux_context);
03cfe0d5
LP
2964 if (r < 0)
2965 return r;
2966
4ad14eff
LP
2967 /* Mark everything as shared so our mounts get propagated down. This is
2968 * required to make new bind mounts available in systemd services
2969 * inside the containter that create a new mount namespace.
2970 * See https://github.com/systemd/systemd/issues/3860
2971 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2972 * shared propagation mode. */
4ad14eff
LP
2973 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2974 if (r < 0)
2975 return r;
2976
2977 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2978 if (r < 0)
2979 return r;
2980
03cfe0d5
LP
2981 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2982 if (r < 0)
2983 return r;
2984
03cfe0d5 2985 if (arg_read_only) {
6b7c9f8b 2986 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2987 if (r < 0)
2988 return log_error_errno(r, "Failed to make tree read-only: %m");
2989 }
2990
0de7acce 2991 r = mount_all(directory,
4f086aab 2992 arg_mount_settings,
0de7acce 2993 arg_uid_shift,
0de7acce 2994 arg_selinux_apifs_context);
03cfe0d5
LP
2995 if (r < 0)
2996 return r;
2997
07fa00f9
LP
2998 r = copy_devnodes(directory);
2999 if (r < 0)
03cfe0d5
LP
3000 return r;
3001
3002 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3003
07fa00f9
LP
3004 r = setup_pts(directory);
3005 if (r < 0)
03cfe0d5
LP
3006 return r;
3007
3008 r = setup_propagate(directory);
3009 if (r < 0)
3010 return r;
3011
3012 r = setup_dev_console(directory, console);
3013 if (r < 0)
3014 return r;
3015
8e5430c4
LP
3016 r = setup_keyring();
3017 if (r < 0)
3018 return r;
3019
960e4569 3020 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
3021 if (r < 0)
3022 return r;
3023
3024 r = setup_timezone(directory);
3025 if (r < 0)
3026 return r;
3027
3028 r = setup_resolv_conf(directory);
3029 if (r < 0)
3030 return r;
3031
e01ff70a
MS
3032 r = setup_machine_id(directory);
3033 if (r < 0)
3034 return r;
3035
03cfe0d5
LP
3036 r = setup_journal(directory);
3037 if (r < 0)
3038 return r;
3039
0de7acce
LP
3040 r = mount_custom(
3041 directory,
3042 arg_custom_mounts,
3043 arg_n_custom_mounts,
3044 arg_userns_mode != USER_NAMESPACE_NO,
3045 arg_uid_shift,
3046 arg_uid_range,
3047 arg_selinux_apifs_context);
03cfe0d5
LP
3048 if (r < 0)
3049 return r;
3050
5a8ff0e6 3051 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3052 r = mount_cgroups(
3053 directory,
3054 arg_unified_cgroup_hierarchy,
3055 arg_userns_mode != USER_NAMESPACE_NO,
3056 arg_uid_shift,
3057 arg_uid_range,
5a8ff0e6 3058 arg_selinux_apifs_context,
ada54120 3059 false);
0996ef00
CB
3060 if (r < 0)
3061 return r;
3062 }
03cfe0d5
LP
3063
3064 r = mount_move_root(directory);
3065 if (r < 0)
3066 return log_error_errno(r, "Failed to move root directory: %m");
3067
9c1e04d0
AP
3068 fd = setup_sd_notify_child();
3069 if (fd < 0)
3070 return fd;
3071
bf428efb
LP
3072 r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3073 if (r < 0)
3074 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3075
03cfe0d5 3076 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3077 arg_clone_ns_flags |
8869a0b4 3078 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3079 if (pid < 0)
3080 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3081 if (pid == 0) {
3082 pid_socket = safe_close(pid_socket);
e01ff70a 3083 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3084 notify_socket = safe_close(notify_socket);
825d5287 3085 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3086
3087 /* The inner child has all namespaces that are
3088 * requested, so that we all are owned by the user if
3089 * user namespaces are turned on. */
3090
d7bea6b6
DP
3091 if (arg_network_namespace_path) {
3092 r = namespace_enter(-1, -1, netns_fd, -1, -1);
3093 if (r < 0)
e2d39e54 3094 return log_error_errno(r, "Failed to join network namespace: %m");
d7bea6b6
DP
3095 }
3096
f757855e 3097 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3098 if (r < 0)
3099 _exit(EXIT_FAILURE);
3100
3101 _exit(EXIT_SUCCESS);
3102 }
3103
3104 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3105 if (l < 0)
3106 return log_error_errno(errno, "Failed to send PID: %m");
baaa35ad
ZJS
3107 if (l != sizeof(pid))
3108 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3109 "Short write while sending PID.");
03cfe0d5 3110
e01ff70a
MS
3111 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3112 if (l < 0)
3113 return log_error_errno(errno, "Failed to send machine ID: %m");
baaa35ad
ZJS
3114 if (l != sizeof(arg_uuid))
3115 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3116 "Short write while sending machine ID.");
e01ff70a 3117
9c1e04d0
AP
3118 l = send_one_fd(notify_socket, fd, 0);
3119 if (l < 0)
3120 return log_error_errno(errno, "Failed to send notify fd: %m");
3121
03cfe0d5 3122 pid_socket = safe_close(pid_socket);
e01ff70a 3123 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3124 notify_socket = safe_close(notify_socket);
327e26d6
KN
3125 kmsg_socket = safe_close(kmsg_socket);
3126 rtnl_socket = safe_close(rtnl_socket);
d7bea6b6 3127 netns_fd = safe_close(netns_fd);
03cfe0d5
LP
3128
3129 return 0;
3130}
3131
0e7ac751 3132static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
d381c8a6 3133 bool tried_hashed = false;
0e7ac751
LP
3134 unsigned n_tries = 100;
3135 uid_t candidate;
3136 int r;
3137
3138 assert(shift);
3139 assert(ret_lock_file);
0de7acce 3140 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3141 assert(arg_uid_range == 0x10000U);
3142
3143 candidate = *shift;
3144
3145 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3146
3147 for (;;) {
fbd0b64f 3148 char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
8e766630 3149 _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
0e7ac751
LP
3150
3151 if (--n_tries <= 0)
3152 return -EBUSY;
3153
87d5e4f2 3154 if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX)
0e7ac751
LP
3155 goto next;
3156 if ((candidate & UINT32_C(0xFFFF)) != 0)
3157 goto next;
3158
3159 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3160 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3161 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3162 goto next;
3163 if (r < 0)
3164 return r;
3165
3166 /* Make some superficial checks whether the range is currently known in the user database */
3167 if (getpwuid(candidate))
3168 goto next;
3169 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3170 goto next;
3171 if (getgrgid(candidate))
3172 goto next;
3173 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3174 goto next;
3175
3176 *ret_lock_file = lf;
3177 lf = (struct LockFile) LOCK_FILE_INIT;
3178 *shift = candidate;
3179 return 0;
3180
3181 next:
d381c8a6
LP
3182 if (arg_machine && !tried_hashed) {
3183 /* Try to hash the base from the container name */
3184
3185 static const uint8_t hash_key[] = {
3186 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
3187 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
3188 };
3189
3190 candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
3191
3192 tried_hashed = true;
3193 } else
3194 random_bytes(&candidate, sizeof(candidate));
3195
87d5e4f2 3196 candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
0e7ac751
LP
3197 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3198 }
3199}
3200
03cfe0d5 3201static int setup_uid_map(pid_t pid) {
fbd0b64f 3202 char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
03cfe0d5
LP
3203 int r;
3204
3205 assert(pid > 1);
3206
3207 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3208 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
57512c89 3209 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3210 if (r < 0)
3211 return log_error_errno(r, "Failed to write UID map: %m");
3212
3213 /* We always assign the same UID and GID ranges */
3214 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
57512c89 3215 r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
03cfe0d5
LP
3216 if (r < 0)
3217 return log_error_errno(r, "Failed to write GID map: %m");
3218
3219 return 0;
3220}
3221
9c1e04d0 3222static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3223 char buf[NOTIFY_BUFFER_MAX+1];
3224 char *p = NULL;
3225 struct iovec iovec = {
3226 .iov_base = buf,
3227 .iov_len = sizeof(buf)-1,
3228 };
3229 union {
3230 struct cmsghdr cmsghdr;
3231 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3232 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3233 } control = {};
3234 struct msghdr msghdr = {
3235 .msg_iov = &iovec,
3236 .msg_iovlen = 1,
3237 .msg_control = &control,
3238 .msg_controllen = sizeof(control),
3239 };
3240 struct cmsghdr *cmsg;
3241 struct ucred *ucred = NULL;
3242 ssize_t n;
3243 pid_t inner_child_pid;
3244 _cleanup_strv_free_ char **tags = NULL;
3245
3246 assert(userdata);
3247
3248 inner_child_pid = PTR_TO_PID(userdata);
3249
3250 if (revents != EPOLLIN) {
3251 log_warning("Got unexpected poll event for notify fd.");
3252 return 0;
3253 }
3254
3255 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3256 if (n < 0) {
3742095b 3257 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
3258 return 0;
3259
3260 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3261 }
3262 cmsg_close_all(&msghdr);
3263
3264 CMSG_FOREACH(cmsg, &msghdr) {
3265 if (cmsg->cmsg_level == SOL_SOCKET &&
3266 cmsg->cmsg_type == SCM_CREDENTIALS &&
3267 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3268
3269 ucred = (struct ucred*) CMSG_DATA(cmsg);
3270 }
3271 }
3272
3273 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 3274 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
3275 return 0;
3276 }
3277
3278 if ((size_t) n >= sizeof(buf)) {
3279 log_warning("Received notify message exceeded maximum size. Ignoring.");
3280 return 0;
3281 }
3282
3283 buf[n] = 0;
3284 tags = strv_split(buf, "\n\r");
3285 if (!tags)
3286 return log_oom();
3287
3288 if (strv_find(tags, "READY=1"))
3289 sd_notifyf(false, "READY=1\n");
3290
3291 p = strv_find_startswith(tags, "STATUS=");
3292 if (p)
3293 sd_notifyf(false, "STATUS=Container running: %s", p);
3294
3295 return 0;
3296}
3297
5773024d 3298static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 3299 int r;
9c1e04d0 3300
5773024d 3301 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
3302 if (r < 0)
3303 return log_error_errno(r, "Failed to allocate notify event source: %m");
3304
5773024d 3305 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
3306
3307 return 0;
3308}
3309
5d961407
LP
3310static int merge_settings(Settings *settings, const char *path) {
3311 int rl;
f757855e 3312
5d961407
LP
3313 assert(settings);
3314 assert(path);
f757855e 3315
5d961407
LP
3316 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
3317 * that this steals the fields of the Settings* structure, and hence modifies it. */
f757855e 3318
7732f92b
LP
3319 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3320 settings->start_mode >= 0) {
3321 arg_start_mode = settings->start_mode;
130d3d22 3322 strv_free_and_replace(arg_parameters, settings->parameters);
f757855e
LP
3323 }
3324
a2f577fc
JL
3325 if ((arg_settings_mask & SETTING_EPHEMERAL) == 0)
3326 arg_ephemeral = settings->ephemeral;
3327
b53ede69
PW
3328 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3329 settings->pivot_root_new) {
3330 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3331 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3332 }
3333
5f932eb9 3334 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
1cc6c93a
YW
3335 settings->working_directory)
3336 free_and_replace(arg_chdir, settings->working_directory);
5f932eb9 3337
f757855e 3338 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
130d3d22
YW
3339 settings->environment)
3340 strv_free_and_replace(arg_setenv, settings->environment);
f757855e
LP
3341
3342 if ((arg_settings_mask & SETTING_USER) == 0 &&
1cc6c93a
YW
3343 settings->user)
3344 free_and_replace(arg_user, settings->user);
f757855e
LP
3345
3346 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3347 uint64_t plus;
f757855e 3348
0e265674
LP
3349 plus = settings->capability;
3350 if (settings_private_network(settings))
3351 plus |= (1ULL << CAP_NET_ADMIN);
3352
3353 if (!arg_settings_trusted && plus != 0) {
3354 if (settings->capability != 0)
5d961407 3355 log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
0e265674 3356 } else
520e0d54 3357 arg_caps_retain |= plus;
f757855e 3358
520e0d54 3359 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3360 }
3361
3362 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3363 settings->kill_signal > 0)
3364 arg_kill_signal = settings->kill_signal;
3365
3366 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3367 settings->personality != PERSONALITY_INVALID)
3368 arg_personality = settings->personality;
3369
3370 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3371 !sd_id128_is_null(settings->machine_id)) {
3372
3373 if (!arg_settings_trusted)
5d961407 3374 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
f757855e
LP
3375 else
3376 arg_uuid = settings->machine_id;
3377 }
3378
3379 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3380 settings->read_only >= 0)
3381 arg_read_only = settings->read_only;
3382
3383 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3384 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3385 arg_volatile_mode = settings->volatile_mode;
3386
3387 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3388 settings->n_custom_mounts > 0) {
3389
3390 if (!arg_settings_trusted)
5d961407 3391 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
f757855e
LP
3392 else {
3393 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
1cc6c93a 3394 arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
f757855e 3395 arg_n_custom_mounts = settings->n_custom_mounts;
f757855e
LP
3396 settings->n_custom_mounts = 0;
3397 }
3398 }
3399
3400 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3401 (settings->private_network >= 0 ||
3402 settings->network_veth >= 0 ||
3403 settings->network_bridge ||
22b28dfd 3404 settings->network_zone ||
f757855e
LP
3405 settings->network_interfaces ||
3406 settings->network_macvlan ||
f6d6bad1
LP
3407 settings->network_ipvlan ||
3408 settings->network_veth_extra)) {
f757855e
LP
3409
3410 if (!arg_settings_trusted)
5d961407 3411 log_warning("Ignoring network settings, file %s is not trusted.", path);
f757855e 3412 else {
f6d6bad1 3413 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3414 arg_private_network = settings_private_network(settings);
3415
130d3d22
YW
3416 strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
3417 strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
3418 strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
3419 strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);
f6d6bad1 3420
1cc6c93a
YW
3421 free_and_replace(arg_network_bridge, settings->network_bridge);
3422 free_and_replace(arg_network_zone, settings->network_zone);
f757855e
LP
3423 }
3424 }
3425
3426 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3427 settings->expose_ports) {
3428
3429 if (!arg_settings_trusted)
5d961407 3430 log_warning("Ignoring Port= setting, file %s is not trusted.", path);
f757855e
LP
3431 else {
3432 expose_port_free_all(arg_expose_ports);
1cc6c93a 3433 arg_expose_ports = TAKE_PTR(settings->expose_ports);
f757855e
LP
3434 }
3435 }
3436
0de7acce
LP
3437 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3438 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3439
3440 if (!arg_settings_trusted)
5d961407 3441 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
0de7acce
LP
3442 else {
3443 arg_userns_mode = settings->userns_mode;
3444 arg_uid_shift = settings->uid_shift;
3445 arg_uid_range = settings->uid_range;
3446 arg_userns_chown = settings->userns_chown;
3447 }
3448 }
3449
9c1e04d0
AP
3450 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3451 arg_notify_ready = settings->notify_ready;
3452
960e4569
LP
3453 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3454
3455 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
5d961407 3456 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
960e4569 3457 else {
130d3d22
YW
3458 strv_free_and_replace(arg_syscall_whitelist, settings->syscall_whitelist);
3459 strv_free_and_replace(arg_syscall_blacklist, settings->syscall_blacklist);
960e4569
LP
3460 }
3461 }
3462
bf428efb
LP
3463 for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
3464 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
3465 continue;
3466
3467 if (!settings->rlimit[rl])
3468 continue;
3469
3470 if (!arg_settings_trusted) {
5d961407 3471 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
bf428efb
LP
3472 continue;
3473 }
3474
3475 free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
3476 }
3477
3a9530e5
LP
3478 if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
3479 settings->hostname)
3480 free_and_replace(arg_hostname, settings->hostname);
3481
66edd963
LP
3482 if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
3483 settings->no_new_privileges >= 0)
3484 arg_no_new_privileges = settings->no_new_privileges;
3485
81f345df
LP
3486 if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
3487 settings->oom_score_adjust_set) {
3488
3489 if (!arg_settings_trusted)
5d961407 3490 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
81f345df
LP
3491 else {
3492 arg_oom_score_adjust = settings->oom_score_adjust;
3493 arg_oom_score_adjust_set = true;
3494 }
3495 }
3496
d107bb7d
LP
3497 if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
3498 settings->cpuset) {
3499
3500 if (!arg_settings_trusted)
5d961407 3501 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
d107bb7d
LP
3502 else {
3503 if (arg_cpuset)
3504 CPU_FREE(arg_cpuset);
3505 arg_cpuset = TAKE_PTR(settings->cpuset);
3506 arg_cpuset_ncpus = settings->cpuset_ncpus;
3507 }
3508 }
3509
09d423e9
LP
3510 if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
3511 settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
3512 arg_resolv_conf = settings->resolv_conf;
3513
4e1d6aa9
LP
3514 if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
3515 settings->link_journal != _LINK_JOURNAL_INVALID) {
3516
3517 if (!arg_settings_trusted)
3518 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
3519 else {
3520 arg_link_journal = settings->link_journal;
3521 arg_link_journal_try = settings->link_journal_try;
3522 }
3523 }
3524
1688841f
LP
3525 if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
3526 settings->timezone != _TIMEZONE_MODE_INVALID)
3527 arg_timezone = settings->timezone;
3528
f757855e
LP
3529 return 0;
3530}
3531
5d961407
LP
3532static int load_settings(void) {
3533 _cleanup_(settings_freep) Settings *settings = NULL;
3534 _cleanup_fclose_ FILE *f = NULL;
3535 _cleanup_free_ char *p = NULL;
3536 const char *fn, *i;
3537 int r;
3538
3539 /* If all settings are masked, there's no point in looking for
3540 * the settings file */
3541 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3542 return 0;
3543
3544 fn = strjoina(arg_machine, ".nspawn");
3545
3546 /* We first look in the admin's directories in /etc and /run */
3547 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3548 _cleanup_free_ char *j = NULL;
3549
3550 j = strjoin(i, "/", fn);
3551 if (!j)
3552 return log_oom();
3553
3554 f = fopen(j, "re");
3555 if (f) {
3556 p = TAKE_PTR(j);
3557
3558 /* By default, we trust configuration from /etc and /run */
3559 if (arg_settings_trusted < 0)
3560 arg_settings_trusted = true;
3561
3562 break;
3563 }
3564
3565 if (errno != ENOENT)
3566 return log_error_errno(errno, "Failed to open %s: %m", j);
3567 }
3568
3569 if (!f) {
3570 /* After that, let's look for a file next to the
3571 * actual image we shall boot. */
3572
3573 if (arg_image) {
3574 p = file_in_same_dir(arg_image, fn);
3575 if (!p)
3576 return log_oom();
3577 } else if (arg_directory) {
3578 p = file_in_same_dir(arg_directory, fn);
3579 if (!p)
3580 return log_oom();
3581 }
3582
3583 if (p) {
3584 f = fopen(p, "re");
3585 if (!f && errno != ENOENT)
3586 return log_error_errno(errno, "Failed to open %s: %m", p);
3587
3588 /* By default, we do not trust configuration from /var/lib/machines */
3589 if (arg_settings_trusted < 0)
3590 arg_settings_trusted = false;
3591 }
3592 }
3593
3594 if (!f)
3595 return 0;
3596
3597 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3598
3599 r = settings_load(f, p, &settings);
3600 if (r < 0)
3601 return r;
3602
3603 return merge_settings(settings, p);
3604}
3605
b0067625
ZJS
3606static int run(int master,
3607 const char* console,
2d845785 3608 DissectedImage *dissected_image,
b0067625
ZJS
3609 bool interactive,
3610 bool secondary,
3611 FDSet *fds,
3612 char veth_name[IFNAMSIZ], bool *veth_created,
3613 union in_addr_union *exposed,
3614 pid_t *pid, int *ret) {
3615
3616 static const struct sigaction sa = {
3617 .sa_handler = nop_signal_handler,
e28c7cd0 3618 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3619 };
3620
8e766630 3621 _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
b0067625
ZJS
3622 _cleanup_close_ int etc_passwd_lock = -1;
3623 _cleanup_close_pair_ int
3624 kmsg_socket_pair[2] = { -1, -1 },
3625 rtnl_socket_pair[2] = { -1, -1 },
3626 pid_socket_pair[2] = { -1, -1 },
3627 uuid_socket_pair[2] = { -1, -1 },
3628 notify_socket_pair[2] = { -1, -1 },
8199d554
LP
3629 uid_shift_socket_pair[2] = { -1, -1 },
3630 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
3631
b0067625
ZJS
3632 _cleanup_close_ int notify_socket= -1;
3633 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3634 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3635 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3636 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3637 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
abdb9b08 3638 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
b0067625
ZJS
3639 ContainerStatus container_status = 0;
3640 char last_char = 0;
3641 int ifi = 0, r;
3642 ssize_t l;
3643 sigset_t mask_chld;
d7bea6b6 3644 _cleanup_close_ int netns_fd = -1;
b0067625
ZJS
3645
3646 assert_se(sigemptyset(&mask_chld) == 0);
3647 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3648
3649 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3650 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3651 * check with getpwuid() if the specific user already exists. Note that /etc might be
3652 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3653 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3654 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3655 * really ours. */
3656
3657 etc_passwd_lock = take_etc_passwd_lock(NULL);
3658 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3659 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3660 }
3661
3662 r = barrier_create(&barrier);
3663 if (r < 0)
3664 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3665
3666 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3667 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3668
3669 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3670 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3671
3672 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3673 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3674
3675 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3676 return log_error_errno(errno, "Failed to create id socket pair: %m");
3677
3678 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3679 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3680
3681 if (arg_userns_mode != USER_NAMESPACE_NO)
3682 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3683 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3684
8199d554
LP
3685 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
3686 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
3687 return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
3688
b0067625
ZJS
3689 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3690 * parent's blocking calls and give it a chance to call wait() and terminate. */
3691 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3692 if (r < 0)
3693 return log_error_errno(errno, "Failed to change the signal mask: %m");
3694
3695 r = sigaction(SIGCHLD, &sa, NULL);
3696 if (r < 0)
3697 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3698
d7bea6b6
DP
3699 if (arg_network_namespace_path) {
3700 netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
3701 if (netns_fd < 0)
3702 return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
3703
3704 r = fd_is_network_ns(netns_fd);
6619ad88
LP
3705 if (r == -EUCLEAN)
3706 log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
3707 else if (r < 0)
d7bea6b6 3708 return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
6619ad88
LP
3709 else if (r == 0) {
3710 log_error("Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
d7bea6b6
DP
3711 return -EINVAL;
3712 }
3713 }
3714
b0067625
ZJS
3715 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3716 if (*pid < 0)
3717 return log_error_errno(errno, "clone() failed%s: %m",
3718 errno == EINVAL ?
3719 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3720
3721 if (*pid == 0) {
3722 /* The outer child only has a file system namespace. */
3723 barrier_set_role(&barrier, BARRIER_CHILD);
3724
3725 master = safe_close(master);
3726
3727 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3728 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3729 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3730 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3731 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3732 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
8199d554 3733 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
b0067625
ZJS
3734
3735 (void) reset_all_signal_handlers();
3736 (void) reset_signal_mask();
3737
3738 r = outer_child(&barrier,
3739 arg_directory,
3740 console,
2d845785 3741 dissected_image,
b0067625
ZJS
3742 interactive,
3743 secondary,
3744 pid_socket_pair[1],
3745 uuid_socket_pair[1],
3746 notify_socket_pair[1],
3747 kmsg_socket_pair[1],
3748 rtnl_socket_pair[1],
3749 uid_shift_socket_pair[1],
8199d554 3750 unified_cgroup_hierarchy_socket_pair[1],
d7bea6b6
DP
3751 fds,
3752 netns_fd);
b0067625
ZJS
3753 if (r < 0)
3754 _exit(EXIT_FAILURE);
3755
3756 _exit(EXIT_SUCCESS);
3757 }
3758
3759 barrier_set_role(&barrier, BARRIER_PARENT);
3760
3761 fds = fdset_free(fds);
3762
3763 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3764 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3765 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3766 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3767 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3768 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
8199d554 3769 unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
b0067625
ZJS
3770
3771 if (arg_userns_mode != USER_NAMESPACE_NO) {
3772 /* The child just let us know the UID shift it might have read from the image. */
3773 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3774 if (l < 0)
3775 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3776 if (l != sizeof arg_uid_shift) {
3777 log_error("Short read while reading UID shift.");
3778 return -EIO;
3779 }
3780
3781 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3782 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3783 * image, but if that's already in use, pick a new one, and report back to the child,
3784 * which one we now picked. */
3785
3786 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3787 if (r < 0)
3788 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3789
3790 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3791 if (l < 0)
3792 return log_error_errno(errno, "Failed to send UID shift: %m");
3793 if (l != sizeof arg_uid_shift) {
3794 log_error("Short write while writing UID shift.");
3795 return -EIO;
3796 }
3797 }
3798 }
3799
8199d554
LP
3800 if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3801 /* The child let us know the support cgroup mode it might have read from the image. */
3802 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
3803 if (l < 0)
3804 return log_error_errno(errno, "Failed to read cgroup mode: %m");
3805 if (l != sizeof(arg_unified_cgroup_hierarchy)) {
bd897e72
ZJS
3806 log_error("Short read while reading cgroup mode (%zu bytes).%s",
3807 l, l == 0 ? " The child is most likely dead." : "");
8199d554
LP
3808 return -EIO;
3809 }
3810 }
3811
b0067625 3812 /* Wait for the outer child. */
d2e0ac3d
LP
3813 r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
3814 if (r < 0)
3815 return r;
3816 if (r != EXIT_SUCCESS)
3817 return -EIO;
b0067625
ZJS
3818
3819 /* And now retrieve the PID of the inner child. */
3820 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3821 if (l < 0)
3822 return log_error_errno(errno, "Failed to read inner child PID: %m");
3823 if (l != sizeof *pid) {
3824 log_error("Short read while reading inner child PID.");
3825 return -EIO;
3826 }
3827
3828 /* We also retrieve container UUID in case it was generated by outer child */
3829 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3830 if (l < 0)
3831 return log_error_errno(errno, "Failed to read container machine ID: %m");
3832 if (l != sizeof(arg_uuid)) {
3833 log_error("Short read while reading container machined ID.");
3834 return -EIO;
3835 }
3836
3837 /* We also retrieve the socket used for notifications generated by outer child */
3838 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3839 if (notify_socket < 0)
3840 return log_error_errno(notify_socket,
3841 "Failed to receive notification socket from the outer child: %m");
3842
3843 log_debug("Init process invoked as PID "PID_FMT, *pid);
3844
3845 if (arg_userns_mode != USER_NAMESPACE_NO) {
3846 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3847 log_error("Child died too early.");
3848 return -ESRCH;
3849 }
3850
3851 r = setup_uid_map(*pid);
3852 if (r < 0)
3853 return r;
3854
3855 (void) barrier_place(&barrier); /* #2 */
3856 }
3857
3858 if (arg_private_network) {
75116558
PS
3859 if (!arg_network_namespace_path) {
3860 /* Wait until the child has unshared its network namespace. */
3861 if (!barrier_place_and_sync(&barrier)) { /* #3 */
3862 log_error("Child died too early");
3863 return -ESRCH;
3864 }
3865 }
3866
b0067625
ZJS
3867 r = move_network_interfaces(*pid, arg_network_interfaces);
3868 if (r < 0)
3869 return r;
3870
3871 if (arg_network_veth) {
3872 r = setup_veth(arg_machine, *pid, veth_name,
3873 arg_network_bridge || arg_network_zone);
3874 if (r < 0)
3875 return r;
3876 else if (r > 0)
3877 ifi = r;
3878
3879 if (arg_network_bridge) {
3880 /* Add the interface to a bridge */
3881 r = setup_bridge(veth_name, arg_network_bridge, false);
3882 if (r < 0)
3883 return r;
3884 if (r > 0)
3885 ifi = r;
3886 } else if (arg_network_zone) {
3887 /* Add the interface to a bridge, possibly creating it */
3888 r = setup_bridge(veth_name, arg_network_zone, true);
3889 if (r < 0)
3890 return r;
3891 if (r > 0)
3892 ifi = r;
3893 }
3894 }
3895
3896 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3897 if (r < 0)
3898 return r;
3899
3900 /* We created the primary and extra veth links now; let's remember this, so that we know to
3901 remove them later on. Note that we don't bother with removing veth links that were created
3902 here when their setup failed half-way, because in that case the kernel should be able to
3903 remove them on its own, since they cannot be referenced by anything yet. */
3904 *veth_created = true;
3905
3906 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3907 if (r < 0)
3908 return r;
3909
3910 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3911 if (r < 0)
3912 return r;
3913 }
3914
abdb9b08
LP
3915 if (arg_register || !arg_keep_unit) {
3916 r = sd_bus_default_system(&bus);
3917 if (r < 0)
3918 return log_error_errno(r, "Failed to open system bus: %m");
e5a2d8b5
LP
3919
3920 r = sd_bus_set_close_on_exit(bus, false);
3921 if (r < 0)
3922 return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
abdb9b08
LP
3923 }
3924
3925 if (!arg_keep_unit) {
3926 /* When a new scope is created for this container, then we'll be registered as its controller, in which
3927 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
3928 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
3929
75152a4d
LP
3930 r = sd_bus_match_signal_async(
3931 bus,
3932 NULL,
3933 "org.freedesktop.systemd1",
3934 NULL,
3935 "org.freedesktop.systemd1.Scope",
3936 "RequestStop",
3937 on_request_stop, NULL, PID_TO_PTR(*pid));
abdb9b08 3938 if (r < 0)
75152a4d 3939 return log_error_errno(r, "Failed to request RequestStop match: %m");
abdb9b08
LP
3940 }
3941
b0067625
ZJS
3942 if (arg_register) {
3943 r = register_machine(
abdb9b08 3944 bus,
b0067625
ZJS
3945 arg_machine,
3946 *pid,
3947 arg_directory,
3948 arg_uuid,
3949 ifi,
3950 arg_slice,
3951 arg_custom_mounts, arg_n_custom_mounts,
3952 arg_kill_signal,
3953 arg_property,
3954 arg_keep_unit,
3955 arg_container_service_name);
3956 if (r < 0)
3957 return r;
abdb9b08 3958
cd2dfc6f
LP
3959 } else if (!arg_keep_unit) {
3960 r = allocate_scope(
abdb9b08 3961 bus,
cd2dfc6f
LP
3962 arg_machine,
3963 *pid,
3964 arg_slice,
3965 arg_custom_mounts, arg_n_custom_mounts,
3966 arg_kill_signal,
3967 arg_property);
3968 if (r < 0)
3969 return r;
3970
3971 } else if (arg_slice || arg_property)
3972 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3973
f0bef277 3974 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3975 if (r < 0)
3976 return r;
3977
720f0a2f
LP
3978 r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
3979 if (r < 0)
3980 return r;
b0067625 3981
de54e02d 3982 r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3983 if (r < 0)
3984 return r;
3985
3986 /* Notify the child that the parent is ready with all
3987 * its setup (including cgroup-ification), and that
3988 * the child can now hand over control to the code to
3989 * run inside the container. */
75116558 3990 (void) barrier_place(&barrier); /* #4 */
b0067625
ZJS
3991
3992 /* Block SIGCHLD here, before notifying child.
3993 * process_pty() will handle it with the other signals. */
3994 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3995
3996 /* Reset signal to default */
3997 r = default_signals(SIGCHLD, -1);
3998 if (r < 0)
3999 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
4000
4001 r = sd_event_new(&event);
4002 if (r < 0)
4003 return log_error_errno(r, "Failed to get default event source: %m");
4004
8fd010bb
LP
4005 (void) sd_event_set_watchdog(event, true);
4006
abdb9b08
LP
4007 if (bus) {
4008 r = sd_bus_attach_event(bus, event, 0);
4009 if (r < 0)
4010 return log_error_errno(r, "Failed to attach bus to event loop: %m");
4011 }
4012
5773024d 4013 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
4014 if (r < 0)
4015 return r;
4016
4017 /* Let the child know that we are ready and wait that the child is completely ready now. */
75116558 4018 if (!barrier_place_and_sync(&barrier)) { /* #5 */
b0067625
ZJS
4019 log_error("Child died too early.");
4020 return -ESRCH;
4021 }
4022
4023 /* At this point we have made use of the UID we picked, and thus nss-mymachines
4024 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
4025 etc_passwd_lock = safe_close(etc_passwd_lock);
4026
4027 sd_notifyf(false,
4028 "STATUS=Container running.\n"
4029 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
4030 if (!arg_notify_ready)
919f5ae0 4031 (void) sd_notify(false, "READY=1\n");
b0067625
ZJS
4032
4033 if (arg_kill_signal > 0) {
4034 /* Try to kill the init system on SIGINT or SIGTERM */
919f5ae0
LP
4035 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
4036 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
b0067625
ZJS
4037 } else {
4038 /* Immediately exit */
919f5ae0
LP
4039 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4040 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
b0067625
ZJS
4041 }
4042
6916b164 4043 /* Exit when the child exits */
919f5ae0 4044 (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
4045
4046 if (arg_expose_ports) {
4047 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
4048 if (r < 0)
4049 return r;
4050
4051 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
4052 }
4053
4054 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4055
4056 r = pty_forward_new(event, master,
4057 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
4058 &forward);
4059 if (r < 0)
4060 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4061
4062 r = sd_event_loop(event);
4063 if (r < 0)
4064 return log_error_errno(r, "Failed to run event loop: %m");
4065
4066 pty_forward_get_last_char(forward, &last_char);
4067
4068 forward = pty_forward_free(forward);
4069
4070 if (!arg_quiet && last_char != '\n')
4071 putc('\n', stdout);
4072
4073 /* Kill if it is not dead yet anyway */
1d78fea2
LP
4074 if (bus) {
4075 if (arg_register)
4076 terminate_machine(bus, arg_machine);
4077 else if (!arg_keep_unit)
4078 terminate_scope(bus, arg_machine);
4079 }
b0067625
ZJS
4080
4081 /* Normally redundant, but better safe than sorry */
c67b0082 4082 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4083
4084 r = wait_for_container(*pid, &container_status);
4085 *pid = 0;
4086
4087 if (r < 0)
4088 /* We failed to wait for the container, or the container exited abnormally. */
4089 return r;
4090 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4091 /* r > 0 → The container exited with a non-zero status.
4092 * As a special case, we need to replace 133 with a different value,
4093 * because 133 is special-cased in the service file to reboot the container.
4094 * otherwise → The container exited with zero status and a reboot was not requested.
4095 */
2a49b612 4096 if (r == EXIT_FORCE_RESTART)
27e29a1e 4097 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4098 *ret = r;
b0067625
ZJS
4099 return 0; /* finito */
4100 }
4101
4102 /* CONTAINER_REBOOTED, loop again */
4103
4104 if (arg_keep_unit) {
4105 /* Special handling if we are running as a service: instead of simply
4106 * restarting the machine we want to restart the entire service, so let's
4107 * inform systemd about this with the special exit code 133. The service
4108 * file uses RestartForceExitStatus=133 so that this results in a full
4109 * nspawn restart. This is necessary since we might have cgroup parameters
4110 * set we want to have flushed out. */
2a49b612
ZJS
4111 *ret = EXIT_FORCE_RESTART;
4112 return 0; /* finito */
b0067625
ZJS
4113 }
4114
4115 expose_port_flush(arg_expose_ports, exposed);
4116
4117 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4118 *veth_created = false;
4119 return 1; /* loop again */
4120}
4121
bf428efb 4122static int initialize_rlimits(void) {
bf428efb
LP
4123 /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
4124 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
4125 * container execution environments. */
4126
4127 static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
4128 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
4129 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
4130 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
4131 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
4132 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
4133 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
4134 [RLIMIT_MEMLOCK] = { 65536, 65536 },
4135 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
4136 [RLIMIT_NICE] = { 0, 0 },
4137 [RLIMIT_NOFILE] = { 1024, 4096 },
4138 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
4139 [RLIMIT_RTPRIO] = { 0, 0 },
4140 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
4141 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
4142
4143 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
4144 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
4145 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
4146 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
4147 * that PID 1 changes a number of other resource limits during early initialization which is why we
4148 * don't read the other limits from PID 1 but prefer the static table above. */
4149 };
4150
4151 int rl;
4152
4153 for (rl = 0; rl < _RLIMIT_MAX; rl++) {
bf428efb
LP
4154 /* Let's only fill in what the user hasn't explicitly configured anyway */
4155 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
4156 const struct rlimit *v;
4157 struct rlimit buffer;
4158
4159 if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
4160 /* For these two let's read the limits off PID 1. See above for an explanation. */
4161
4162 if (prlimit(1, rl, NULL, &buffer) < 0)
4163 return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
4164
4165 v = &buffer;
4166 } else
4167 v = kernel_defaults + rl;
4168
4169 arg_rlimit[rl] = newdup(struct rlimit, v, 1);
4170 if (!arg_rlimit[rl])
4171 return log_oom();
4172 }
4173
4174 if (DEBUG_LOGGING) {
4175 _cleanup_free_ char *k = NULL;
4176
4177 (void) rlimit_format(arg_rlimit[rl], &k);
4178 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
4179 }
4180 }
4181
4182 return 0;
4183}
4184
03cfe0d5 4185int main(int argc, char *argv[]) {
2d845785
LP
4186 _cleanup_free_ char *console = NULL;
4187 _cleanup_close_ int master = -1;
03cfe0d5 4188 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 4189 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 4190 char veth_name[IFNAMSIZ] = "";
17cbb288 4191 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4192 pid_t pid = 0;
03cfe0d5 4193 union in_addr_union exposed = {};
8e766630 4194 _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4195 bool interactive, veth_created = false, remove_tmprootdir = false;
4196 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 4197 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
4198 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
4199 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
4200
4201 log_parse_environment();
4202 log_open();
415fc41c 4203
7732f92b
LP
4204 /* Make sure rename_process() in the stub init process can work */
4205 saved_argv = argv;
4206 saved_argc = argc;
4207
03cfe0d5
LP
4208 r = parse_argv(argc, argv);
4209 if (r <= 0)
4210 goto finish;
4211
fba868fa
LP
4212 r = must_be_root();
4213 if (r < 0)
03cfe0d5 4214 goto finish;
fba868fa 4215
bf428efb
LP
4216 r = initialize_rlimits();
4217 if (r < 0)
4218 goto finish;
4219
f757855e
LP
4220 r = determine_names();
4221 if (r < 0)
4222 goto finish;
4223
4224 r = load_settings();
4225 if (r < 0)
4226 goto finish;
4227
4228 r = verify_arguments();
4229 if (r < 0)
4230 goto finish;
03cfe0d5 4231
8199d554
LP
4232 r = detect_unified_cgroup_hierarchy_from_environment();
4233 if (r < 0)
4234 goto finish;
4235
03cfe0d5
LP
4236 n_fd_passed = sd_listen_fds(false);
4237 if (n_fd_passed > 0) {
4238 r = fdset_new_listen_fds(&fds, false);
4239 if (r < 0) {
4240 log_error_errno(r, "Failed to collect file descriptors: %m");
4241 goto finish;
4242 }
4243 }
4244
83e803a9
ZJS
4245 /* The "default" umask. This is appropriate for most file and directory
4246 * operations performed by nspawn, and is the umask that will be used for
4247 * the child. Functions like copy_devnodes() change the umask temporarily. */
4248 umask(0022);
4249
03cfe0d5
LP
4250 if (arg_directory) {
4251 assert(!arg_image);
4252
4253 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4254 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4255 r = -EINVAL;
4256 goto finish;
4257 }
4258
4259 if (arg_ephemeral) {
4260 _cleanup_free_ char *np = NULL;
4261
8d4aa2bb 4262 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4263 if (r < 0)
4264 goto finish;
4265
03cfe0d5
LP
4266 /* If the specified path is a mount point we
4267 * generate the new snapshot immediately
4268 * inside it under a random name. However if
4269 * the specified is not a mount point we
4270 * create the new snapshot in the parent
4271 * directory, just next to it. */
e1873695 4272 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4273 if (r < 0) {
4274 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4275 goto finish;
4276 }
4277 if (r > 0)
770b5ce4 4278 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4279 else
770b5ce4 4280 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4281 if (r < 0) {
0f3be6ca 4282 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4283 goto finish;
4284 }
4285
4286 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4287 if (r < 0) {
4288 log_error_errno(r, "Failed to lock %s: %m", np);
4289 goto finish;
4290 }
4291
17cbb288
LP
4292 r = btrfs_subvol_snapshot(arg_directory, np,
4293 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4294 BTRFS_SNAPSHOT_FALLBACK_COPY |
4295 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4296 BTRFS_SNAPSHOT_RECURSIVE |
4297 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4298 if (r < 0) {
4299 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4300 goto finish;
ec16945e
LP
4301 }
4302
1cc6c93a 4303 free_and_replace(arg_directory, np);
ec16945e 4304
17cbb288 4305 remove_directory = true;
30535c16
LP
4306
4307 } else {
cb638b5e 4308 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4309 if (r < 0)
4310 goto finish;
4311
30535c16
LP
4312 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4313 if (r == -EBUSY) {
4314 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4315 goto finish;
4316 }
4317 if (r < 0) {
4318 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4319 goto finish;
30535c16
LP
4320 }
4321
4322 if (arg_template) {
8d4aa2bb 4323 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4324 if (r < 0)
4325 goto finish;
4326
17cbb288
LP
4327 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4328 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4329 BTRFS_SNAPSHOT_FALLBACK_COPY |
4330 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4331 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4332 BTRFS_SNAPSHOT_RECURSIVE |
4333 BTRFS_SNAPSHOT_QUOTA);
ff6c6cc1
LP
4334 if (r == -EEXIST)
4335 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4336 "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4337 else if (r < 0) {
83521414 4338 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16 4339 goto finish;
ff6c6cc1
LP
4340 } else
4341 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
4342 "Populated %s from template %s.", arg_directory, arg_template);
30535c16 4343 }
ec16945e
LP
4344 }
4345
7732f92b 4346 if (arg_start_mode == START_BOOT) {
a5201ed6 4347 const char *p;
c9fe05e0 4348
a5201ed6
LP
4349 if (arg_pivot_root_new)
4350 p = prefix_roota(arg_directory, arg_pivot_root_new);
4351 else
4352 p = arg_directory;
c9fe05e0
AR
4353
4354 if (path_is_os_tree(p) <= 0) {
4355 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
ec16945e 4356 r = -EINVAL;
1b9e5b12
LP
4357 goto finish;
4358 }
4359 } else {
c9fe05e0
AR
4360 const char *p, *q;
4361
a5201ed6
LP
4362 if (arg_pivot_root_new)
4363 p = prefix_roota(arg_directory, arg_pivot_root_new);
4364 else
4365 p = arg_directory;
c9fe05e0
AR
4366
4367 q = strjoina(p, "/usr/");
1b9e5b12 4368
c9fe05e0
AR
4369 if (laccess(q, F_OK) < 0) {
4370 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
ec16945e 4371 r = -EINVAL;
1b9e5b12 4372 goto finish;
1b9e5b12
LP
4373 }
4374 }
ec16945e 4375
6b9132a9 4376 } else {
ec16945e
LP
4377 assert(arg_image);
4378 assert(!arg_template);
4379
8d4aa2bb 4380 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4381 if (r < 0)
4382 goto finish;
4383
0f3be6ca
LP
4384 if (arg_ephemeral) {
4385 _cleanup_free_ char *np = NULL;
4386
4387 r = tempfn_random(arg_image, "machine.", &np);
4388 if (r < 0) {
4389 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4390 goto finish;
4391 }
4392
4393 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4394 if (r < 0) {
4395 r = log_error_errno(r, "Failed to create image lock: %m");
4396 goto finish;
4397 }
4398
1c876927 4399 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
4400 if (r < 0) {
4401 r = log_error_errno(r, "Failed to copy image file: %m");
4402 goto finish;
4403 }
4404
1cc6c93a 4405 free_and_replace(arg_image, np);
0f3be6ca
LP
4406
4407 remove_image = true;
4408 } else {
4409 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4410 if (r == -EBUSY) {
4411 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4412 goto finish;
4413 }
4414 if (r < 0) {
4415 r = log_error_errno(r, "Failed to create image lock: %m");
4416 goto finish;
4417 }
4623e8e6 4418
78ebe980
LP
4419 if (!arg_root_hash) {
4420 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
4421 if (r < 0) {
4422 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
4423 goto finish;
4424 }
4425 }
30535c16
LP
4426 }
4427
c67b0082 4428 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4429 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4430 goto finish;
1b9e5b12 4431 }
6b9132a9 4432
c67b0082
LP
4433 remove_tmprootdir = true;
4434
4435 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4436 if (!arg_directory) {
4437 r = log_oom();
4438 goto finish;
6b9132a9 4439 }
88213476 4440
2d845785
LP
4441 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
4442 if (r < 0) {
4443 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
4444 goto finish;
4445 }
1b9e5b12 4446
4526113f 4447 r = dissect_image_and_warn(
e0f9e7bd 4448 loop->fd,
4526113f 4449 arg_image,
e0f9e7bd
LP
4450 arg_root_hash, arg_root_hash_size,
4451 DISSECT_IMAGE_REQUIRE_ROOT,
4452 &dissected_image);
2d845785 4453 if (r == -ENOPKG) {
4526113f 4454 /* dissect_image_and_warn() already printed a brief error message. Expand on that with more details. */
2d845785
LP
4455 log_notice("Note that the disk image needs to\n"
4456 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
4457 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
4458 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
4459 " d) or contain a file system without a partition table\n"
4460 "in order to be bootable with systemd-nspawn.");
1b9e5b12 4461 goto finish;
2d845785 4462 }
4526113f 4463 if (r < 0)
842f3b0f 4464 goto finish;
1b9e5b12 4465
4623e8e6
LP
4466 if (!arg_root_hash && dissected_image->can_verity)
4467 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
4468
4469 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
4470 if (r < 0)
4471 goto finish;
0f3be6ca
LP
4472
4473 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4474 if (remove_image && unlink(arg_image) >= 0)
4475 remove_image = false;
842f3b0f 4476 }
842f3b0f 4477
86c0dd4a 4478 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4479 if (r < 0)
4480 goto finish;
4481
03cfe0d5
LP
4482 interactive =
4483 isatty(STDIN_FILENO) > 0 &&
4484 isatty(STDOUT_FILENO) > 0;
9c857b9d 4485
669fc4e5 4486 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
db7feb7e 4487 if (master < 0) {
ec16945e 4488 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4489 goto finish;
4490 }
4491
611b312b
LP
4492 r = ptsname_malloc(master, &console);
4493 if (r < 0) {
4494 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4495 goto finish;
68b02049
DW
4496 }
4497
4498 if (arg_selinux_apifs_context) {
4499 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4500 if (r < 0)
4501 goto finish;
a258bf26
LP
4502 }
4503
a258bf26 4504 if (unlockpt(master) < 0) {
ec16945e 4505 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4506 goto finish;
4507 }
4508
9c857b9d
LP
4509 if (!arg_quiet)
4510 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4511 arg_machine, arg_image ?: arg_directory);
4512
72c0a2c2 4513 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4514
66edd963 4515 if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
03cfe0d5
LP
4516 r = log_error_errno(errno, "Failed to become subreaper: %m");
4517 goto finish;
4518 }
4519
d87be9b0 4520 for (;;) {
b0067625
ZJS
4521 r = run(master,
4522 console,
2d845785 4523 dissected_image,
b0067625
ZJS
4524 interactive, secondary,
4525 fds,
4526 veth_name, &veth_created,
4527 &exposed,
4528 &pid, &ret);
4529 if (r <= 0)
d87be9b0 4530 break;
d87be9b0 4531 }
88213476
LP
4532
4533finish:
af4ec430 4534 sd_notify(false,
2a49b612
ZJS
4535 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4536 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4537
9444b1f2 4538 if (pid > 0)
c67b0082 4539 (void) kill(pid, SIGKILL);
88213476 4540
503546da 4541 /* Try to flush whatever is still queued in the pty */
6a0f896b 4542 if (master >= 0) {
1c876927 4543 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
4544 master = safe_close(master);
4545 }
4546
4547 if (pid > 0)
4548 (void) wait_for_terminate(pid, NULL);
503546da 4549
50ebcf6c
LP
4550 pager_close();
4551
17cbb288 4552 if (remove_directory && arg_directory) {
ec16945e
LP
4553 int k;
4554
17cbb288 4555 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4556 if (k < 0)
17cbb288 4557 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4558 }
4559
0f3be6ca
LP
4560 if (remove_image && arg_image) {
4561 if (unlink(arg_image) < 0)
4562 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4563 }
4564
c67b0082
LP
4565 if (remove_tmprootdir) {
4566 if (rmdir(tmprootdir) < 0)
4567 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4568 }
4569
785890ac
LP
4570 if (arg_machine) {
4571 const char *p;
4572
63c372cb 4573 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4574 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4575 }
4576
7a8f6325 4577 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4578
4579 if (veth_created)
4580 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4581 (void) remove_bridge(arg_network_zone);
f757855e 4582
04d391da 4583 free(arg_directory);
ec16945e
LP
4584 free(arg_template);
4585 free(arg_image);
7027ff61 4586 free(arg_machine);
3a9530e5 4587 free(arg_hostname);
c74e630d 4588 free(arg_user);
b53ede69
PW
4589 free(arg_pivot_root_new);
4590 free(arg_pivot_root_old);
5f932eb9 4591 free(arg_chdir);
c74e630d 4592 strv_free(arg_setenv);
f757855e 4593 free(arg_network_bridge);
c74e630d
LP
4594 strv_free(arg_network_interfaces);
4595 strv_free(arg_network_macvlan);
4bbfe7ad 4596 strv_free(arg_network_ipvlan);
f6d6bad1 4597 strv_free(arg_network_veth_extra);
f757855e 4598 strv_free(arg_parameters);
df1fac6d
LP
4599 free(arg_network_zone);
4600 free(arg_network_namespace_path);
4601 strv_free(arg_property);
f757855e
LP
4602 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4603 expose_port_free_all(arg_expose_ports);
4623e8e6 4604 free(arg_root_hash);
bf428efb 4605 rlimit_free_all(arg_rlimit);
df1fac6d
LP
4606 strv_free(arg_syscall_whitelist);
4607 strv_free(arg_syscall_blacklist);
d107bb7d 4608 arg_cpuset = cpu_set_mfree(arg_cpuset);
6d0b55c2 4609
ec16945e 4610 return r < 0 ? EXIT_FAILURE : ret;
88213476 4611}