]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
build-sys: use #if Y instead of #ifdef Y everywhere
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
349cc4a5 20#if HAVE_BLKID
6b5cf3ea 21#include <blkid.h>
8fe0087e 22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
349cc4a5 29#if HAVE_SELINUX
8fe0087e 30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
6916b164 41#include <sys/wait.h>
8fe0087e 42#include <unistd.h>
1b9e5b12 43
b053cd5f 44#include "sd-bus.h"
1f0cd86b 45#include "sd-daemon.h"
1f0cd86b 46#include "sd-id128.h"
8fe0087e 47
b5efdb8a 48#include "alloc-util.h"
8fe0087e
LP
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
b053cd5f 53#include "bus-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
2d845785 59#include "dissect-image.h"
8fe0087e 60#include "env-util.h"
3ffd4af2 61#include "fd-util.h"
842f3b0f 62#include "fdset.h"
a5c32cff 63#include "fileio.h"
f97b34a6 64#include "format-util.h"
f4f15635 65#include "fs-util.h"
1b9e5b12 66#include "gpt.h"
4623e8e6 67#include "hexdecoct.h"
8fe0087e 68#include "hostname-util.h"
910fd145 69#include "id128-util.h"
8fe0087e 70#include "log.h"
2d845785 71#include "loop-util.h"
8fe0087e 72#include "loopback-setup.h"
1b9cebf6 73#include "machine-image.h"
8fe0087e
LP
74#include "macro.h"
75#include "missing.h"
76#include "mkdir.h"
4349cd7c 77#include "mount-util.h"
8fe0087e 78#include "netlink-util.h"
07630cea
LP
79#include "nspawn-cgroup.h"
80#include "nspawn-expose-ports.h"
81#include "nspawn-mount.h"
82#include "nspawn-network.h"
7336138e 83#include "nspawn-patch-uid.h"
07630cea 84#include "nspawn-register.h"
910fd145 85#include "nspawn-seccomp.h"
07630cea
LP
86#include "nspawn-settings.h"
87#include "nspawn-setuid.h"
7732f92b 88#include "nspawn-stub-pid1.h"
6bedfcbb 89#include "parse-util.h"
8fe0087e 90#include "path-util.h"
0b452006 91#include "process-util.h"
8fe0087e
LP
92#include "ptyfwd.h"
93#include "random-util.h"
8869a0b4 94#include "raw-clone.h"
8fe0087e 95#include "rm-rf.h"
68b02049 96#include "selinux-util.h"
8fe0087e 97#include "signal-util.h"
2583fbea 98#include "socket-util.h"
8fcde012 99#include "stat-util.h"
15a5e950 100#include "stdio-util.h"
07630cea 101#include "string-util.h"
8fe0087e
LP
102#include "strv.h"
103#include "terminal-util.h"
104#include "udev-util.h"
affb60b1 105#include "umask-util.h"
b1d4f8e1 106#include "user-util.h"
8fe0087e 107#include "util.h"
e9642be2 108
0e7ac751 109/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
110 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
111 * may have their own allocation ranges too. */
0e7ac751
LP
112#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
113#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 114
9c1e04d0
AP
/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the
 * container. The init process in the container can send messages to nspawn over this socket, following the
 * sd_notify(3) protocol. */
118#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 119
2a49b612
ZJS
120#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* Status of a container as tracked by nspawn: terminated or rebooted
 * (meaning inferred from the enumerator names; the users are outside this excerpt). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
126
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= ("no", "auto", "host", "guest"). The "try-host" and
 * "try-guest" spellings map to LINK_HOST/LINK_GUEST and additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476
LP
133
134static char *arg_directory = NULL;
ec16945e 135static char *arg_template = NULL;
5f932eb9 136static char *arg_chdir = NULL;
b53ede69
PW
137static char *arg_pivot_root_new = NULL;
138static char *arg_pivot_root_old = NULL;
687d0825 139static char *arg_user = NULL;
9444b1f2 140static sd_id128_t arg_uuid = {};
7027ff61 141static char *arg_machine = NULL;
c74e630d
LP
142static const char *arg_selinux_context = NULL;
143static const char *arg_selinux_apifs_context = NULL;
9444b1f2 144static const char *arg_slice = NULL;
ff01d048 145static bool arg_private_network = false;
bc2f673e 146static bool arg_read_only = false;
7732f92b 147static StartMode arg_start_mode = START_PID1;
ec16945e 148static bool arg_ephemeral = false;
57fb9fb5 149static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 150static bool arg_link_journal_try = false;
520e0d54 151static uint64_t arg_caps_retain =
50b52222
LP
152 (1ULL << CAP_AUDIT_CONTROL) |
153 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
154 (1ULL << CAP_CHOWN) |
155 (1ULL << CAP_DAC_OVERRIDE) |
156 (1ULL << CAP_DAC_READ_SEARCH) |
157 (1ULL << CAP_FOWNER) |
158 (1ULL << CAP_FSETID) |
159 (1ULL << CAP_IPC_OWNER) |
160 (1ULL << CAP_KILL) |
161 (1ULL << CAP_LEASE) |
162 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 163 (1ULL << CAP_MKNOD) |
5076f0cc
LP
164 (1ULL << CAP_NET_BIND_SERVICE) |
165 (1ULL << CAP_NET_BROADCAST) |
166 (1ULL << CAP_NET_RAW) |
5076f0cc 167 (1ULL << CAP_SETFCAP) |
50b52222 168 (1ULL << CAP_SETGID) |
5076f0cc
LP
169 (1ULL << CAP_SETPCAP) |
170 (1ULL << CAP_SETUID) |
171 (1ULL << CAP_SYS_ADMIN) |
50b52222 172 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
173 (1ULL << CAP_SYS_CHROOT) |
174 (1ULL << CAP_SYS_NICE) |
175 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 176 (1ULL << CAP_SYS_RESOURCE) |
50b52222 177 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
178static CustomMount *arg_custom_mounts = NULL;
179static unsigned arg_n_custom_mounts = 0;
f4889f65 180static char **arg_setenv = NULL;
284c0b91 181static bool arg_quiet = false;
eb91eb18 182static bool arg_register = true;
89f7c846 183static bool arg_keep_unit = false;
aa28aefe 184static char **arg_network_interfaces = NULL;
c74e630d 185static char **arg_network_macvlan = NULL;
4bbfe7ad 186static char **arg_network_ipvlan = NULL;
69c79d3c 187static bool arg_network_veth = false;
f6d6bad1 188static char **arg_network_veth_extra = NULL;
f757855e 189static char *arg_network_bridge = NULL;
22b28dfd 190static char *arg_network_zone = NULL;
050f7277 191static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 192static char *arg_image = NULL;
f757855e 193static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 194static ExposePort *arg_expose_ports = NULL;
f36933fe 195static char **arg_property = NULL;
0de7acce 196static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 197static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 198static bool arg_userns_chown = false;
c6c8f6e2 199static int arg_kill_signal = 0;
5da38d07 200static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
201static SettingsMask arg_settings_mask = 0;
202static int arg_settings_trusted = -1;
203static char **arg_parameters = NULL;
6aadfa4c 204static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 205static bool arg_notify_ready = false;
5a8ff0e6 206static bool arg_use_cgns = true;
0c582db0 207static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 208static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
209static void *arg_root_hash = NULL;
210static size_t arg_root_hash_size = 0;
960e4569
LP
211static char **arg_syscall_whitelist = NULL;
212static char **arg_syscall_blacklist = NULL;
88213476 213
/* Prints the command line usage summary to stdout. The only substitution is the program's short
 * invocation name, inserted at the start of the synopsis line. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " --template=PATH Initialize root directory from template directory,\n"
               " if missing\n"
               " -x --ephemeral Run container with snapshot of root directory, and\n"
               " remove it after exit\n"
               " -i --image=PATH File system device or disk image for the container\n"
               " --root-hash=HASH Specify verity root hash\n"
               " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " --chdir=PATH Set working directory in the container\n"
               " --pivot-root=PATH[:PATH]\n"
               " Pivot root to given directory in the container\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --property=NAME=VALUE Set scope unit property\n"
               " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
               " --private-users[=UIDBASE[:NUIDS]]\n"
               " Similar, but with user configured UID/GID range\n"
               " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-ipvlan=INTERFACE\n"
               /* Fixed: was "Create a ipvlan" */
               " Create an ipvlan network interface based on an\n"
               " existing network interface to the container\n"
               " -n --network-veth Add a virtual Ethernet connection between host\n"
               " and container\n"
               " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               " Add an additional virtual Ethernet link between\n"
               " host and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual Ethernet connection to the container\n"
               " and attach it to an existing bridge on the host\n"
               " --network-zone=NAME Similar, but attach the new interface to an\n"
               /* Fixed: the continuation line repeated the article ("to an" / "an automatically") */
               " automatically managed bridge interface\n"
               " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               " Expose a container IP port on the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --system-call-filter=LIST|~LIST\n"
               " Permit/prohibit specific system calls\n"
               " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
               " host, try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH[:OPTIONS]]\n"
               " Bind mount a file or directory from the host into\n"
               " the container\n"
               /* Fixed: the closing bracket of the optional part was missing (compare --bind= above) */
               " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               " Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --overlay=PATH[:PATH...]:PATH\n"
               " Create an overlay mount from the host to \n"
               " the container\n"
               " --overlay-ro=PATH[:PATH...]:PATH\n"
               " Similar, but creates a read-only overlay mount\n"
               " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " --volatile[=MODE] Run the system in volatile mode\n"
               " --settings=BOOLEAN Load additional settings from .nspawn file\n"
               " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
               , program_invocation_short_name);
}
299
86c0dd4a 300static int custom_mount_check_all(void) {
5a8af538 301 unsigned i;
5a8af538 302
5a8af538
LP
303 for (i = 0; i < arg_n_custom_mounts; i++) {
304 CustomMount *m = &arg_custom_mounts[i];
305
0de7acce 306 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
307
308 if (arg_userns_chown) {
309 log_error("--private-users-chown may not be combined with custom root mounts.");
310 return -EINVAL;
311 } else if (arg_uid_shift == UID_INVALID) {
312 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
313 return -EINVAL;
314 }
825d5287 315 }
5a8af538
LP
316 }
317
318 return 0;
319}
320
0fd9563f 321static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 322 const char *e;
415fc41c 323 int r;
5da38d07 324
efdb0237
LP
325 /* Allow the user to control whether the unified hierarchy is used */
326 e = getenv("UNIFIED_CGROUP_HIERARCHY");
327 if (e) {
328 r = parse_boolean(e);
329 if (r < 0)
330 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
331 if (r > 0)
332 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
333 else
334 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 335
efdb0237
LP
336 return 0;
337 }
338
339 /* Otherwise inherit the default from the host system */
b4cccbc1
LP
340 r = cg_all_unified();
341 if (r < 0)
342 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
343 if (r > 0) {
a8725a06
ZJS
344 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
345 * routine only detects 231, so we'll have a false negative here for 230. */
346 r = systemd_installation_has_version(directory, 230);
347 if (r < 0)
348 return log_error_errno(r, "Failed to determine systemd version in container: %m");
349 if (r > 0)
350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
351 else
352 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 353 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
354 /* Mixed cgroup hierarchy support was added in 233 */
355 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
356 if (r < 0)
357 return log_error_errno(r, "Failed to determine systemd version in container: %m");
358 if (r > 0)
359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
360 else
361 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
362 } else
5da38d07 363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 364
efdb0237
LP
365 return 0;
366}
367
0c582db0
LB
368static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
369 int r;
370
371 r = getenv_bool(name);
372 if (r == -ENXIO)
373 return;
374 if (r < 0)
375 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
376 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
377}
378
4f086aab
SU
379static void parse_mount_settings_env(void) {
380 int r;
381 const char *e;
382
383 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
384 if (!e)
385 return;
386
387 if (streq(e, "network")) {
388 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
389 return;
390 }
391
392 r = parse_boolean(e);
393 if (r < 0) {
394 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
395 return;
ab8ee0f2 396 }
4f086aab 397
ab8ee0f2
ZJS
398 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
399 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
400}
401
88213476
LP
402static int parse_argv(int argc, char *argv[]) {
403
a41fe3a2 404 enum {
acbeb427
ZJS
405 ARG_VERSION = 0x100,
406 ARG_PRIVATE_NETWORK,
bc2f673e 407 ARG_UUID,
5076f0cc 408 ARG_READ_ONLY,
57fb9fb5 409 ARG_CAPABILITY,
420c7379 410 ARG_DROP_CAPABILITY,
17fe0523
LP
411 ARG_LINK_JOURNAL,
412 ARG_BIND,
f4889f65 413 ARG_BIND_RO,
06c17c39 414 ARG_TMPFS,
5a8af538
LP
415 ARG_OVERLAY,
416 ARG_OVERLAY_RO,
eb91eb18 417 ARG_SHARE_SYSTEM,
89f7c846 418 ARG_REGISTER,
aa28aefe 419 ARG_KEEP_UNIT,
69c79d3c 420 ARG_NETWORK_INTERFACE,
c74e630d 421 ARG_NETWORK_MACVLAN,
4bbfe7ad 422 ARG_NETWORK_IPVLAN,
ab046dde 423 ARG_NETWORK_BRIDGE,
22b28dfd 424 ARG_NETWORK_ZONE,
f6d6bad1 425 ARG_NETWORK_VETH_EXTRA,
6afc95b7 426 ARG_PERSONALITY,
4d9f07b4 427 ARG_VOLATILE,
ec16945e 428 ARG_TEMPLATE,
f36933fe 429 ARG_PROPERTY,
6dac160c 430 ARG_PRIVATE_USERS,
c6c8f6e2 431 ARG_KILL_SIGNAL,
f757855e 432 ARG_SETTINGS,
5f932eb9 433 ARG_CHDIR,
b53ede69 434 ARG_PIVOT_ROOT,
7336138e 435 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 436 ARG_NOTIFY_READY,
4623e8e6 437 ARG_ROOT_HASH,
960e4569 438 ARG_SYSTEM_CALL_FILTER,
a41fe3a2
LP
439 };
440
88213476 441 static const struct option options[] = {
27eb8e90
ZJS
442 { "help", no_argument, NULL, 'h' },
443 { "version", no_argument, NULL, ARG_VERSION },
444 { "directory", required_argument, NULL, 'D' },
445 { "template", required_argument, NULL, ARG_TEMPLATE },
446 { "ephemeral", no_argument, NULL, 'x' },
447 { "user", required_argument, NULL, 'u' },
448 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
449 { "as-pid2", no_argument, NULL, 'a' },
450 { "boot", no_argument, NULL, 'b' },
451 { "uuid", required_argument, NULL, ARG_UUID },
452 { "read-only", no_argument, NULL, ARG_READ_ONLY },
453 { "capability", required_argument, NULL, ARG_CAPABILITY },
454 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
455 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
456 { "bind", required_argument, NULL, ARG_BIND },
457 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
458 { "tmpfs", required_argument, NULL, ARG_TMPFS },
459 { "overlay", required_argument, NULL, ARG_OVERLAY },
460 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
461 { "machine", required_argument, NULL, 'M' },
462 { "slice", required_argument, NULL, 'S' },
463 { "setenv", required_argument, NULL, 'E' },
464 { "selinux-context", required_argument, NULL, 'Z' },
465 { "selinux-apifs-context", required_argument, NULL, 'L' },
466 { "quiet", no_argument, NULL, 'q' },
467 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
468 { "register", required_argument, NULL, ARG_REGISTER },
469 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
470 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
471 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
472 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
473 { "network-veth", no_argument, NULL, 'n' },
474 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
475 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
476 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
477 { "personality", required_argument, NULL, ARG_PERSONALITY },
478 { "image", required_argument, NULL, 'i' },
479 { "volatile", optional_argument, NULL, ARG_VOLATILE },
480 { "port", required_argument, NULL, 'p' },
481 { "property", required_argument, NULL, ARG_PROPERTY },
482 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
483 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
484 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
485 { "settings", required_argument, NULL, ARG_SETTINGS },
486 { "chdir", required_argument, NULL, ARG_CHDIR },
b53ede69 487 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
27eb8e90 488 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
4623e8e6 489 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
960e4569 490 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
eb9da376 491 {}
88213476
LP
492 };
493
9444b1f2 494 int c, r;
6aadfa4c 495 const char *p, *e;
a42c8b54 496 uint64_t plus = 0, minus = 0;
f757855e 497 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
498
499 assert(argc >= 0);
500 assert(argv);
501
2e1f244e 502 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
503
504 switch (c) {
505
506 case 'h':
601185b4
ZJS
507 help();
508 return 0;
88213476 509
acbeb427 510 case ARG_VERSION:
3f6fd1ba 511 return version();
acbeb427 512
88213476 513 case 'D':
0f03c2a4 514 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 515 if (r < 0)
0f03c2a4 516 return r;
ec16945e
LP
517 break;
518
519 case ARG_TEMPLATE:
0f03c2a4 520 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 521 if (r < 0)
0f03c2a4 522 return r;
88213476
LP
523 break;
524
1b9e5b12 525 case 'i':
0f03c2a4 526 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 527 if (r < 0)
0f03c2a4 528 return r;
ec16945e
LP
529 break;
530
531 case 'x':
532 arg_ephemeral = true;
1b9e5b12
LP
533 break;
534
687d0825 535 case 'u':
2fc09a9c
DM
536 r = free_and_strdup(&arg_user, optarg);
537 if (r < 0)
7027ff61 538 return log_oom();
687d0825 539
f757855e 540 arg_settings_mask |= SETTING_USER;
687d0825
MV
541 break;
542
22b28dfd
LP
543 case ARG_NETWORK_ZONE: {
544 char *j;
545
546 j = strappend("vz-", optarg);
547 if (!j)
548 return log_oom();
549
550 if (!ifname_valid(j)) {
551 log_error("Network zone name not valid: %s", j);
552 free(j);
553 return -EINVAL;
554 }
555
556 free(arg_network_zone);
557 arg_network_zone = j;
558
559 arg_network_veth = true;
560 arg_private_network = true;
561 arg_settings_mask |= SETTING_NETWORK;
562 break;
563 }
564
ab046dde 565 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
566
567 if (!ifname_valid(optarg)) {
568 log_error("Bridge interface name not valid: %s", optarg);
569 return -EINVAL;
570 }
571
f757855e
LP
572 r = free_and_strdup(&arg_network_bridge, optarg);
573 if (r < 0)
574 return log_oom();
ab046dde
TG
575
576 /* fall through */
577
0dfaa006 578 case 'n':
69c79d3c
LP
579 arg_network_veth = true;
580 arg_private_network = true;
f757855e 581 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
582 break;
583
f6d6bad1
LP
584 case ARG_NETWORK_VETH_EXTRA:
585 r = veth_extra_parse(&arg_network_veth_extra, optarg);
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
588
589 arg_private_network = true;
590 arg_settings_mask |= SETTING_NETWORK;
591 break;
592
aa28aefe 593 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
594
595 if (!ifname_valid(optarg)) {
596 log_error("Network interface name not valid: %s", optarg);
597 return -EINVAL;
598 }
599
c74e630d
LP
600 if (strv_extend(&arg_network_interfaces, optarg) < 0)
601 return log_oom();
602
603 arg_private_network = true;
f757855e 604 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
605 break;
606
607 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
608
609 if (!ifname_valid(optarg)) {
610 log_error("MACVLAN network interface name not valid: %s", optarg);
611 return -EINVAL;
612 }
613
c74e630d 614 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
615 return log_oom();
616
4bbfe7ad 617 arg_private_network = true;
f757855e 618 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
619 break;
620
621 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
622
623 if (!ifname_valid(optarg)) {
624 log_error("IPVLAN network interface name not valid: %s", optarg);
625 return -EINVAL;
626 }
627
4bbfe7ad
TG
628 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
629 return log_oom();
630
aa28aefe
LP
631 /* fall through */
632
ff01d048
LP
633 case ARG_PRIVATE_NETWORK:
634 arg_private_network = true;
f757855e 635 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
636 break;
637
0f0dbc46 638 case 'b':
7732f92b
LP
639 if (arg_start_mode == START_PID2) {
640 log_error("--boot and --as-pid2 may not be combined.");
641 return -EINVAL;
642 }
643
644 arg_start_mode = START_BOOT;
645 arg_settings_mask |= SETTING_START_MODE;
646 break;
647
648 case 'a':
649 if (arg_start_mode == START_BOOT) {
650 log_error("--boot and --as-pid2 may not be combined.");
651 return -EINVAL;
652 }
653
654 arg_start_mode = START_PID2;
655 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
656 break;
657
144f0fc0 658 case ARG_UUID:
9444b1f2 659 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
660 if (r < 0)
661 return log_error_errno(r, "Invalid UUID: %s", optarg);
662
663 if (sd_id128_is_null(arg_uuid)) {
664 log_error("Machine UUID may not be all zeroes.");
665 return -EINVAL;
aa96c6cb 666 }
f757855e
LP
667
668 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 669 break;
aa96c6cb 670
9444b1f2 671 case 'S':
c74e630d 672 arg_slice = optarg;
144f0fc0
LP
673 break;
674
7027ff61 675 case 'M':
c1521918 676 if (isempty(optarg))
97b11eed 677 arg_machine = mfree(arg_machine);
c1521918 678 else {
0c3c4284 679 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
680 log_error("Invalid machine name: %s", optarg);
681 return -EINVAL;
682 }
7027ff61 683
0c3c4284
LP
684 r = free_and_strdup(&arg_machine, optarg);
685 if (r < 0)
eb91eb18 686 return log_oom();
eb91eb18 687 }
9ce6d1b3 688 break;
7027ff61 689
82adf6af
LP
690 case 'Z':
691 arg_selinux_context = optarg;
a8828ed9
DW
692 break;
693
82adf6af
LP
694 case 'L':
695 arg_selinux_apifs_context = optarg;
a8828ed9
DW
696 break;
697
bc2f673e
LP
698 case ARG_READ_ONLY:
699 arg_read_only = true;
f757855e 700 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
701 break;
702
420c7379
LP
703 case ARG_CAPABILITY:
704 case ARG_DROP_CAPABILITY: {
6cbe4ed1 705 p = optarg;
9ed794a3 706 for (;;) {
6cbe4ed1 707 _cleanup_free_ char *t = NULL;
5076f0cc 708
6cbe4ed1
SS
709 r = extract_first_word(&p, &t, ",", 0);
710 if (r < 0)
711 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 712
6cbe4ed1
SS
713 if (r == 0)
714 break;
5076f0cc 715
39ed67d1
LP
716 if (streq(t, "all")) {
717 if (c == ARG_CAPABILITY)
a42c8b54 718 plus = (uint64_t) -1;
39ed67d1 719 else
a42c8b54 720 minus = (uint64_t) -1;
39ed67d1 721 } else {
2822da4f
LP
722 int cap;
723
724 cap = capability_from_name(t);
725 if (cap < 0) {
39ed67d1
LP
726 log_error("Failed to parse capability %s.", t);
727 return -EINVAL;
728 }
729
730 if (c == ARG_CAPABILITY)
a42c8b54 731 plus |= 1ULL << (uint64_t) cap;
39ed67d1 732 else
a42c8b54 733 minus |= 1ULL << (uint64_t) cap;
5076f0cc 734 }
5076f0cc
LP
735 }
736
f757855e 737 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
738 break;
739 }
740
57fb9fb5
LP
741 case 'j':
742 arg_link_journal = LINK_GUEST;
574edc90 743 arg_link_journal_try = true;
57fb9fb5
LP
744 break;
745
746 case ARG_LINK_JOURNAL:
53e438e3 747 if (streq(optarg, "auto")) {
57fb9fb5 748 arg_link_journal = LINK_AUTO;
53e438e3
LP
749 arg_link_journal_try = false;
750 } else if (streq(optarg, "no")) {
57fb9fb5 751 arg_link_journal = LINK_NO;
53e438e3
LP
752 arg_link_journal_try = false;
753 } else if (streq(optarg, "guest")) {
57fb9fb5 754 arg_link_journal = LINK_GUEST;
53e438e3
LP
755 arg_link_journal_try = false;
756 } else if (streq(optarg, "host")) {
57fb9fb5 757 arg_link_journal = LINK_HOST;
53e438e3
LP
758 arg_link_journal_try = false;
759 } else if (streq(optarg, "try-guest")) {
574edc90
MP
760 arg_link_journal = LINK_GUEST;
761 arg_link_journal_try = true;
762 } else if (streq(optarg, "try-host")) {
763 arg_link_journal = LINK_HOST;
764 arg_link_journal_try = true;
765 } else {
57fb9fb5
LP
766 log_error("Failed to parse link journal mode %s", optarg);
767 return -EINVAL;
768 }
769
770 break;
771
17fe0523 772 case ARG_BIND:
f757855e
LP
773 case ARG_BIND_RO:
774 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
775 if (r < 0)
776 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 777
f757855e 778 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 779 break;
06c17c39 780
f757855e
LP
781 case ARG_TMPFS:
782 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
783 if (r < 0)
784 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 785
f757855e 786 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 787 break;
5a8af538
LP
788
789 case ARG_OVERLAY:
ad85779a
LP
790 case ARG_OVERLAY_RO:
791 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
792 if (r == -EADDRNOTAVAIL)
793 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
794 if (r < 0)
795 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 796
f757855e 797 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 798 break;
06c17c39 799
a5f1cb3b 800 case 'E': {
f4889f65
LP
801 char **n;
802
803 if (!env_assignment_is_valid(optarg)) {
804 log_error("Environment variable assignment '%s' is not valid.", optarg);
805 return -EINVAL;
806 }
807
808 n = strv_env_set(arg_setenv, optarg);
809 if (!n)
810 return log_oom();
811
812 strv_free(arg_setenv);
813 arg_setenv = n;
f757855e
LP
814
815 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
816 break;
817 }
818
284c0b91
LP
819 case 'q':
820 arg_quiet = true;
821 break;
822
8a96d94e 823 case ARG_SHARE_SYSTEM:
a6b5216c 824 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
825 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
826 arg_clone_ns_flags = 0;
8a96d94e
LP
827 break;
828
eb91eb18
LP
829 case ARG_REGISTER:
830 r = parse_boolean(optarg);
831 if (r < 0) {
832 log_error("Failed to parse --register= argument: %s", optarg);
833 return r;
834 }
835
836 arg_register = r;
837 break;
838
89f7c846
LP
839 case ARG_KEEP_UNIT:
840 arg_keep_unit = true;
841 break;
842
6afc95b7
LP
843 case ARG_PERSONALITY:
844
ac45f971 845 arg_personality = personality_from_string(optarg);
050f7277 846 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
847 log_error("Unknown or unsupported personality '%s'.", optarg);
848 return -EINVAL;
849 }
850
f757855e 851 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
852 break;
853
4d9f07b4
LP
854 case ARG_VOLATILE:
855
856 if (!optarg)
f757855e 857 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 858 else {
f757855e 859 VolatileMode m;
4d9f07b4 860
f757855e
LP
861 m = volatile_mode_from_string(optarg);
862 if (m < 0) {
863 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 864 return -EINVAL;
f757855e
LP
865 } else
866 arg_volatile_mode = m;
6d0b55c2
LP
867 }
868
f757855e
LP
869 arg_settings_mask |= SETTING_VOLATILE_MODE;
870 break;
6d0b55c2 871
f757855e
LP
872 case 'p':
873 r = expose_port_parse(&arg_expose_ports, optarg);
874 if (r == -EEXIST)
875 return log_error_errno(r, "Duplicate port specification: %s", optarg);
876 if (r < 0)
877 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 878
f757855e 879 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 880 break;
6d0b55c2 881
f36933fe
LP
882 case ARG_PROPERTY:
883 if (strv_extend(&arg_property, optarg) < 0)
884 return log_oom();
885
886 break;
887
ae209204
ZJS
888 case ARG_PRIVATE_USERS: {
889 int boolean = -1;
0de7acce 890
ae209204
ZJS
891 if (!optarg)
892 boolean = true;
893 else if (!in_charset(optarg, DIGITS))
894 /* do *not* parse numbers as booleans */
895 boolean = parse_boolean(optarg);
896
897 if (boolean == false) {
0de7acce
LP
898 /* no: User namespacing off */
899 arg_userns_mode = USER_NAMESPACE_NO;
900 arg_uid_shift = UID_INVALID;
901 arg_uid_range = UINT32_C(0x10000);
ae209204 902 } else if (boolean == true) {
0de7acce
LP
903 /* yes: User namespacing on, UID range is read from root dir */
904 arg_userns_mode = USER_NAMESPACE_FIXED;
905 arg_uid_shift = UID_INVALID;
906 arg_uid_range = UINT32_C(0x10000);
907 } else if (streq(optarg, "pick")) {
908 /* pick: User namespacing on, UID range is picked randomly */
909 arg_userns_mode = USER_NAMESPACE_PICK;
910 arg_uid_shift = UID_INVALID;
911 arg_uid_range = UINT32_C(0x10000);
912 } else {
6c2058b3 913 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
914 const char *range, *shift;
915
0de7acce
LP
916 /* anything else: User namespacing on, UID range is explicitly configured */
917
6dac160c
LP
918 range = strchr(optarg, ':');
919 if (range) {
6c2058b3
ZJS
920 buffer = strndup(optarg, range - optarg);
921 if (!buffer)
922 return log_oom();
923 shift = buffer;
6dac160c
LP
924
925 range++;
bfd292ec
ZJS
926 r = safe_atou32(range, &arg_uid_range);
927 if (r < 0)
be715731 928 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
929 } else
930 shift = optarg;
931
be715731
ZJS
932 r = parse_uid(shift, &arg_uid_shift);
933 if (r < 0)
934 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
935
936 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
937 }
938
be715731
ZJS
939 if (arg_uid_range <= 0) {
940 log_error("UID range cannot be 0.");
941 return -EINVAL;
942 }
943
0de7acce 944 arg_settings_mask |= SETTING_USERNS;
6dac160c 945 break;
ae209204 946 }
6dac160c 947
0de7acce 948 case 'U':
ccabee0d
LP
949 if (userns_supported()) {
950 arg_userns_mode = USER_NAMESPACE_PICK;
951 arg_uid_shift = UID_INVALID;
952 arg_uid_range = UINT32_C(0x10000);
953
954 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
955 }
956
7336138e
LP
957 break;
958
0de7acce 959 case ARG_PRIVATE_USERS_CHOWN:
19aac838 960 arg_userns_chown = true;
0de7acce
LP
961
962 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
963 break;
964
c6c8f6e2
LP
965 case ARG_KILL_SIGNAL:
966 arg_kill_signal = signal_from_string_try_harder(optarg);
967 if (arg_kill_signal < 0) {
968 log_error("Cannot parse signal: %s", optarg);
969 return -EINVAL;
970 }
971
f757855e
LP
972 arg_settings_mask |= SETTING_KILL_SIGNAL;
973 break;
974
975 case ARG_SETTINGS:
976
977 /* no → do not read files
978 * yes → read files, do not override cmdline, trust only subset
979 * override → read files, override cmdline, trust only subset
980 * trusted → read files, do not override cmdline, trust all
981 */
982
983 r = parse_boolean(optarg);
984 if (r < 0) {
985 if (streq(optarg, "trusted")) {
986 mask_all_settings = false;
987 mask_no_settings = false;
988 arg_settings_trusted = true;
989
990 } else if (streq(optarg, "override")) {
991 mask_all_settings = false;
992 mask_no_settings = true;
993 arg_settings_trusted = -1;
994 } else
995 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
996 } else if (r > 0) {
997 /* yes */
998 mask_all_settings = false;
999 mask_no_settings = false;
1000 arg_settings_trusted = -1;
1001 } else {
1002 /* no */
1003 mask_all_settings = true;
1004 mask_no_settings = false;
1005 arg_settings_trusted = false;
1006 }
1007
c6c8f6e2
LP
1008 break;
1009
5f932eb9
LP
1010 case ARG_CHDIR:
1011 if (!path_is_absolute(optarg)) {
1012 log_error("Working directory %s is not an absolute path.", optarg);
1013 return -EINVAL;
1014 }
1015
1016 r = free_and_strdup(&arg_chdir, optarg);
1017 if (r < 0)
1018 return log_oom();
1019
1020 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1021 break;
1022
b53ede69
PW
1023 case ARG_PIVOT_ROOT:
1024 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1025 if (r < 0)
1026 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1027
1028 arg_settings_mask |= SETTING_PIVOT_ROOT;
1029 break;
1030
9c1e04d0
AP
1031 case ARG_NOTIFY_READY:
1032 r = parse_boolean(optarg);
1033 if (r < 0) {
1034 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1035 return -EINVAL;
1036 }
1037 arg_notify_ready = r;
1038 arg_settings_mask |= SETTING_NOTIFY_READY;
1039 break;
1040
4623e8e6
LP
1041 case ARG_ROOT_HASH: {
1042 void *k;
1043 size_t l;
1044
1045 r = unhexmem(optarg, strlen(optarg), &k, &l);
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1048 if (l < sizeof(sd_id128_t)) {
1049 log_error("Root hash must be at least 128bit long: %s", optarg);
1050 free(k);
1051 return -EINVAL;
1052 }
1053
1054 free(arg_root_hash);
1055 arg_root_hash = k;
1056 arg_root_hash_size = l;
1057 break;
1058 }
1059
960e4569
LP
1060 case ARG_SYSTEM_CALL_FILTER: {
1061 bool negative;
1062 const char *items;
1063
1064 negative = optarg[0] == '~';
1065 items = negative ? optarg + 1 : optarg;
1066
1067 for (;;) {
1068 _cleanup_free_ char *word = NULL;
1069
1070 r = extract_first_word(&items, &word, NULL, 0);
1071 if (r == 0)
1072 break;
1073 if (r == -ENOMEM)
1074 return log_oom();
1075 if (r < 0)
1076 return log_error_errno(r, "Failed to parse system call filter: %m");
1077
1078 if (negative)
1079 r = strv_extend(&arg_syscall_blacklist, word);
1080 else
1081 r = strv_extend(&arg_syscall_whitelist, word);
1082 if (r < 0)
1083 return log_oom();
1084 }
1085
1086 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1087 break;
1088 }
1089
88213476
LP
1090 case '?':
1091 return -EINVAL;
1092
1093 default:
eb9da376 1094 assert_not_reached("Unhandled option");
88213476 1095 }
88213476 1096
0c582db0
LB
1097 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1098 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1099 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1100 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1101
4f086aab
SU
1102 if (arg_userns_mode != USER_NAMESPACE_NO)
1103 arg_mount_settings |= MOUNT_USE_USERNS;
1104
1105 if (arg_private_network)
1106 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1107
1108 parse_mount_settings_env();
1109
48a8d337
LB
1110 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1111 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1112 arg_register = false;
0c582db0
LB
1113 if (arg_start_mode != START_PID1) {
1114 log_error("--boot cannot be used without namespacing.");
1115 return -EINVAL;
1116 }
1117 }
eb91eb18 1118
0de7acce 1119 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1120 arg_userns_chown = true;
1121
cd2dfc6f
LP
1122 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
1123 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846
LP
1124 return -EINVAL;
1125 }
1126
1b9e5b12
LP
1127 if (arg_directory && arg_image) {
1128 log_error("--directory= and --image= may not be combined.");
1129 return -EINVAL;
1130 }
1131
ec16945e
LP
1132 if (arg_template && arg_image) {
1133 log_error("--template= and --image= may not be combined.");
1134 return -EINVAL;
1135 }
1136
8cd328d8
LP
1137 if (arg_ephemeral && arg_template && !arg_directory) {
1138 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1139 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1140 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1141 * --directory=". */
1142
1143 arg_directory = arg_template;
1144 arg_template = NULL;
1145 }
1146
ec16945e
LP
1147 if (arg_template && !(arg_directory || arg_machine)) {
1148 log_error("--template= needs --directory= or --machine=.");
1149 return -EINVAL;
1150 }
1151
1152 if (arg_ephemeral && arg_template) {
1153 log_error("--ephemeral and --template= may not be combined.");
1154 return -EINVAL;
1155 }
1156
df9a75e4
LP
1157 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1158 log_error("--ephemeral and --link-journal= may not be combined.");
1159 return -EINVAL;
1160 }
1161
ccabee0d 1162 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1163 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1164 return -EOPNOTSUPP;
1165 }
1166
1167 if (arg_userns_chown && arg_read_only) {
1168 log_error("--read-only and --private-users-chown may not be combined.");
1169 return -EINVAL;
1170 }
f757855e 1171
22b28dfd
LP
1172 if (arg_network_bridge && arg_network_zone) {
1173 log_error("--network-bridge= and --network-zone= may not be combined.");
1174 return -EINVAL;
1175 }
1176
f757855e
LP
1177 if (argc > optind) {
1178 arg_parameters = strv_copy(argv + optind);
1179 if (!arg_parameters)
1180 return log_oom();
1181
7732f92b 1182 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1183 }
1184
1185 /* Load all settings from .nspawn files */
1186 if (mask_no_settings)
1187 arg_settings_mask = 0;
1188
1189 /* Don't load any settings from .nspawn files */
1190 if (mask_all_settings)
1191 arg_settings_mask = _SETTINGS_MASK_ALL;
1192
520e0d54 1193 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1194
399e391f
ZJS
1195 r = cg_unified_flush();
1196 if (r < 0)
1197 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1198
6aadfa4c
ILG
1199 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1200 if (e)
1201 arg_container_service_name = e;
1202
5a8ff0e6
CB
1203 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1204 if (r < 0)
1205 arg_use_cgns = cg_ns_supported();
1206 else
1207 arg_use_cgns = r;
1208
86c0dd4a
LP
1209 r = custom_mount_check_all();
1210 if (r < 0)
1211 return r;
1212
f757855e
LP
1213 return 1;
1214}
1215
1216static int verify_arguments(void) {
4f086aab
SU
1217 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1218 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1219 return -EINVAL;
1220 }
1221
1222 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1223 log_error("Cannot combine --private-users with read-write mounts.");
1224 return -EINVAL;
1225 }
f757855e
LP
1226
1227 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1228 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1229 return -EINVAL;
1230 }
1231
6d0b55c2
LP
1232 if (arg_expose_ports && !arg_private_network) {
1233 log_error("Cannot use --port= without private networking.");
1234 return -EINVAL;
1235 }
1236
349cc4a5 1237#if ! HAVE_LIBIPTC
1c1ea217
EV
1238 if (arg_expose_ports) {
1239 log_error("--port= is not supported, compiled without libiptc support.");
1240 return -EOPNOTSUPP;
1241 }
1242#endif
1243
7732f92b 1244 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1245 arg_kill_signal = SIGRTMIN+3;
1246
f757855e 1247 return 0;
88213476
LP
1248}
1249
03cfe0d5
LP
1250static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1251 assert(p);
1252
0de7acce 1253 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1254 return 0;
1255
1256 if (uid == UID_INVALID && gid == GID_INVALID)
1257 return 0;
1258
1259 if (uid != UID_INVALID) {
1260 uid += arg_uid_shift;
1261
1262 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1263 return -EOVERFLOW;
1264 }
1265
1266 if (gid != GID_INVALID) {
1267 gid += (gid_t) arg_uid_shift;
1268
1269 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1270 return -EOVERFLOW;
1271 }
1272
1273 if (lchown(p, uid, gid) < 0)
1274 return -errno;
b12afc8c
LP
1275
1276 return 0;
1277}
1278
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands it
 * to userns_lchown() with the given IDs. An already existing directory is left alone and counts as
 * success. Returns 0 on success, negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1291
e58a1277 1292static int setup_timezone(const char *dest) {
03cfe0d5
LP
1293 _cleanup_free_ char *p = NULL, *q = NULL;
1294 const char *where, *check, *what;
d4036145
LP
1295 char *z, *y;
1296 int r;
f8440af5 1297
e58a1277
LP
1298 assert(dest);
1299
1300 /* Fix the timezone, if possible */
d4036145
LP
1301 r = readlink_malloc("/etc/localtime", &p);
1302 if (r < 0) {
0b493a02
MP
1303 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1304 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1305 * with a symbolic link to a time zone data file.
0b493a02
MP
1306 *
1307 * Example:
21dc0227 1308 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1309 */
d4036145
LP
1310 return 0;
1311 }
1312
1313 z = path_startswith(p, "../usr/share/zoneinfo/");
1314 if (!z)
1315 z = path_startswith(p, "/usr/share/zoneinfo/");
1316 if (!z) {
1317 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1318 return 0;
1319 }
1320
03cfe0d5 1321 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1322 r = readlink_malloc(where, &q);
1323 if (r >= 0) {
1324 y = path_startswith(q, "../usr/share/zoneinfo/");
1325 if (!y)
1326 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1327
d4036145
LP
1328 /* Already pointing to the right place? Then do nothing .. */
1329 if (y && streq(y, z))
1330 return 0;
1331 }
1332
03cfe0d5 1333 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1334 check = prefix_roota(dest, check);
03cfe0d5 1335 if (laccess(check, F_OK) < 0) {
d4036145
LP
1336 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1337 return 0;
1338 }
68fb0892 1339
8ccf7e9e
LP
1340 if (unlink(where) < 0 && errno != ENOENT) {
1341 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1342 errno,
1343 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1344 return 0;
1345 }
4d9f07b4 1346
03cfe0d5 1347 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1348 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1349 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1350 errno,
1351 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1352 return 0;
1353 }
e58a1277 1354
03cfe0d5
LP
1355 r = userns_lchown(where, 0, 0);
1356 if (r < 0)
1357 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1358
e58a1277 1359 return 0;
88213476
LP
1360}
1361
7357272e 1362static int resolved_listening(void) {
b053cd5f 1363 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1364 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1365 int r;
1366
7357272e 1367 /* Check if resolved is listening */
b053cd5f
LP
1368
1369 r = sd_bus_open_system(&bus);
1370 if (r < 0)
1371 return r;
1372
7357272e
DM
1373 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1374 if (r <= 0)
1375 return r;
1376
1377 r = sd_bus_get_property_string(bus,
1378 "org.freedesktop.resolve1",
1379 "/org/freedesktop/resolve1",
1380 "org.freedesktop.resolve1.Manager",
1381 "DNSStubListener",
1382 NULL,
1383 &dns_stub_listener_mode);
1384 if (r < 0)
1385 return r;
1386
1387 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1388}
1389
2547bb41 1390static int setup_resolv_conf(const char *dest) {
87447ae4
LP
1391 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1392 const char *where;
1393 int r, found;
2547bb41
LP
1394
1395 assert(dest);
1396
1397 if (arg_private_network)
1398 return 0;
1399
87447ae4
LP
1400 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1401 if (r < 0) {
1402 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1403 return 0;
1404 }
1405
1406 where = strjoina(etc, "/resolv.conf");
1407 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1408 if (found < 0) {
1409 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1410 return 0;
1411 }
79d80fc1 1412
b053cd5f 1413 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
7357272e 1414 resolved_listening() > 0) {
87447ae4 1415
3539724c
LP
1416 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1417 * container, so that the container can use the host's resolver. Given that network namespacing is
1418 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1419 * advantage that the container will be able to follow the host's DNS server configuration changes
1420 * transparently. */
1421
87447ae4
LP
1422 if (found == 0) /* missing? */
1423 (void) touch(resolved);
5367354d 1424
87447ae4 1425 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
60e76d48 1426 if (r >= 0)
87447ae4 1427 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1428 }
1429
1430 /* If that didn't work, let's copy the file */
1c876927 1431 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1432 if (r < 0) {
3539724c
LP
1433 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1434 * resolved or something similar runs inside and the symlink points there.
68a313c5 1435 *
3539724c 1436 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1437 */
87447ae4 1438 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1439 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1440 return 0;
1441 }
2547bb41 1442
03cfe0d5
LP
1443 r = userns_lchown(where, 0, 0);
1444 if (r < 0)
3539724c 1445 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1446
2547bb41
LP
1447 return 0;
1448}
1449
04bc4a3f 1450static int setup_boot_id(const char *dest) {
3bbaff3e 1451 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1452 const char *from, *to;
04bc4a3f
LP
1453 int r;
1454
04bc4a3f
LP
1455 /* Generate a new randomized boot ID, so that each boot-up of
1456 * the container gets a new one */
1457
03cfe0d5
LP
1458 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1459 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1460
1461 r = sd_id128_randomize(&rnd);
f647962d
MS
1462 if (r < 0)
1463 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1464
15b1248a 1465 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1466 if (r < 0)
1467 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1468
60e76d48
ZJS
1469 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1470 if (r >= 0)
1471 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1472 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1473
3bbaff3e 1474 (void) unlink(from);
04bc4a3f
LP
1475 return r;
1476}
1477
e58a1277 1478static int copy_devnodes(const char *dest) {
88213476
LP
1479
1480 static const char devnodes[] =
1481 "null\0"
1482 "zero\0"
1483 "full\0"
1484 "random\0"
1485 "urandom\0"
85614d66
TG
1486 "tty\0"
1487 "net/tun\0";
88213476
LP
1488
1489 const char *d;
e58a1277 1490 int r = 0;
7fd1b19b 1491 _cleanup_umask_ mode_t u;
a258bf26
LP
1492
1493 assert(dest);
124640f1
LP
1494
1495 u = umask(0000);
88213476 1496
03cfe0d5
LP
1497 /* Create /dev/net, so that we can create /dev/net/tun in it */
1498 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1499 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1500
88213476 1501 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1502 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1503 struct stat st;
88213476 1504
7f112f50 1505 from = strappend("/dev/", d);
03cfe0d5 1506 to = prefix_root(dest, from);
88213476
LP
1507
1508 if (stat(from, &st) < 0) {
1509
4a62c710
MS
1510 if (errno != ENOENT)
1511 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1512
a258bf26 1513 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1514
03cfe0d5 1515 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1516 return -EIO;
a258bf26 1517
85614d66 1518 } else {
81f5049b 1519 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1520 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1521 if (errno == EEXIST)
8dbf71ec 1522 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1523 if (errno != EPERM)
1524 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1525
1526 /* Some systems abusively restrict mknod but
1527 * allow bind mounts. */
1528 r = touch(to);
1529 if (r < 0)
1530 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1531 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1532 if (r < 0)
1533 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1534 }
6278cf60 1535
03cfe0d5
LP
1536 r = userns_lchown(to, 0, 0);
1537 if (r < 0)
1538 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1539 }
88213476
LP
1540 }
1541
e58a1277
LP
1542 return r;
1543}
88213476 1544
03cfe0d5
LP
1545static int setup_pts(const char *dest) {
1546 _cleanup_free_ char *options = NULL;
1547 const char *p;
709f6e46 1548 int r;
03cfe0d5 1549
349cc4a5 1550#if HAVE_SELINUX
03cfe0d5
LP
1551 if (arg_selinux_apifs_context)
1552 (void) asprintf(&options,
3dce8915 1553 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1554 arg_uid_shift + TTY_GID,
1555 arg_selinux_apifs_context);
1556 else
1557#endif
1558 (void) asprintf(&options,
3dce8915 1559 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1560 arg_uid_shift + TTY_GID);
f2d88580 1561
03cfe0d5 1562 if (!options)
f2d88580
LP
1563 return log_oom();
1564
03cfe0d5 1565 /* Mount /dev/pts itself */
cc9fce65 1566 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1567 if (mkdir(p, 0755) < 0)
1568 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1569 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1570 if (r < 0)
1571 return r;
709f6e46
MS
1572 r = userns_lchown(p, 0, 0);
1573 if (r < 0)
1574 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1575
1576 /* Create /dev/ptmx symlink */
1577 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1578 if (symlink("pts/ptmx", p) < 0)
1579 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1580 r = userns_lchown(p, 0, 0);
1581 if (r < 0)
1582 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1583
03cfe0d5
LP
1584 /* And fix /dev/pts/ptmx ownership */
1585 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1586 r = userns_lchown(p, 0, 0);
1587 if (r < 0)
1588 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1589
f2d88580
LP
1590 return 0;
1591}
1592
e58a1277 1593static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1594 _cleanup_umask_ mode_t u;
1595 const char *to;
e58a1277 1596 int r;
e58a1277
LP
1597
1598 assert(dest);
1599 assert(console);
1600
1601 u = umask(0000);
1602
03cfe0d5 1603 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1604 if (r < 0)
1605 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1606
a258bf26
LP
1607 /* We need to bind mount the right tty to /dev/console since
1608 * ptys can only exist on pts file systems. To have something
81f5049b 1609 * to bind mount things on we create a empty regular file. */
a258bf26 1610
03cfe0d5 1611 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1612 r = touch(to);
1613 if (r < 0)
1614 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1615
60e76d48 1616 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1617}
1618
8e5430c4
LP
1619static int setup_keyring(void) {
1620 key_serial_t keyring;
1621
1622 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1623 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1624 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1625 * these system calls let's make sure we don't leak anything into the container. */
1626
1627 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1628 if (keyring == -1) {
1629 if (errno == ENOSYS)
1630 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1631 else if (IN_SET(errno, EACCES, EPERM))
1632 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1633 else
1634 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1635 }
1636
1637 return 0;
1638}
1639
e58a1277 1640static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1641 const char *from, *to;
7fd1b19b 1642 _cleanup_umask_ mode_t u;
d9603714 1643 int fd, r;
e58a1277 1644
e58a1277 1645 assert(kmsg_socket >= 0);
a258bf26 1646
e58a1277 1647 u = umask(0000);
a258bf26 1648
03cfe0d5 1649 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1650 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1651 * on the reading side behave very similar to /proc/kmsg,
1652 * their writing side behaves differently from /dev/kmsg in
1653 * that writing blocks when nothing is reading. In order to
1654 * avoid any problems with containers deadlocking due to this
1655 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1656 from = prefix_roota(dest, "/run/kmsg");
1657 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1658
4a62c710 1659 if (mkfifo(from, 0600) < 0)
03cfe0d5 1660 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1661 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1662 if (r < 0)
1663 return r;
e58a1277
LP
1664
1665 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1666 if (fd < 0)
1667 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1668
e58a1277
LP
1669 /* Store away the fd in the socket, so that it stays open as
1670 * long as we run the child */
3ee897d6 1671 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1672 safe_close(fd);
e58a1277 1673
d9603714
DH
1674 if (r < 0)
1675 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1676
03cfe0d5
LP
1677 /* And now make the FIFO unavailable as /run/kmsg... */
1678 (void) unlink(from);
1679
25ea79fe 1680 return 0;
88213476
LP
1681}
1682
1c4baffc 1683static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1684 union in_addr_union *exposed = userdata;
1685
1686 assert(rtnl);
1687 assert(m);
1688 assert(exposed);
1689
7a8f6325 1690 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1691 return 0;
1692}
1693
3a74cea5 1694static int setup_hostname(void) {
3a74cea5 1695
0c582db0 1696 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1697 return 0;
1698
605f81a8 1699 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1700 return -errno;
3a74cea5 1701
7027ff61 1702 return 0;
3a74cea5
LP
1703}
1704
57fb9fb5 1705static int setup_journal(const char *directory) {
e01ff70a 1706 sd_id128_t this_id;
0f5e1382 1707 _cleanup_free_ char *d = NULL;
e01ff70a 1708 const char *p, *q;
8054d749 1709 bool try;
e01ff70a 1710 char id[33];
57fb9fb5
LP
1711 int r;
1712
df9a75e4
LP
1713 /* Don't link journals in ephemeral mode */
1714 if (arg_ephemeral)
1715 return 0;
1716
8054d749
LP
1717 if (arg_link_journal == LINK_NO)
1718 return 0;
1719
1720 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1721
4d680aee 1722 r = sd_id128_get_machine(&this_id);
f647962d
MS
1723 if (r < 0)
1724 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1725
e01ff70a 1726 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1727 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1728 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1729 if (try)
4d680aee 1730 return 0;
df9a75e4 1731 return -EEXIST;
4d680aee
ZJS
1732 }
1733
03cfe0d5
LP
1734 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to create /var: %m");
1737
1738 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to create /var/log: %m");
1741
1742 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1745
e01ff70a
MS
1746 (void) sd_id128_to_string(arg_uuid, id);
1747
03cfe0d5
LP
1748 p = strjoina("/var/log/journal/", id);
1749 q = prefix_roota(directory, p);
27407a01 1750
e1873695 1751 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1752 if (try)
1753 return 0;
27407a01 1754
8054d749
LP
1755 log_error("%s: already a mount point, refusing to use for journal", p);
1756 return -EEXIST;
57fb9fb5
LP
1757 }
1758
e1873695 1759 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1760 if (try)
1761 return 0;
57fb9fb5 1762
8054d749
LP
1763 log_error("%s: already a mount point, refusing to use for journal", q);
1764 return -EEXIST;
57fb9fb5
LP
1765 }
1766
1767 r = readlink_and_make_absolute(p, &d);
1768 if (r >= 0) {
3742095b 1769 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
1770 path_equal(d, q)) {
1771
03cfe0d5 1772 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1773 if (r < 0)
709f6e46 1774 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1775 return 0;
57fb9fb5
LP
1776 }
1777
4a62c710
MS
1778 if (unlink(p) < 0)
1779 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1780 } else if (r == -EINVAL) {
1781
1782 if (arg_link_journal == LINK_GUEST &&
1783 rmdir(p) < 0) {
1784
27407a01
ZJS
1785 if (errno == ENOTDIR) {
1786 log_error("%s already exists and is neither a symlink nor a directory", p);
1787 return r;
4314d33f
MS
1788 } else
1789 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1790 }
4314d33f
MS
1791 } else if (r != -ENOENT)
1792 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1793
1794 if (arg_link_journal == LINK_GUEST) {
1795
1796 if (symlink(q, p) < 0) {
8054d749 1797 if (try) {
56f64d95 1798 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1799 return 0;
4314d33f
MS
1800 } else
1801 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1802 }
1803
03cfe0d5 1804 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1805 if (r < 0)
709f6e46 1806 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1807 return 0;
57fb9fb5
LP
1808 }
1809
1810 if (arg_link_journal == LINK_HOST) {
ccddd104 1811 /* don't create parents here — if the host doesn't have
574edc90 1812 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1813
1814 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1815 if (try) {
56f64d95 1816 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1817 return 0;
4314d33f
MS
1818 } else
1819 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1820 }
1821
27407a01
ZJS
1822 } else if (access(p, F_OK) < 0)
1823 return 0;
57fb9fb5 1824
cdb2b9d0
LP
1825 if (dir_is_empty(q) == 0)
1826 log_warning("%s is not empty, proceeding anyway.", q);
1827
03cfe0d5 1828 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1829 if (r < 0)
1830 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1831
60e76d48
ZJS
1832 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1833 if (r < 0)
4a62c710 1834 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1835
27407a01 1836 return 0;
57fb9fb5
LP
1837}
1838
88213476 1839static int drop_capabilities(void) {
520e0d54 1840 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1841}
1842
db999e0f
LP
1843static int reset_audit_loginuid(void) {
1844 _cleanup_free_ char *p = NULL;
1845 int r;
1846
0c582db0 1847 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1848 return 0;
1849
1850 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1851 if (r == -ENOENT)
db999e0f 1852 return 0;
f647962d
MS
1853 if (r < 0)
1854 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1855
1856 /* Already reset? */
1857 if (streq(p, "4294967295"))
1858 return 0;
1859
ad118bda 1860 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1861 if (r < 0) {
10a87006
LP
1862 log_error_errno(r,
1863 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1864 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1865 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1866 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1867 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1868
db999e0f 1869 sleep(5);
77b6e194 1870 }
db999e0f
LP
1871
1872 return 0;
77b6e194
LP
1873}
1874
24fb1112 1875
785890ac
LP
1876static int setup_propagate(const char *root) {
1877 const char *p, *q;
709f6e46 1878 int r;
785890ac
LP
1879
1880 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1881 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1882 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1883 (void) mkdir_p(p, 0600);
1884
709f6e46
MS
1885 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1886 if (r < 0)
1887 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1888
709f6e46
MS
1889 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1890 if (r < 0)
1891 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1892
709f6e46
MS
1893 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1894 if (r < 0)
1895 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1896
03cfe0d5 1897 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1898 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1899 if (r < 0)
1900 return r;
785890ac 1901
60e76d48
ZJS
1902 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1903 if (r < 0)
1904 return r;
785890ac 1905
19caffac
AC
1906 /* machined will MS_MOVE into that directory, and that's only
1907 * supported for non-shared mounts. */
60e76d48 1908 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1909}
1910
317feb4d 1911static int setup_machine_id(const char *directory) {
691675ba
LP
1912 const char *etc_machine_id;
1913 sd_id128_t id;
3bbaff3e 1914 int r;
e01ff70a 1915
317feb4d
LP
1916 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1917 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1918 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1919 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1920 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1921 * container behaves nicely). */
1922
e01ff70a
MS
1923 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1924
691675ba 1925 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
1926 if (r < 0) {
1927 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1928 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 1929
317feb4d
LP
1930 if (sd_id128_is_null(arg_uuid)) {
1931 r = sd_id128_randomize(&arg_uuid);
1932 if (r < 0)
1933 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1934 }
1935 } else {
1936 if (sd_id128_is_null(id)) {
1937 log_error("Machine ID in container image is zero, refusing.");
1938 return -EINVAL;
1939 }
e01ff70a 1940
317feb4d
LP
1941 arg_uuid = id;
1942 }
691675ba 1943
e01ff70a
MS
1944 return 0;
1945}
1946
7336138e
LP
1947static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1948 int r;
1949
1950 assert(directory);
1951
0de7acce 1952 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
1953 return 0;
1954
1955 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1956 if (r == -EOPNOTSUPP)
1957 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1958 if (r == -EBADE)
1959 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1960 if (r < 0)
1961 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1962 if (r == 0)
1963 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1964 else
1965 log_debug("Patched directory tree to match UID/GID range.");
1966
1967 return r;
1968}
1969
113cea80 1970/*
6d416b9c
LS
1971 * Return values:
1972 * < 0 : wait_for_terminate() failed to get the state of the
1973 * container, the container was terminated by a signal, or
1974 * failed for an unknown reason. No change is made to the
1975 * container argument.
1976 * > 0 : The program executed in the container terminated with an
1977 * error. The exit code of the program executed in the
919699ec
LP
1978 * container is returned. The container argument has been set
1979 * to CONTAINER_TERMINATED.
6d416b9c
LS
1980 * 0 : The container is being rebooted, has been shut down or exited
1981 * successfully. The container argument has been set to either
1982 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 1983 *
6d416b9c
LS
1984 * That is, success is indicated by a return value of zero, and an
1985 * error is indicated by a non-zero value.
113cea80
DH
1986 */
1987static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 1988 siginfo_t status;
919699ec 1989 int r;
113cea80
DH
1990
1991 r = wait_for_terminate(pid, &status);
f647962d
MS
1992 if (r < 0)
1993 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
1994
1995 switch (status.si_code) {
fddbb89c 1996
113cea80 1997 case CLD_EXITED:
b5a2179b 1998 if (status.si_status == 0)
919699ec 1999 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2000 else
919699ec 2001 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2002
919699ec
LP
2003 *container = CONTAINER_TERMINATED;
2004 return status.si_status;
113cea80
DH
2005
2006 case CLD_KILLED:
2007 if (status.si_status == SIGINT) {
919699ec 2008 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2009 *container = CONTAINER_TERMINATED;
919699ec
LP
2010 return 0;
2011
113cea80 2012 } else if (status.si_status == SIGHUP) {
919699ec 2013 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2014 *container = CONTAINER_REBOOTED;
919699ec 2015 return 0;
113cea80 2016 }
919699ec 2017
ec251fe7 2018 /* fall through */
113cea80
DH
2019
2020 case CLD_DUMPED:
fddbb89c 2021 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2022 return -EIO;
113cea80
DH
2023
2024 default:
fddbb89c 2025 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2026 return -EIO;
113cea80 2027 }
113cea80
DH
2028}
2029
023fb90b
LP
2030static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2031 pid_t pid;
2032
4a0b58c4 2033 pid = PTR_TO_PID(userdata);
023fb90b 2034 if (pid > 0) {
c6c8f6e2 2035 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2036 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2037 sd_event_source_set_userdata(s, NULL);
2038 return 0;
2039 }
2040 }
2041
2042 sd_event_exit(sd_event_source_get_event(s), 0);
2043 return 0;
2044}
2045
6916b164
AU
2046static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2047 for (;;) {
2048 siginfo_t si = {};
2049 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2050 return log_error_errno(errno, "Failed to waitid(): %m");
2051 if (si.si_pid == 0) /* No pending children. */
2052 break;
2053 if (si.si_pid == PTR_TO_PID(userdata)) {
2054 /* The main process we care for has exited. Return from
2055 * signal handler but leave the zombie. */
2056 sd_event_exit(sd_event_source_get_event(s), 0);
2057 break;
2058 }
2059 /* Reap all other children. */
2060 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2061 }
2062
2063 return 0;
2064}
2065
ec16945e 2066static int determine_names(void) {
1b9cebf6 2067 int r;
ec16945e 2068
c1521918
LP
2069 if (arg_template && !arg_directory && arg_machine) {
2070
2071 /* If --template= was specified then we should not
2072 * search for a machine, but instead create a new one
2073 * in /var/lib/machine. */
2074
605405c6 2075 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2076 if (!arg_directory)
2077 return log_oom();
2078 }
2079
ec16945e 2080 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2081 if (arg_machine) {
2082 _cleanup_(image_unrefp) Image *i = NULL;
2083
2084 r = image_find(arg_machine, &i);
2085 if (r < 0)
2086 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2087 if (r == 0) {
35bca925 2088 log_error("No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2089 return -ENOENT;
2090 }
2091
aceac2f0 2092 if (i->type == IMAGE_RAW)
0f03c2a4 2093 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2094 else
0f03c2a4 2095 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2096 if (r < 0)
0f3be6ca 2097 return log_oom();
1b9cebf6 2098
aee327b8
LP
2099 if (!arg_ephemeral)
2100 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2101 } else
ec16945e
LP
2102 arg_directory = get_current_dir_name();
2103
0f3be6ca 2104 if (!arg_directory && !arg_image) {
1b9cebf6 2105 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2106 return -EINVAL;
2107 }
2108 }
2109
2110 if (!arg_machine) {
4827ab48 2111
b9ba4dab
LP
2112 if (arg_directory && path_equal(arg_directory, "/"))
2113 arg_machine = gethostname_malloc();
4827ab48
LP
2114 else {
2115 if (arg_image) {
2116 char *e;
2117
2118 arg_machine = strdup(basename(arg_image));
2119
2120 /* Truncate suffix if there is one */
2121 e = endswith(arg_machine, ".raw");
2122 if (e)
2123 *e = 0;
2124 } else
2125 arg_machine = strdup(basename(arg_directory));
2126 }
ec16945e
LP
2127 if (!arg_machine)
2128 return log_oom();
2129
ae691c1d 2130 hostname_cleanup(arg_machine);
ec16945e
LP
2131 if (!machine_name_is_valid(arg_machine)) {
2132 log_error("Failed to determine machine name automatically, please use -M.");
2133 return -EINVAL;
2134 }
b9ba4dab
LP
2135
2136 if (arg_ephemeral) {
2137 char *b;
2138
2139 /* Add a random suffix when this is an
2140 * ephemeral machine, so that we can run many
2141 * instances at once without manually having
2142 * to specify -M each time. */
2143
2144 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2145 return log_oom();
2146
2147 free(arg_machine);
2148 arg_machine = b;
2149 }
ec16945e
LP
2150 }
2151
2152 return 0;
2153}
2154
/* Replaces the path stored in *p by the result of chase_symlinks() on it, using 'flags'.
 *
 * A NULL *p is left alone. On success the old string is freed and *p takes ownership of the newly returned
 * one. Returns 0 on success, or the negative error from chase_symlinks() after logging it (in which case *p is
 * unchanged). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2173
03cfe0d5 2174static int determine_uid_shift(const char *directory) {
6dac160c
LP
2175 int r;
2176
0de7acce 2177 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2178 arg_uid_shift = 0;
6dac160c 2179 return 0;
03cfe0d5 2180 }
6dac160c
LP
2181
2182 if (arg_uid_shift == UID_INVALID) {
2183 struct stat st;
2184
03cfe0d5 2185 r = stat(directory, &st);
6dac160c 2186 if (r < 0)
03cfe0d5 2187 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2188
2189 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2190
2191 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2192 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2193 return -EINVAL;
2194 }
2195
2196 arg_uid_range = UINT32_C(0x10000);
2197 }
2198
2199 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2200 log_error("UID base too high for UID range.");
2201 return -EINVAL;
2202 }
2203
6dac160c
LP
2204 return 0;
2205}
2206
03cfe0d5
LP
2207static int inner_child(
2208 Barrier *barrier,
2209 const char *directory,
2210 bool secondary,
2211 int kmsg_socket,
2212 int rtnl_socket,
f757855e 2213 FDSet *fds) {
69c79d3c 2214
03cfe0d5 2215 _cleanup_free_ char *home = NULL;
e01ff70a 2216 char as_uuid[37];
6aadfa4c 2217 unsigned n_env = 1;
03cfe0d5
LP
2218 const char *envp[] = {
2219 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2220 NULL, /* container */
03cfe0d5
LP
2221 NULL, /* TERM */
2222 NULL, /* HOME */
2223 NULL, /* USER */
2224 NULL, /* LOGNAME */
2225 NULL, /* container_uuid */
2226 NULL, /* LISTEN_FDS */
2227 NULL, /* LISTEN_PID */
9c1e04d0 2228 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2229 NULL
2230 };
1a68e1e5 2231 const char *exec_target;
88213476 2232
2371271c 2233 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2234 int r;
88213476 2235
03cfe0d5
LP
2236 assert(barrier);
2237 assert(directory);
2238 assert(kmsg_socket >= 0);
88213476 2239
0de7acce 2240 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2241 /* Tell the parent, that it now can write the UID map. */
2242 (void) barrier_place(barrier); /* #1 */
7027ff61 2243
03cfe0d5
LP
2244 /* Wait until the parent wrote the UID map */
2245 if (!barrier_place_and_sync(barrier)) { /* #2 */
2246 log_error("Parent died too early");
2247 return -ESRCH;
2248 }
88213476
LP
2249 }
2250
6d66bd3b
EV
2251 r = reset_uid_gid();
2252 if (r < 0)
2253 return log_error_errno(r, "Couldn't become new root: %m");
2254
0de7acce 2255 r = mount_all(NULL,
4f086aab 2256 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2257 arg_uid_shift,
2258 arg_uid_range,
2259 arg_selinux_apifs_context);
2260
03cfe0d5
LP
2261 if (r < 0)
2262 return r;
2263
4f086aab 2264 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2265 if (r < 0)
2266 return r;
2267
03cfe0d5
LP
2268 /* Wait until we are cgroup-ified, so that we
2269 * can mount the right cgroup path writable */
2270 if (!barrier_place_and_sync(barrier)) { /* #3 */
2271 log_error("Parent died too early");
2272 return -ESRCH;
88213476
LP
2273 }
2274
5a8ff0e6 2275 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2276 r = unshare(CLONE_NEWCGROUP);
2277 if (r < 0)
2278 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2279 r = mount_cgroups(
2280 "",
2281 arg_unified_cgroup_hierarchy,
2282 arg_userns_mode != USER_NAMESPACE_NO,
2283 arg_uid_shift,
2284 arg_uid_range,
5a8ff0e6 2285 arg_selinux_apifs_context,
ada54120 2286 true);
0996ef00
CB
2287 if (r < 0)
2288 return r;
2289 } else {
2290 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2291 if (r < 0)
2292 return r;
2293 }
ec16945e 2294
03cfe0d5
LP
2295 r = setup_boot_id(NULL);
2296 if (r < 0)
2297 return r;
ec16945e 2298
03cfe0d5
LP
2299 r = setup_kmsg(NULL, kmsg_socket);
2300 if (r < 0)
2301 return r;
2302 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2303
03cfe0d5 2304 umask(0022);
30535c16 2305
03cfe0d5
LP
2306 if (setsid() < 0)
2307 return log_error_errno(errno, "setsid() failed: %m");
2308
2309 if (arg_private_network)
2310 loopback_setup();
2311
7a8f6325
LP
2312 if (arg_expose_ports) {
2313 r = expose_port_send_rtnl(rtnl_socket);
2314 if (r < 0)
2315 return r;
2316 rtnl_socket = safe_close(rtnl_socket);
2317 }
03cfe0d5 2318
709f6e46
MS
2319 r = drop_capabilities();
2320 if (r < 0)
2321 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2322
2323 setup_hostname();
2324
050f7277 2325 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2326 r = safe_personality(arg_personality);
2327 if (r < 0)
2328 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2329 } else if (secondary) {
21022b9d
LP
2330 r = safe_personality(PER_LINUX32);
2331 if (r < 0)
2332 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2333 }
2334
349cc4a5 2335#if HAVE_SELINUX
03cfe0d5 2336 if (arg_selinux_context)
2ed96880 2337 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2338 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2339#endif
2340
ee645080 2341 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2342 if (r < 0)
2343 return r;
2344
6aadfa4c
ILG
2345 /* LXC sets container=lxc, so follow the scheme here */
2346 envp[n_env++] = strjoina("container=", arg_container_service_name);
2347
03cfe0d5
LP
2348 envp[n_env] = strv_find_prefix(environ, "TERM=");
2349 if (envp[n_env])
313cefa1 2350 n_env++;
03cfe0d5
LP
2351
2352 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2353 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2354 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2355 return log_oom();
2356
3bbaff3e 2357 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2358
691675ba 2359 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2360 return log_oom();
03cfe0d5
LP
2361
2362 if (fdset_size(fds) > 0) {
2363 r = fdset_cloexec(fds, false);
2364 if (r < 0)
2365 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2366
2367 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2368 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2369 return log_oom();
2370 }
9c1e04d0
AP
2371 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2372 return log_oom();
03cfe0d5 2373
2371271c
TG
2374 env_use = strv_env_merge(2, envp, arg_setenv);
2375 if (!env_use)
2376 return log_oom();
03cfe0d5
LP
2377
2378 /* Let the parent know that we are ready and
2379 * wait until the parent is ready with the
2380 * setup, too... */
2381 if (!barrier_place_and_sync(barrier)) { /* #4 */
2382 log_error("Parent died too early");
2383 return -ESRCH;
2384 }
2385
5f932eb9
LP
2386 if (arg_chdir)
2387 if (chdir(arg_chdir) < 0)
2388 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2389
7732f92b 2390 if (arg_start_mode == START_PID2) {
75bf701f 2391 r = stub_pid1(arg_uuid);
7732f92b
LP
2392 if (r < 0)
2393 return r;
2394 }
2395
03cfe0d5
LP
2396 /* Now, explicitly close the log, so that we
2397 * then can close all remaining fds. Closing
2398 * the log explicitly first has the benefit
2399 * that the logging subsystem knows about it,
2400 * and is thus ready to be reopened should we
2401 * need it again. Note that the other fds
2402 * closed here are at least the locking and
2403 * barrier fds. */
2404 log_close();
2405 (void) fdset_close_others(fds);
2406
7732f92b 2407 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2408 char **a;
2409 size_t m;
2410
2411 /* Automatically search for the init system */
2412
75f32f04
ZJS
2413 m = strv_length(arg_parameters);
2414 a = newa(char*, m + 2);
2415 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2416 a[1 + m] = NULL;
03cfe0d5 2417
ced58da7 2418 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2419 execve(a[0], a, env_use);
2420
ced58da7 2421 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2422 execve(a[0], a, env_use);
2423
ced58da7 2424 a[0] = (char*) "/sbin/init";
03cfe0d5 2425 execve(a[0], a, env_use);
ced58da7
LP
2426
2427 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5
PW
2428 } else if (!strv_isempty(arg_parameters)) {
2429 exec_target = arg_parameters[0];
f757855e 2430 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2431 } else {
5f932eb9 2432 if (!arg_chdir)
d929b0f9
ZJS
2433 /* If we cannot change the directory, we'll end up in /, that is expected. */
2434 (void) chdir(home ?: "/root");
5f932eb9 2435
03cfe0d5
LP
2436 execle("/bin/bash", "-bash", NULL, env_use);
2437 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2438
2439 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2440 }
2441
35607a8d 2442 r = -errno;
03cfe0d5 2443 (void) log_open();
1a68e1e5 2444 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2445}
2446
9c1e04d0
AP
2447static int setup_sd_notify_child(void) {
2448 static const int one = 1;
2449 int fd = -1;
2450 union sockaddr_union sa = {
2451 .sa.sa_family = AF_UNIX,
2452 };
2453 int r;
2454
2455 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2456 if (fd < 0)
2457 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2458
2459 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2460 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2461
2462 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2463 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2464 if (r < 0) {
2465 safe_close(fd);
2466 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2467 }
2468
adc7d9f0
EV
2469 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2470 if (r < 0) {
2471 safe_close(fd);
2472 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2473 }
2474
9c1e04d0
AP
2475 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2476 if (r < 0) {
2477 safe_close(fd);
2478 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2479 }
2480
2481 return fd;
2482}
2483
03cfe0d5
LP
2484static int outer_child(
2485 Barrier *barrier,
2486 const char *directory,
2487 const char *console,
2d845785 2488 DissectedImage *dissected_image,
03cfe0d5
LP
2489 bool interactive,
2490 bool secondary,
2491 int pid_socket,
e01ff70a 2492 int uuid_socket,
9c1e04d0 2493 int notify_socket,
03cfe0d5
LP
2494 int kmsg_socket,
2495 int rtnl_socket,
825d5287 2496 int uid_shift_socket,
f757855e 2497 FDSet *fds) {
03cfe0d5
LP
2498
2499 pid_t pid;
2500 ssize_t l;
2501 int r;
9c1e04d0 2502 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2503
2504 assert(barrier);
2505 assert(directory);
2506 assert(console);
2507 assert(pid_socket >= 0);
e01ff70a 2508 assert(uuid_socket >= 0);
9c1e04d0 2509 assert(notify_socket >= 0);
03cfe0d5
LP
2510 assert(kmsg_socket >= 0);
2511
2512 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2513 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2514
2515 if (interactive) {
2516 close_nointr(STDIN_FILENO);
2517 close_nointr(STDOUT_FILENO);
2518 close_nointr(STDERR_FILENO);
2519
2520 r = open_terminal(console, O_RDWR);
2521 if (r != STDIN_FILENO) {
2522 if (r >= 0) {
2523 safe_close(r);
2524 r = -EINVAL;
2525 }
2526
2527 return log_error_errno(r, "Failed to open console: %m");
2528 }
2529
2530 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2531 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2532 return log_error_errno(errno, "Failed to duplicate console: %m");
2533 }
2534
2535 r = reset_audit_loginuid();
2536 if (r < 0)
2537 return r;
2538
2539 /* Mark everything as slave, so that we still
2540 * receive mounts from the real root, but don't
2541 * propagate mounts to the real root. */
60e76d48
ZJS
2542 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2543 if (r < 0)
2544 return r;
03cfe0d5 2545
2d845785 2546 if (dissected_image) {
18b5886e 2547 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2548 if (r < 0)
2549 return r;
2550 }
03cfe0d5 2551
391567f4
LP
2552 r = determine_uid_shift(directory);
2553 if (r < 0)
2554 return r;
2555
0de7acce 2556 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2557 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2558 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2559 if (l < 0)
2560 return log_error_errno(errno, "Failed to send UID shift: %m");
2561 if (l != sizeof(arg_uid_shift)) {
2562 log_error("Short write while sending UID shift.");
2563 return -EIO;
2564 }
0e7ac751 2565
0de7acce 2566 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2567 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2568 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2569 * not it will pick a different one, and send it back to us. */
2570
2571 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2572 if (l < 0)
2573 return log_error_errno(errno, "Failed to recv UID shift: %m");
2574 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2575 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2576 return -EIO;
2577 }
2578 }
2579
2580 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2581 }
2582
03cfe0d5 2583 /* Turn directory into bind mount */
60e76d48
ZJS
2584 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2585 if (r < 0)
2586 return r;
03cfe0d5 2587
b53ede69
PW
2588 r = setup_pivot_root(
2589 directory,
2590 arg_pivot_root_new,
2591 arg_pivot_root_old);
2592 if (r < 0)
2593 return r;
2594
0de7acce
LP
2595 r = setup_volatile(
2596 directory,
2597 arg_volatile_mode,
2598 arg_userns_mode != USER_NAMESPACE_NO,
2599 arg_uid_shift,
2600 arg_uid_range,
2601 arg_selinux_context);
03cfe0d5
LP
2602 if (r < 0)
2603 return r;
2604
0de7acce
LP
2605 r = setup_volatile_state(
2606 directory,
2607 arg_volatile_mode,
2608 arg_userns_mode != USER_NAMESPACE_NO,
2609 arg_uid_shift,
2610 arg_uid_range,
2611 arg_selinux_context);
03cfe0d5
LP
2612 if (r < 0)
2613 return r;
2614
4ad14eff
LP
2615 /* Mark everything as shared so our mounts get propagated down. This is
2616 * required to make new bind mounts available in systemd services
2617 * inside the containter that create a new mount namespace.
2618 * See https://github.com/systemd/systemd/issues/3860
2619 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2620 * shared propagation mode. */
4ad14eff
LP
2621 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2622 if (r < 0)
2623 return r;
2624
2625 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2626 if (r < 0)
2627 return r;
2628
03cfe0d5
LP
2629 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2630 if (r < 0)
2631 return r;
2632
03cfe0d5 2633 if (arg_read_only) {
6b7c9f8b 2634 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2635 if (r < 0)
2636 return log_error_errno(r, "Failed to make tree read-only: %m");
2637 }
2638
0de7acce 2639 r = mount_all(directory,
4f086aab 2640 arg_mount_settings,
0de7acce
LP
2641 arg_uid_shift,
2642 arg_uid_range,
2643 arg_selinux_apifs_context);
03cfe0d5
LP
2644 if (r < 0)
2645 return r;
2646
07fa00f9
LP
2647 r = copy_devnodes(directory);
2648 if (r < 0)
03cfe0d5
LP
2649 return r;
2650
2651 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2652
07fa00f9
LP
2653 r = setup_pts(directory);
2654 if (r < 0)
03cfe0d5
LP
2655 return r;
2656
2657 r = setup_propagate(directory);
2658 if (r < 0)
2659 return r;
2660
2661 r = setup_dev_console(directory, console);
2662 if (r < 0)
2663 return r;
2664
8e5430c4
LP
2665 r = setup_keyring();
2666 if (r < 0)
2667 return r;
2668
960e4569 2669 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
2670 if (r < 0)
2671 return r;
2672
2673 r = setup_timezone(directory);
2674 if (r < 0)
2675 return r;
2676
2677 r = setup_resolv_conf(directory);
2678 if (r < 0)
2679 return r;
2680
e01ff70a
MS
2681 r = setup_machine_id(directory);
2682 if (r < 0)
2683 return r;
2684
03cfe0d5
LP
2685 r = setup_journal(directory);
2686 if (r < 0)
2687 return r;
2688
0de7acce
LP
2689 r = mount_custom(
2690 directory,
2691 arg_custom_mounts,
2692 arg_n_custom_mounts,
2693 arg_userns_mode != USER_NAMESPACE_NO,
2694 arg_uid_shift,
2695 arg_uid_range,
2696 arg_selinux_apifs_context);
03cfe0d5
LP
2697 if (r < 0)
2698 return r;
2699
5a8ff0e6 2700 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2701 r = mount_cgroups(
2702 directory,
2703 arg_unified_cgroup_hierarchy,
2704 arg_userns_mode != USER_NAMESPACE_NO,
2705 arg_uid_shift,
2706 arg_uid_range,
5a8ff0e6 2707 arg_selinux_apifs_context,
ada54120 2708 false);
0996ef00
CB
2709 if (r < 0)
2710 return r;
2711 }
03cfe0d5
LP
2712
2713 r = mount_move_root(directory);
2714 if (r < 0)
2715 return log_error_errno(r, "Failed to move root directory: %m");
2716
9c1e04d0
AP
2717 fd = setup_sd_notify_child();
2718 if (fd < 0)
2719 return fd;
2720
03cfe0d5 2721 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2722 arg_clone_ns_flags |
03cfe0d5 2723 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2724 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2725 if (pid < 0)
2726 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2727 if (pid == 0) {
2728 pid_socket = safe_close(pid_socket);
e01ff70a 2729 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2730 notify_socket = safe_close(notify_socket);
825d5287 2731 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2732
2733 /* The inner child has all namespaces that are
2734 * requested, so that we all are owned by the user if
2735 * user namespaces are turned on. */
2736
f757855e 2737 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2738 if (r < 0)
2739 _exit(EXIT_FAILURE);
2740
2741 _exit(EXIT_SUCCESS);
2742 }
2743
2744 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2745 if (l < 0)
2746 return log_error_errno(errno, "Failed to send PID: %m");
2747 if (l != sizeof(pid)) {
2748 log_error("Short write while sending PID.");
2749 return -EIO;
2750 }
2751
e01ff70a
MS
2752 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2753 if (l < 0)
2754 return log_error_errno(errno, "Failed to send machine ID: %m");
2755 if (l != sizeof(arg_uuid)) {
2756 log_error("Short write while sending machine ID.");
2757 return -EIO;
2758 }
2759
9c1e04d0
AP
2760 l = send_one_fd(notify_socket, fd, 0);
2761 if (l < 0)
2762 return log_error_errno(errno, "Failed to send notify fd: %m");
2763
03cfe0d5 2764 pid_socket = safe_close(pid_socket);
e01ff70a 2765 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2766 notify_socket = safe_close(notify_socket);
327e26d6
KN
2767 kmsg_socket = safe_close(kmsg_socket);
2768 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2769
2770 return 0;
2771}
2772
0e7ac751
LP
2773static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2774 unsigned n_tries = 100;
2775 uid_t candidate;
2776 int r;
2777
2778 assert(shift);
2779 assert(ret_lock_file);
0de7acce 2780 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2781 assert(arg_uid_range == 0x10000U);
2782
2783 candidate = *shift;
2784
2785 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2786
2787 for (;;) {
2788 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2789 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2790
2791 if (--n_tries <= 0)
2792 return -EBUSY;
2793
2794 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2795 goto next;
2796 if ((candidate & UINT32_C(0xFFFF)) != 0)
2797 goto next;
2798
2799 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2800 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2801 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2802 goto next;
2803 if (r < 0)
2804 return r;
2805
2806 /* Make some superficial checks whether the range is currently known in the user database */
2807 if (getpwuid(candidate))
2808 goto next;
2809 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2810 goto next;
2811 if (getgrgid(candidate))
2812 goto next;
2813 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2814 goto next;
2815
2816 *ret_lock_file = lf;
2817 lf = (struct LockFile) LOCK_FILE_INIT;
2818 *shift = candidate;
2819 return 0;
2820
2821 next:
2822 random_bytes(&candidate, sizeof(candidate));
2823 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2824 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2825 }
2826}
2827
03cfe0d5
LP
2828static int setup_uid_map(pid_t pid) {
2829 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2830 int r;
2831
2832 assert(pid > 1);
2833
2834 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2835 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2836 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to write UID map: %m");
2839
2840 /* We always assign the same UID and GID ranges */
2841 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2842 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to write GID map: %m");
2845
2846 return 0;
2847}
2848
9c1e04d0 2849static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
2850 char buf[NOTIFY_BUFFER_MAX+1];
2851 char *p = NULL;
2852 struct iovec iovec = {
2853 .iov_base = buf,
2854 .iov_len = sizeof(buf)-1,
2855 };
2856 union {
2857 struct cmsghdr cmsghdr;
2858 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2859 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2860 } control = {};
2861 struct msghdr msghdr = {
2862 .msg_iov = &iovec,
2863 .msg_iovlen = 1,
2864 .msg_control = &control,
2865 .msg_controllen = sizeof(control),
2866 };
2867 struct cmsghdr *cmsg;
2868 struct ucred *ucred = NULL;
2869 ssize_t n;
2870 pid_t inner_child_pid;
2871 _cleanup_strv_free_ char **tags = NULL;
2872
2873 assert(userdata);
2874
2875 inner_child_pid = PTR_TO_PID(userdata);
2876
2877 if (revents != EPOLLIN) {
2878 log_warning("Got unexpected poll event for notify fd.");
2879 return 0;
2880 }
2881
2882 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2883 if (n < 0) {
3742095b 2884 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
2885 return 0;
2886
2887 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2888 }
2889 cmsg_close_all(&msghdr);
2890
2891 CMSG_FOREACH(cmsg, &msghdr) {
2892 if (cmsg->cmsg_level == SOL_SOCKET &&
2893 cmsg->cmsg_type == SCM_CREDENTIALS &&
2894 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2895
2896 ucred = (struct ucred*) CMSG_DATA(cmsg);
2897 }
2898 }
2899
2900 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 2901 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
2902 return 0;
2903 }
2904
2905 if ((size_t) n >= sizeof(buf)) {
2906 log_warning("Received notify message exceeded maximum size. Ignoring.");
2907 return 0;
2908 }
2909
2910 buf[n] = 0;
2911 tags = strv_split(buf, "\n\r");
2912 if (!tags)
2913 return log_oom();
2914
2915 if (strv_find(tags, "READY=1"))
2916 sd_notifyf(false, "READY=1\n");
2917
2918 p = strv_find_startswith(tags, "STATUS=");
2919 if (p)
2920 sd_notifyf(false, "STATUS=Container running: %s", p);
2921
2922 return 0;
2923}
2924
5773024d 2925static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 2926 int r;
9c1e04d0 2927
5773024d 2928 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
2929 if (r < 0)
2930 return log_error_errno(r, "Failed to allocate notify event source: %m");
2931
5773024d 2932 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
2933
2934 return 0;
2935}
2936
f757855e
LP
2937static int load_settings(void) {
2938 _cleanup_(settings_freep) Settings *settings = NULL;
2939 _cleanup_fclose_ FILE *f = NULL;
2940 _cleanup_free_ char *p = NULL;
2941 const char *fn, *i;
2942 int r;
2943
2944 /* If all settings are masked, there's no point in looking for
2945 * the settings file */
2946 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2947 return 0;
2948
2949 fn = strjoina(arg_machine, ".nspawn");
2950
2951 /* We first look in the admin's directories in /etc and /run */
2952 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2953 _cleanup_free_ char *j = NULL;
2954
605405c6 2955 j = strjoin(i, "/", fn);
f757855e
LP
2956 if (!j)
2957 return log_oom();
2958
2959 f = fopen(j, "re");
2960 if (f) {
2961 p = j;
2962 j = NULL;
2963
b938cb90 2964 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2965 if (arg_settings_trusted < 0)
2966 arg_settings_trusted = true;
2967
2968 break;
2969 }
2970
2971 if (errno != ENOENT)
2972 return log_error_errno(errno, "Failed to open %s: %m", j);
2973 }
2974
2975 if (!f) {
2976 /* After that, let's look for a file next to the
2977 * actual image we shall boot. */
2978
2979 if (arg_image) {
2980 p = file_in_same_dir(arg_image, fn);
2981 if (!p)
2982 return log_oom();
2983 } else if (arg_directory) {
2984 p = file_in_same_dir(arg_directory, fn);
2985 if (!p)
2986 return log_oom();
2987 }
2988
2989 if (p) {
2990 f = fopen(p, "re");
2991 if (!f && errno != ENOENT)
2992 return log_error_errno(errno, "Failed to open %s: %m", p);
2993
b938cb90 2994 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2995 if (arg_settings_trusted < 0)
2996 arg_settings_trusted = false;
2997 }
2998 }
2999
3000 if (!f)
3001 return 0;
3002
3003 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3004
3005 r = settings_load(f, p, &settings);
3006 if (r < 0)
3007 return r;
3008
3009 /* Copy over bits from the settings, unless they have been
3010 * explicitly masked by command line switches. */
3011
7732f92b
LP
3012 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3013 settings->start_mode >= 0) {
3014 arg_start_mode = settings->start_mode;
f757855e
LP
3015
3016 strv_free(arg_parameters);
3017 arg_parameters = settings->parameters;
3018 settings->parameters = NULL;
3019 }
3020
b53ede69
PW
3021 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3022 settings->pivot_root_new) {
3023 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3024 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3025 }
3026
5f932eb9
LP
3027 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3028 settings->working_directory) {
3029 free(arg_chdir);
3030 arg_chdir = settings->working_directory;
3031 settings->working_directory = NULL;
3032 }
3033
f757855e
LP
3034 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3035 settings->environment) {
3036 strv_free(arg_setenv);
3037 arg_setenv = settings->environment;
3038 settings->environment = NULL;
3039 }
3040
3041 if ((arg_settings_mask & SETTING_USER) == 0 &&
3042 settings->user) {
3043 free(arg_user);
3044 arg_user = settings->user;
3045 settings->user = NULL;
3046 }
3047
3048 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3049 uint64_t plus;
f757855e 3050
0e265674
LP
3051 plus = settings->capability;
3052 if (settings_private_network(settings))
3053 plus |= (1ULL << CAP_NET_ADMIN);
3054
3055 if (!arg_settings_trusted && plus != 0) {
3056 if (settings->capability != 0)
3057 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3058 } else
520e0d54 3059 arg_caps_retain |= plus;
f757855e 3060
520e0d54 3061 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3062 }
3063
3064 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3065 settings->kill_signal > 0)
3066 arg_kill_signal = settings->kill_signal;
3067
3068 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3069 settings->personality != PERSONALITY_INVALID)
3070 arg_personality = settings->personality;
3071
3072 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3073 !sd_id128_is_null(settings->machine_id)) {
3074
3075 if (!arg_settings_trusted)
3076 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3077 else
3078 arg_uuid = settings->machine_id;
3079 }
3080
3081 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3082 settings->read_only >= 0)
3083 arg_read_only = settings->read_only;
3084
3085 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3086 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3087 arg_volatile_mode = settings->volatile_mode;
3088
3089 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3090 settings->n_custom_mounts > 0) {
3091
3092 if (!arg_settings_trusted)
3093 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3094 else {
3095 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3096 arg_custom_mounts = settings->custom_mounts;
3097 arg_n_custom_mounts = settings->n_custom_mounts;
3098
3099 settings->custom_mounts = NULL;
3100 settings->n_custom_mounts = 0;
3101 }
3102 }
3103
3104 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3105 (settings->private_network >= 0 ||
3106 settings->network_veth >= 0 ||
3107 settings->network_bridge ||
22b28dfd 3108 settings->network_zone ||
f757855e
LP
3109 settings->network_interfaces ||
3110 settings->network_macvlan ||
f6d6bad1
LP
3111 settings->network_ipvlan ||
3112 settings->network_veth_extra)) {
f757855e
LP
3113
3114 if (!arg_settings_trusted)
3115 log_warning("Ignoring network settings, file %s is not trusted.", p);
3116 else {
f6d6bad1 3117 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3118 arg_private_network = settings_private_network(settings);
3119
f757855e
LP
3120 strv_free(arg_network_interfaces);
3121 arg_network_interfaces = settings->network_interfaces;
3122 settings->network_interfaces = NULL;
3123
3124 strv_free(arg_network_macvlan);
3125 arg_network_macvlan = settings->network_macvlan;
3126 settings->network_macvlan = NULL;
3127
3128 strv_free(arg_network_ipvlan);
3129 arg_network_ipvlan = settings->network_ipvlan;
3130 settings->network_ipvlan = NULL;
3131
f6d6bad1
LP
3132 strv_free(arg_network_veth_extra);
3133 arg_network_veth_extra = settings->network_veth_extra;
3134 settings->network_veth_extra = NULL;
3135
f757855e
LP
3136 free(arg_network_bridge);
3137 arg_network_bridge = settings->network_bridge;
3138 settings->network_bridge = NULL;
22b28dfd
LP
3139
3140 free(arg_network_zone);
3141 arg_network_zone = settings->network_zone;
3142 settings->network_zone = NULL;
f757855e
LP
3143 }
3144 }
3145
3146 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3147 settings->expose_ports) {
3148
3149 if (!arg_settings_trusted)
3150 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3151 else {
3152 expose_port_free_all(arg_expose_ports);
3153 arg_expose_ports = settings->expose_ports;
3154 settings->expose_ports = NULL;
3155 }
3156 }
3157
0de7acce
LP
3158 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3159 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3160
3161 if (!arg_settings_trusted)
3162 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3163 else {
3164 arg_userns_mode = settings->userns_mode;
3165 arg_uid_shift = settings->uid_shift;
3166 arg_uid_range = settings->uid_range;
3167 arg_userns_chown = settings->userns_chown;
3168 }
3169 }
3170
9c1e04d0
AP
3171 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3172 arg_notify_ready = settings->notify_ready;
3173
960e4569
LP
3174 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3175
3176 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
3177 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
3178 else {
3179 strv_free(arg_syscall_whitelist);
3180 strv_free(arg_syscall_blacklist);
3181
3182 arg_syscall_whitelist = settings->syscall_whitelist;
3183 arg_syscall_blacklist = settings->syscall_blacklist;
3184
3185 settings->syscall_whitelist = settings->syscall_blacklist = NULL;
3186 }
3187 }
3188
f757855e
LP
3189 return 0;
3190}
3191
b0067625
ZJS
/* Runs one boot cycle of the container.
 *
 * Forks the "outer child" with raw_clone(SIGCHLD|CLONE_NEWNS), which runs outer_child(). The parent
 * then receives over dedicated socket pairs: the UID shift (only with user namespacing), the inner
 * child's PID (stored in *pid), the machine ID and the notification socket. Afterwards it sets up the
 * UID map, networking, machine registration or scope, and cgroups, synchronizing with the child via
 * numbered barriers, and finally runs an sd-event loop (notify socket, signals, rtnl, PTY forwarding)
 * before waiting for the container.
 *
 * Returns a negative value on failure, 0 when we are done (exit status to use stored in *ret), and 1
 * when the container rebooted and the caller shall invoke us again. */
3192static int run(int master,
3193 const char* console,
2d845785 3194 DissectedImage *dissected_image,
b0067625
ZJS
3195 bool interactive,
3196 bool secondary,
3197 FDSet *fds,
3198 char veth_name[IFNAMSIZ], bool *veth_created,
3199 union in_addr_union *exposed,
3200 pid_t *pid, int *ret) {
3201
3202 static const struct sigaction sa = {
3203 .sa_handler = nop_signal_handler,
e28c7cd0 3204 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3205 };
3206
3207 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3208 _cleanup_close_ int etc_passwd_lock = -1;
/* In each pair, index 0 is kept by this (parent) process and index 1 is passed to outer_child(). */
3209 _cleanup_close_pair_ int
3210 kmsg_socket_pair[2] = { -1, -1 },
3211 rtnl_socket_pair[2] = { -1, -1 },
3212 pid_socket_pair[2] = { -1, -1 },
3213 uuid_socket_pair[2] = { -1, -1 },
3214 notify_socket_pair[2] = { -1, -1 },
3215 uid_shift_socket_pair[2] = { -1, -1 };
3216 _cleanup_close_ int notify_socket= -1;
3217 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3218 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3219 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3220 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3221 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3222 ContainerStatus container_status = 0;
3223 char last_char = 0;
3224 int ifi = 0, r;
3225 ssize_t l;
3226 sigset_t mask_chld;
3227
3228 assert_se(sigemptyset(&mask_chld) == 0);
3229 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3230
3231 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3232 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3233 * check with getpwuid() if the specific user already exists. Note that /etc might be
3234 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3235 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3236 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3237 * really ours. */
3238
3239 etc_passwd_lock = take_etc_passwd_lock(NULL);
3240 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3241 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3242 }
3243
3244 r = barrier_create(&barrier);
3245 if (r < 0)
3246 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3247
3248 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3249 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3250
3251 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3252 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3253
3254 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3255 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3256
3257 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3258 return log_error_errno(errno, "Failed to create id socket pair: %m");
3259
3260 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3261 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3262
/* The UID shift channel is only needed (and only created) when user namespacing is in use. */
3263 if (arg_userns_mode != USER_NAMESPACE_NO)
3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3265 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3266
3267 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3268 * parent's blocking calls and give it a chance to call wait() and terminate. */
3269 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3270 if (r < 0)
3271 return log_error_errno(errno, "Failed to change the signal mask: %m");
3272
3273 r = sigaction(SIGCHLD, &sa, NULL);
3274 if (r < 0)
3275 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3276
3277 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3278 if (*pid < 0)
3279 return log_error_errno(errno, "clone() failed%s: %m",
3280 errno == EINVAL ?
3281 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3282
3283 if (*pid == 0) {
3284 /* The outer child only has a file system namespace. */
3285 barrier_set_role(&barrier, BARRIER_CHILD);
3286
/* Close everything that belongs to the parent: the PTY master and the parent ends of all socket pairs. */
3287 master = safe_close(master);
3288
3289 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3290 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3291 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3292 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3293 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3294 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3295
3296 (void) reset_all_signal_handlers();
3297 (void) reset_signal_mask();
3298
3299 r = outer_child(&barrier,
3300 arg_directory,
3301 console,
2d845785 3302 dissected_image,
b0067625
ZJS
3303 interactive,
3304 secondary,
3305 pid_socket_pair[1],
3306 uuid_socket_pair[1],
3307 notify_socket_pair[1],
3308 kmsg_socket_pair[1],
3309 rtnl_socket_pair[1],
3310 uid_shift_socket_pair[1],
3311 fds);
/* The child never returns into the caller's code, it always terminates via _exit() here. */
3312 if (r < 0)
3313 _exit(EXIT_FAILURE);
3314
3315 _exit(EXIT_SUCCESS);
3316 }
3317
/* From here on we are the parent: drop the passed fds and the child ends of all socket pairs. */
3318 barrier_set_role(&barrier, BARRIER_PARENT);
3319
3320 fds = fdset_free(fds);
3321
3322 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3323 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3324 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3325 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3326 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3327 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3328
3329 if (arg_userns_mode != USER_NAMESPACE_NO) {
3330 /* The child just let us know the UID shift it might have read from the image. */
3331 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3332 if (l < 0)
3333 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3334 if (l != sizeof arg_uid_shift) {
3335 log_error("Short read while reading UID shift.");
3336 return -EIO;
3337 }
3338
3339 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3340 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3341 * image, but if that's already in use, pick a new one, and report back to the child,
3342 * which one we now picked. */
3343
3344 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3345 if (r < 0)
3346 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3347
3348 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3349 if (l < 0)
3350 return log_error_errno(errno, "Failed to send UID shift: %m");
3351 if (l != sizeof arg_uid_shift) {
3352 log_error("Short write while writing UID shift.");
3353 return -EIO;
3354 }
3355 }
3356 }
3357
3358 /* Wait for the outer child. */
3359 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
/* Any non-zero result is a failure; a positive one is turned into -EIO. */
3360 if (r != 0)
3361 return r < 0 ? r : -EIO;
3362
3363 /* And now retrieve the PID of the inner child. */
3364 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3365 if (l < 0)
3366 return log_error_errno(errno, "Failed to read inner child PID: %m");
3367 if (l != sizeof *pid) {
3368 log_error("Short read while reading inner child PID.");
3369 return -EIO;
3370 }
3371
3372 /* We also retrieve container UUID in case it was generated by outer child */
3373 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3374 if (l < 0)
3375 return log_error_errno(errno, "Failed to read container machine ID: %m");
3376 if (l != sizeof(arg_uuid)) {
3377 log_error("Short read while reading container machined ID.");
3378 return -EIO;
3379 }
3380
3381 /* We also retrieve the socket used for notifications generated by outer child */
3382 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3383 if (notify_socket < 0)
3384 return log_error_errno(notify_socket,
3385 "Failed to receive notification socket from the outer child: %m");
3386
3387 log_debug("Init process invoked as PID "PID_FMT, *pid);
3388
/* With user namespacing: sync with the child at barrier #1, write its UID/GID maps, then place barrier #2. */
3389 if (arg_userns_mode != USER_NAMESPACE_NO) {
3390 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3391 log_error("Child died too early.");
3392 return -ESRCH;
3393 }
3394
3395 r = setup_uid_map(*pid);
3396 if (r < 0)
3397 return r;
3398
3399 (void) barrier_place(&barrier); /* #2 */
3400 }
3401
3402 if (arg_private_network) {
3403
3404 r = move_network_interfaces(*pid, arg_network_interfaces);
3405 if (r < 0)
3406 return r;
3407
3408 if (arg_network_veth) {
3409 r = setup_veth(arg_machine, *pid, veth_name,
3410 arg_network_bridge || arg_network_zone);
3411 if (r < 0)
3412 return r;
/* A positive result is remembered in ifi, which is later handed to register_machine(). */
3413 else if (r > 0)
3414 ifi = r;
3415
3416 if (arg_network_bridge) {
3417 /* Add the interface to a bridge */
3418 r = setup_bridge(veth_name, arg_network_bridge, false);
3419 if (r < 0)
3420 return r;
3421 if (r > 0)
3422 ifi = r;
3423 } else if (arg_network_zone) {
3424 /* Add the interface to a bridge, possibly creating it */
3425 r = setup_bridge(veth_name, arg_network_zone, true);
3426 if (r < 0)
3427 return r;
3428 if (r > 0)
3429 ifi = r;
3430 }
3431 }
3432
3433 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3434 if (r < 0)
3435 return r;
3436
3437 /* We created the primary and extra veth links now; let's remember this, so that we know to
3438 remove them later on. Note that we don't bother with removing veth links that were created
3439 here when their setup failed half-way, because in that case the kernel should be able to
3440 remove them on its own, since they cannot be referenced by anything yet. */
3441 *veth_created = true;
3442
3443 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3444 if (r < 0)
3445 return r;
3446
3447 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3448 if (r < 0)
3449 return r;
3450 }
3451
/* Three cases: register the machine, or only allocate a scope, or (with arg_keep_unit) do neither. */
3452 if (arg_register) {
3453 r = register_machine(
3454 arg_machine,
3455 *pid,
3456 arg_directory,
3457 arg_uuid,
3458 ifi,
3459 arg_slice,
3460 arg_custom_mounts, arg_n_custom_mounts,
3461 arg_kill_signal,
3462 arg_property,
3463 arg_keep_unit,
3464 arg_container_service_name);
3465 if (r < 0)
3466 return r;
cd2dfc6f
LP
3467 } else if (!arg_keep_unit) {
3468 r = allocate_scope(
3469 arg_machine,
3470 *pid,
3471 arg_slice,
3472 arg_custom_mounts, arg_n_custom_mounts,
3473 arg_kill_signal,
3474 arg_property);
3475 if (r < 0)
3476 return r;
3477
3478 } else if (arg_slice || arg_property)
3479 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3480
f0bef277 3481 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3482 if (r < 0)
3483 return r;
3484
3485 if (arg_keep_unit) {
3486 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3487 if (r < 0)
3488 return r;
3489 }
3490
3491 r = chown_cgroup(*pid, arg_uid_shift);
3492 if (r < 0)
3493 return r;
3494
3495 /* Notify the child that the parent is ready with all
3496 * its setup (including cgroup-ification), and that
3497 * the child can now hand over control to the code to
3498 * run inside the container. */
3499 (void) barrier_place(&barrier); /* #3 */
3500
3501 /* Block SIGCHLD here, before notifying child.
3502 * process_pty() will handle it with the other signals. */
3503 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3504
3505 /* Reset signal to default */
3506 r = default_signals(SIGCHLD, -1);
3507 if (r < 0)
3508 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3509
3510 r = sd_event_new(&event);
3511 if (r < 0)
3512 return log_error_errno(r, "Failed to get default event source: %m");
3513
/* The inner child's PID travels as PID_TO_PTR() encoded userdata; the notify callback uses it to check the sender. */
5773024d 3514 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3515 if (r < 0)
3516 return r;
3517
3518 /* Let the child know that we are ready and wait that the child is completely ready now. */
3519 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3520 log_error("Child died too early.");
3521 return -ESRCH;
3522 }
3523
3524 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3525 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3526 etc_passwd_lock = safe_close(etc_passwd_lock);
3527
3528 sd_notifyf(false,
3529 "STATUS=Container running.\n"
3530 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
/* Unless arg_notify_ready is set we report READY=1 ourselves right away; the notify callback relays the container's own READY=1. */
3531 if (!arg_notify_ready)
3532 sd_notify(false, "READY=1\n");
3533
3534 if (arg_kill_signal > 0) {
3535 /* Try to kill the init system on SIGINT or SIGTERM */
3536 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3537 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3538 } else {
3539 /* Immediately exit */
3540 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3541 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3542 }
3543
6916b164
AU
3544 /* Exit when the child exits */
3545 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3546
3547 if (arg_expose_ports) {
3548 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3549 if (r < 0)
3550 return r;
3551
3552 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3553 }
3554
3555 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3556
3557 r = pty_forward_new(event, master,
3558 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3559 &forward);
3560 if (r < 0)
3561 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3562
3563 r = sd_event_loop(event);
3564 if (r < 0)
3565 return log_error_errno(r, "Failed to run event loop: %m");
3566
3567 pty_forward_get_last_char(forward, &last_char);
3568
3569 forward = pty_forward_free(forward);
3570
/* Make sure our own output ends with a newline if the forwarded output did not. */
3571 if (!arg_quiet && last_char != '\n')
3572 putc('\n', stdout);
3573
3574 /* Kill if it is not dead yet anyway */
3575 if (arg_register && !arg_keep_unit)
3576 terminate_machine(*pid);
3577
3578 /* Normally redundant, but better safe than sorry */
c67b0082 3579 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3580
3581 r = wait_for_container(*pid, &container_status);
/* The PID is reset to 0 regardless of the outcome of the wait. */
3582 *pid = 0;
3583
3584 if (r < 0)
3585 /* We failed to wait for the container, or the container exited abnormally. */
3586 return r;
3587 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3588 /* r > 0 → The container exited with a non-zero status.
3589 * As a special case, we need to replace 133 with a different value,
3590 * because 133 is special-cased in the service file to reboot the container.
3591 * otherwise → The container exited with zero status and a reboot was not requested.
3592 */
2a49b612 3593 if (r == EXIT_FORCE_RESTART)
27e29a1e 3594 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3595 *ret = r;
b0067625
ZJS
3596 return 0; /* finito */
3597 }
3598
3599 /* CONTAINER_REBOOTED, loop again */
3600
3601 if (arg_keep_unit) {
3602 /* Special handling if we are running as a service: instead of simply
3603 * restarting the machine we want to restart the entire service, so let's
3604 * inform systemd about this with the special exit code 133. The service
3605 * file uses RestartForceExitStatus=133 so that this results in a full
3606 * nspawn restart. This is necessary since we might have cgroup parameters
3607 * set we want to have flushed out. */
2a49b612
ZJS
3608 *ret = EXIT_FORCE_RESTART;
3609 return 0; /* finito */
b0067625
ZJS
3610 }
3611
/* Undo the per-boot network state before the caller starts the next cycle. */
3612 expose_port_flush(arg_expose_ports, exposed);
3613
3614 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3615 *veth_created = false;
3616 return 1; /* loop again */
3617}
3618
03cfe0d5
LP
3619int main(int argc, char *argv[]) {
3620
2d845785
LP
3621 _cleanup_free_ char *console = NULL;
3622 _cleanup_close_ int master = -1;
03cfe0d5 3623 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 3624 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 3625 char veth_name[IFNAMSIZ] = "";
17cbb288 3626 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 3627 pid_t pid = 0;
03cfe0d5
LP
3628 union in_addr_union exposed = {};
3629 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
3630 bool interactive, veth_created = false, remove_tmprootdir = false;
3631 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 3632 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
3633 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3634 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
3635
3636 log_parse_environment();
3637 log_open();
415fc41c 3638
7732f92b
LP
3639 /* Make sure rename_process() in the stub init process can work */
3640 saved_argv = argv;
3641 saved_argc = argc;
3642
03cfe0d5
LP
3643 r = parse_argv(argc, argv);
3644 if (r <= 0)
3645 goto finish;
3646
03cfe0d5
LP
3647 if (geteuid() != 0) {
3648 log_error("Need to be root.");
3649 r = -EPERM;
3650 goto finish;
3651 }
f757855e
LP
3652 r = determine_names();
3653 if (r < 0)
3654 goto finish;
3655
3656 r = load_settings();
3657 if (r < 0)
3658 goto finish;
3659
3660 r = verify_arguments();
3661 if (r < 0)
3662 goto finish;
03cfe0d5
LP
3663
3664 n_fd_passed = sd_listen_fds(false);
3665 if (n_fd_passed > 0) {
3666 r = fdset_new_listen_fds(&fds, false);
3667 if (r < 0) {
3668 log_error_errno(r, "Failed to collect file descriptors: %m");
3669 goto finish;
3670 }
3671 }
3672
3673 if (arg_directory) {
3674 assert(!arg_image);
3675
3676 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3677 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3678 r = -EINVAL;
3679 goto finish;
3680 }
3681
3682 if (arg_ephemeral) {
3683 _cleanup_free_ char *np = NULL;
3684
8d4aa2bb 3685 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
3686 if (r < 0)
3687 goto finish;
3688
03cfe0d5
LP
3689 /* If the specified path is a mount point we
3690 * generate the new snapshot immediately
3691 * inside it under a random name. However if
3692 * the specified is not a mount point we
3693 * create the new snapshot in the parent
3694 * directory, just next to it. */
e1873695 3695 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
3696 if (r < 0) {
3697 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3698 goto finish;
3699 }
3700 if (r > 0)
770b5ce4 3701 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3702 else
770b5ce4 3703 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 3704 if (r < 0) {
0f3be6ca 3705 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
3706 goto finish;
3707 }
3708
3709 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3710 if (r < 0) {
3711 log_error_errno(r, "Failed to lock %s: %m", np);
3712 goto finish;
3713 }
3714
17cbb288
LP
3715 r = btrfs_subvol_snapshot(arg_directory, np,
3716 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3717 BTRFS_SNAPSHOT_FALLBACK_COPY |
3718 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3719 BTRFS_SNAPSHOT_RECURSIVE |
3720 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3721 if (r < 0) {
3722 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3723 goto finish;
ec16945e
LP
3724 }
3725
3726 free(arg_directory);
3727 arg_directory = np;
8a16a7b4 3728 np = NULL;
ec16945e 3729
17cbb288 3730 remove_directory = true;
30535c16
LP
3731
3732 } else {
cb638b5e 3733 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
3734 if (r < 0)
3735 goto finish;
3736
30535c16
LP
3737 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3738 if (r == -EBUSY) {
3739 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3740 goto finish;
3741 }
3742 if (r < 0) {
3743 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3744 goto finish;
30535c16
LP
3745 }
3746
3747 if (arg_template) {
8d4aa2bb 3748 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
3749 if (r < 0)
3750 goto finish;
3751
17cbb288
LP
3752 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3753 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3754 BTRFS_SNAPSHOT_FALLBACK_COPY |
3755 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3756 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3757 BTRFS_SNAPSHOT_RECURSIVE |
3758 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3759 if (r == -EEXIST) {
3760 if (!arg_quiet)
3761 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3762 } else if (r < 0) {
83521414 3763 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3764 goto finish;
3765 } else {
3766 if (!arg_quiet)
3767 log_info("Populated %s from template %s.", arg_directory, arg_template);
3768 }
3769 }
ec16945e
LP
3770 }
3771
7732f92b 3772 if (arg_start_mode == START_BOOT) {
1b9e5b12 3773 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3774 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3775 r = -EINVAL;
1b9e5b12
LP
3776 goto finish;
3777 }
3778 } else {
3779 const char *p;
3780
16fb773e
LP
3781 p = strjoina(arg_directory, "/usr/");
3782 if (laccess(p, F_OK) < 0) {
3783 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3784 r = -EINVAL;
1b9e5b12 3785 goto finish;
1b9e5b12
LP
3786 }
3787 }
ec16945e 3788
6b9132a9 3789 } else {
ec16945e
LP
3790 assert(arg_image);
3791 assert(!arg_template);
3792
8d4aa2bb 3793 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
3794 if (r < 0)
3795 goto finish;
3796
0f3be6ca
LP
3797 if (arg_ephemeral) {
3798 _cleanup_free_ char *np = NULL;
3799
3800 r = tempfn_random(arg_image, "machine.", &np);
3801 if (r < 0) {
3802 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3803 goto finish;
3804 }
3805
3806 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3807 if (r < 0) {
3808 r = log_error_errno(r, "Failed to create image lock: %m");
3809 goto finish;
3810 }
3811
1c876927 3812 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
3813 if (r < 0) {
3814 r = log_error_errno(r, "Failed to copy image file: %m");
3815 goto finish;
3816 }
3817
3818 free(arg_image);
3819 arg_image = np;
3820 np = NULL;
3821
3822 remove_image = true;
3823 } else {
3824 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3825 if (r == -EBUSY) {
3826 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3827 goto finish;
3828 }
3829 if (r < 0) {
3830 r = log_error_errno(r, "Failed to create image lock: %m");
3831 goto finish;
3832 }
4623e8e6 3833
78ebe980
LP
3834 if (!arg_root_hash) {
3835 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3836 if (r < 0) {
3837 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3838 goto finish;
3839 }
3840 }
30535c16
LP
3841 }
3842
c67b0082 3843 if (!mkdtemp(tmprootdir)) {
0f3be6ca 3844 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 3845 goto finish;
1b9e5b12 3846 }
6b9132a9 3847
c67b0082
LP
3848 remove_tmprootdir = true;
3849
3850 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
3851 if (!arg_directory) {
3852 r = log_oom();
3853 goto finish;
6b9132a9 3854 }
88213476 3855
2d845785
LP
3856 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3857 if (r < 0) {
3858 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
3859 goto finish;
3860 }
1b9e5b12 3861
e0f9e7bd
LP
3862 r = dissect_image(
3863 loop->fd,
3864 arg_root_hash, arg_root_hash_size,
3865 DISSECT_IMAGE_REQUIRE_ROOT,
3866 &dissected_image);
2d845785
LP
3867 if (r == -ENOPKG) {
3868 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3869
3870 log_notice("Note that the disk image needs to\n"
3871 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3872 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3873 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3874 " d) or contain a file system without a partition table\n"
3875 "in order to be bootable with systemd-nspawn.");
1b9e5b12 3876 goto finish;
2d845785 3877 }
4623e8e6
LP
3878 if (r == -EADDRNOTAVAIL) {
3879 log_error_errno(r, "No root partition for specified root hash found.");
3880 goto finish;
3881 }
2d845785
LP
3882 if (r == -EOPNOTSUPP) {
3883 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3884 goto finish;
3885 }
3886 if (r < 0) {
3887 log_error_errno(r, "Failed to dissect image: %m");
842f3b0f
LP
3888 goto finish;
3889 }
1b9e5b12 3890
4623e8e6
LP
3891 if (!arg_root_hash && dissected_image->can_verity)
3892 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3893
3894 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
3895 if (r < 0)
3896 goto finish;
0f3be6ca
LP
3897
3898 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3899 if (remove_image && unlink(arg_image) >= 0)
3900 remove_image = false;
842f3b0f 3901 }
842f3b0f 3902
86c0dd4a 3903 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
3904 if (r < 0)
3905 goto finish;
bd15ab41
TH
3906
3907 r = detect_unified_cgroup_hierarchy(arg_directory);
3908 if (r < 0)
3909 goto finish;
5a8af538 3910
03cfe0d5
LP
3911 interactive =
3912 isatty(STDIN_FILENO) > 0 &&
3913 isatty(STDOUT_FILENO) > 0;
9c857b9d 3914
db7feb7e
LP
3915 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3916 if (master < 0) {
ec16945e 3917 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3918 goto finish;
3919 }
3920
611b312b
LP
3921 r = ptsname_malloc(master, &console);
3922 if (r < 0) {
3923 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3924 goto finish;
68b02049
DW
3925 }
3926
3927 if (arg_selinux_apifs_context) {
3928 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3929 if (r < 0)
3930 goto finish;
a258bf26
LP
3931 }
3932
a258bf26 3933 if (unlockpt(master) < 0) {
ec16945e 3934 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3935 goto finish;
3936 }
3937
9c857b9d
LP
3938 if (!arg_quiet)
3939 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3940 arg_machine, arg_image ?: arg_directory);
3941
72c0a2c2 3942 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3943
03cfe0d5
LP
3944 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3945 r = log_error_errno(errno, "Failed to become subreaper: %m");
3946 goto finish;
3947 }
3948
d87be9b0 3949 for (;;) {
b0067625
ZJS
3950 r = run(master,
3951 console,
2d845785 3952 dissected_image,
b0067625
ZJS
3953 interactive, secondary,
3954 fds,
3955 veth_name, &veth_created,
3956 &exposed,
3957 &pid, &ret);
3958 if (r <= 0)
d87be9b0 3959 break;
d87be9b0 3960 }
88213476
LP
3961
3962finish:
af4ec430 3963 sd_notify(false,
2a49b612
ZJS
3964 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3965 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 3966
9444b1f2 3967 if (pid > 0)
c67b0082 3968 (void) kill(pid, SIGKILL);
88213476 3969
503546da 3970 /* Try to flush whatever is still queued in the pty */
6a0f896b 3971 if (master >= 0) {
1c876927 3972 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
3973 master = safe_close(master);
3974 }
3975
3976 if (pid > 0)
3977 (void) wait_for_terminate(pid, NULL);
503546da 3978
17cbb288 3979 if (remove_directory && arg_directory) {
ec16945e
LP
3980 int k;
3981
17cbb288 3982 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 3983 if (k < 0)
17cbb288 3984 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
3985 }
3986
0f3be6ca
LP
3987 if (remove_image && arg_image) {
3988 if (unlink(arg_image) < 0)
3989 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3990 }
3991
c67b0082
LP
3992 if (remove_tmprootdir) {
3993 if (rmdir(tmprootdir) < 0)
3994 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3995 }
3996
785890ac
LP
3997 if (arg_machine) {
3998 const char *p;
3999
63c372cb 4000 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4001 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4002 }
4003
7a8f6325 4004 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4005
4006 if (veth_created)
4007 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4008 (void) remove_bridge(arg_network_zone);
f757855e 4009
04d391da 4010 free(arg_directory);
ec16945e
LP
4011 free(arg_template);
4012 free(arg_image);
7027ff61 4013 free(arg_machine);
c74e630d 4014 free(arg_user);
b53ede69
PW
4015 free(arg_pivot_root_new);
4016 free(arg_pivot_root_old);
5f932eb9 4017 free(arg_chdir);
c74e630d 4018 strv_free(arg_setenv);
f757855e 4019 free(arg_network_bridge);
c74e630d
LP
4020 strv_free(arg_network_interfaces);
4021 strv_free(arg_network_macvlan);
4bbfe7ad 4022 strv_free(arg_network_ipvlan);
f6d6bad1 4023 strv_free(arg_network_veth_extra);
f757855e
LP
4024 strv_free(arg_parameters);
4025 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4026 expose_port_free_all(arg_expose_ports);
4623e8e6 4027 free(arg_root_hash);
6d0b55c2 4028
ec16945e 4029 return r < 0 ? EXIT_FAILURE : ret;
88213476 4030}