]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
tree-wide: replace all readdir cycles with FOREACH_DIRENT{,_ALL} (#4853)
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
f97b34a6 60#include "format-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
0e7ac751 103/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
0e7ac751
LP
106#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 108
9c1e04d0
AP
/* nspawn listens on the socket whose path is given by NSPAWN_NOTIFY_SOCKET_PATH.
 * The path is relative to the container's root.
 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
2a49b612
ZJS
114#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* Final state reported for a container run: either it terminated or it rebooted.
 * NOTE(review): the consumers of this enum are not visible in this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
120
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-guest" and
 * "try-host" spellings map to LINK_GUEST/LINK_HOST with arg_link_journal_try set. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3,
} LinkJournal;
88213476
LP
127
128static char *arg_directory = NULL;
ec16945e 129static char *arg_template = NULL;
5f932eb9 130static char *arg_chdir = NULL;
687d0825 131static char *arg_user = NULL;
9444b1f2 132static sd_id128_t arg_uuid = {};
7027ff61 133static char *arg_machine = NULL;
c74e630d
LP
134static const char *arg_selinux_context = NULL;
135static const char *arg_selinux_apifs_context = NULL;
9444b1f2 136static const char *arg_slice = NULL;
ff01d048 137static bool arg_private_network = false;
bc2f673e 138static bool arg_read_only = false;
7732f92b 139static StartMode arg_start_mode = START_PID1;
ec16945e 140static bool arg_ephemeral = false;
57fb9fb5 141static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 142static bool arg_link_journal_try = false;
520e0d54 143static uint64_t arg_caps_retain =
50b52222
LP
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 155 (1ULL << CAP_MKNOD) |
5076f0cc
LP
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
5076f0cc 159 (1ULL << CAP_SETFCAP) |
50b52222 160 (1ULL << CAP_SETGID) |
5076f0cc
LP
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
50b52222 164 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
50b52222 169 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
170static CustomMount *arg_custom_mounts = NULL;
171static unsigned arg_n_custom_mounts = 0;
f4889f65 172static char **arg_setenv = NULL;
284c0b91 173static bool arg_quiet = false;
eb91eb18 174static bool arg_register = true;
89f7c846 175static bool arg_keep_unit = false;
aa28aefe 176static char **arg_network_interfaces = NULL;
c74e630d 177static char **arg_network_macvlan = NULL;
4bbfe7ad 178static char **arg_network_ipvlan = NULL;
69c79d3c 179static bool arg_network_veth = false;
f6d6bad1 180static char **arg_network_veth_extra = NULL;
f757855e 181static char *arg_network_bridge = NULL;
22b28dfd 182static char *arg_network_zone = NULL;
050f7277 183static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 184static char *arg_image = NULL;
f757855e 185static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 186static ExposePort *arg_expose_ports = NULL;
f36933fe 187static char **arg_property = NULL;
0de7acce 188static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 189static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 190static bool arg_userns_chown = false;
c6c8f6e2 191static int arg_kill_signal = 0;
5da38d07 192static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
193static SettingsMask arg_settings_mask = 0;
194static int arg_settings_trusted = -1;
195static char **arg_parameters = NULL;
6aadfa4c 196static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 197static bool arg_notify_ready = false;
5a8ff0e6 198static bool arg_use_cgns = true;
0c582db0 199static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 200static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
88213476 201
601185b4 202static void help(void) {
88213476
LP
203 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
204 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
205 " -h --help Show this help\n"
206 " --version Print version string\n"
69c79d3c 207 " -q --quiet Do not show status information\n"
1b9e5b12 208 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
209 " --template=PATH Initialize root directory from template directory,\n"
210 " if missing\n"
211 " -x --ephemeral Run container with snapshot of root directory, and\n"
212 " remove it after exit\n"
213 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 214 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 215 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 216 " --chdir=PATH Set working directory in the container\n"
a8828ed9 217 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 218 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 219 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 220 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 221 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 222 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 223 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 224 " Similar, but with user configured UID/GID range\n"
24597ee0 225 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
226 " --private-network Disable network in container\n"
227 " --network-interface=INTERFACE\n"
228 " Assign an existing network interface to the\n"
229 " container\n"
c74e630d
LP
230 " --network-macvlan=INTERFACE\n"
231 " Create a macvlan network interface based on an\n"
232 " existing network interface to the container\n"
4bbfe7ad
TG
233 " --network-ipvlan=INTERFACE\n"
234 " Create a ipvlan network interface based on an\n"
235 " existing network interface to the container\n"
a8eaaee7 236 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 237 " and container\n"
f6d6bad1
LP
238 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
239 " Add an additional virtual Ethernet link between\n"
240 " host and container\n"
ab046dde 241 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
242 " Add a virtual Ethernet connection to the container\n"
243 " and attach it to an existing bridge on the host\n"
244 " --network-zone=NAME Similar, but attach the new interface to an\n"
245 " an automatically managed bridge interface\n"
6d0b55c2 246 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 247 " Expose a container IP port on the host\n"
82adf6af
LP
248 " -Z --selinux-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " processes in the container\n"
251 " -L --selinux-apifs-context=SECLABEL\n"
252 " Set the SELinux security context to be used by\n"
253 " API/tmpfs file systems in the container\n"
a8828ed9
DW
254 " --capability=CAP In addition to the default, retain specified\n"
255 " capability\n"
256 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 257 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
258 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
259 " host, try-guest, try-host\n"
574edc90 260 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 261 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
262 " --bind=PATH[:PATH[:OPTIONS]]\n"
263 " Bind mount a file or directory from the host into\n"
a8828ed9 264 " the container\n"
5e5bfa6e
EY
265 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
266 " Similar, but creates a read-only bind mount\n"
06c17c39 267 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
268 " --overlay=PATH[:PATH...]:PATH\n"
269 " Create an overlay mount from the host to \n"
270 " the container\n"
271 " --overlay-ro=PATH[:PATH...]:PATH\n"
272 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 273 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 274 " --register=BOOLEAN Register container as machine\n"
89f7c846 275 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 276 " the service unit nspawn is running in\n"
6d0b55c2 277 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 278 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 279 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 280 , program_invocation_short_name);
88213476
LP
281}
282
86c0dd4a 283static int custom_mount_check_all(void) {
5a8af538 284 unsigned i;
5a8af538 285
5a8af538
LP
286 for (i = 0; i < arg_n_custom_mounts; i++) {
287 CustomMount *m = &arg_custom_mounts[i];
288
0de7acce 289 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
290
291 if (arg_userns_chown) {
292 log_error("--private-users-chown may not be combined with custom root mounts.");
293 return -EINVAL;
294 } else if (arg_uid_shift == UID_INVALID) {
295 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
296 return -EINVAL;
297 }
825d5287 298 }
5a8af538
LP
299 }
300
301 return 0;
302}
303
0fd9563f 304static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 305 const char *e;
5da38d07
TH
306 int r, all_unified, systemd_unified;
307
efdb0237
LP
308 /* Allow the user to control whether the unified hierarchy is used */
309 e = getenv("UNIFIED_CGROUP_HIERARCHY");
310 if (e) {
311 r = parse_boolean(e);
312 if (r < 0)
313 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
314 if (r > 0)
315 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
316 else
317 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 318
efdb0237
LP
319 return 0;
320 }
321
98afd6af
ZJS
322 all_unified = cg_all_unified();
323 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
324
325 if (all_unified < 0 || systemd_unified < 0)
326 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
327 "Failed to determine whether the unified cgroups hierarchy is used: %m");
328
efdb0237 329 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
330 if (all_unified > 0) {
331 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
332 * routine only detects 231, so we'll have a false negative here for 230. */
333 r = systemd_installation_has_version(directory, 230);
334 if (r < 0)
335 return log_error_errno(r, "Failed to determine systemd version in container: %m");
336 if (r > 0)
337 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
338 else
339 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
340 } else if (systemd_unified > 0) {
341 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
342 r = systemd_installation_has_version(directory, 232);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine systemd version in container: %m");
345 if (r > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
347 else
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
349 } else
5da38d07 350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 351
efdb0237
LP
352 return 0;
353}
354
0c582db0
LB
355static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
356 int r;
357
358 r = getenv_bool(name);
359 if (r == -ENXIO)
360 return;
361 if (r < 0)
362 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
363 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
364}
365
4f086aab
SU
366static void parse_mount_settings_env(void) {
367 int r;
368 const char *e;
369
370 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
371 if (!e)
372 return;
373
374 if (streq(e, "network")) {
375 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
376 return;
377 }
378
379 r = parse_boolean(e);
380 if (r < 0) {
381 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
382 return;
383 } else if (r > 0)
384 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
385 else
386 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
387
388 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
389}
390
88213476
LP
391static int parse_argv(int argc, char *argv[]) {
392
a41fe3a2 393 enum {
acbeb427
ZJS
394 ARG_VERSION = 0x100,
395 ARG_PRIVATE_NETWORK,
bc2f673e 396 ARG_UUID,
5076f0cc 397 ARG_READ_ONLY,
57fb9fb5 398 ARG_CAPABILITY,
420c7379 399 ARG_DROP_CAPABILITY,
17fe0523
LP
400 ARG_LINK_JOURNAL,
401 ARG_BIND,
f4889f65 402 ARG_BIND_RO,
06c17c39 403 ARG_TMPFS,
5a8af538
LP
404 ARG_OVERLAY,
405 ARG_OVERLAY_RO,
eb91eb18 406 ARG_SHARE_SYSTEM,
89f7c846 407 ARG_REGISTER,
aa28aefe 408 ARG_KEEP_UNIT,
69c79d3c 409 ARG_NETWORK_INTERFACE,
c74e630d 410 ARG_NETWORK_MACVLAN,
4bbfe7ad 411 ARG_NETWORK_IPVLAN,
ab046dde 412 ARG_NETWORK_BRIDGE,
22b28dfd 413 ARG_NETWORK_ZONE,
f6d6bad1 414 ARG_NETWORK_VETH_EXTRA,
6afc95b7 415 ARG_PERSONALITY,
4d9f07b4 416 ARG_VOLATILE,
ec16945e 417 ARG_TEMPLATE,
f36933fe 418 ARG_PROPERTY,
6dac160c 419 ARG_PRIVATE_USERS,
c6c8f6e2 420 ARG_KILL_SIGNAL,
f757855e 421 ARG_SETTINGS,
5f932eb9 422 ARG_CHDIR,
7336138e 423 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 424 ARG_NOTIFY_READY,
a41fe3a2
LP
425 };
426
88213476 427 static const struct option options[] = {
27eb8e90
ZJS
428 { "help", no_argument, NULL, 'h' },
429 { "version", no_argument, NULL, ARG_VERSION },
430 { "directory", required_argument, NULL, 'D' },
431 { "template", required_argument, NULL, ARG_TEMPLATE },
432 { "ephemeral", no_argument, NULL, 'x' },
433 { "user", required_argument, NULL, 'u' },
434 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
435 { "as-pid2", no_argument, NULL, 'a' },
436 { "boot", no_argument, NULL, 'b' },
437 { "uuid", required_argument, NULL, ARG_UUID },
438 { "read-only", no_argument, NULL, ARG_READ_ONLY },
439 { "capability", required_argument, NULL, ARG_CAPABILITY },
440 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
441 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
442 { "bind", required_argument, NULL, ARG_BIND },
443 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
444 { "tmpfs", required_argument, NULL, ARG_TMPFS },
445 { "overlay", required_argument, NULL, ARG_OVERLAY },
446 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
447 { "machine", required_argument, NULL, 'M' },
448 { "slice", required_argument, NULL, 'S' },
449 { "setenv", required_argument, NULL, 'E' },
450 { "selinux-context", required_argument, NULL, 'Z' },
451 { "selinux-apifs-context", required_argument, NULL, 'L' },
452 { "quiet", no_argument, NULL, 'q' },
453 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
454 { "register", required_argument, NULL, ARG_REGISTER },
455 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
456 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
457 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
458 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
459 { "network-veth", no_argument, NULL, 'n' },
460 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
461 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
462 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
463 { "personality", required_argument, NULL, ARG_PERSONALITY },
464 { "image", required_argument, NULL, 'i' },
465 { "volatile", optional_argument, NULL, ARG_VOLATILE },
466 { "port", required_argument, NULL, 'p' },
467 { "property", required_argument, NULL, ARG_PROPERTY },
468 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
469 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
470 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
471 { "settings", required_argument, NULL, ARG_SETTINGS },
472 { "chdir", required_argument, NULL, ARG_CHDIR },
473 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 474 {}
88213476
LP
475 };
476
9444b1f2 477 int c, r;
6aadfa4c 478 const char *p, *e;
a42c8b54 479 uint64_t plus = 0, minus = 0;
f757855e 480 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
481
482 assert(argc >= 0);
483 assert(argv);
484
19aac838 485 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
486
487 switch (c) {
488
489 case 'h':
601185b4
ZJS
490 help();
491 return 0;
88213476 492
acbeb427 493 case ARG_VERSION:
3f6fd1ba 494 return version();
acbeb427 495
88213476 496 case 'D':
0f03c2a4 497 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 498 if (r < 0)
0f03c2a4 499 return r;
ec16945e
LP
500 break;
501
502 case ARG_TEMPLATE:
0f03c2a4 503 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 504 if (r < 0)
0f03c2a4 505 return r;
88213476
LP
506 break;
507
1b9e5b12 508 case 'i':
0f03c2a4 509 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 510 if (r < 0)
0f03c2a4 511 return r;
ec16945e
LP
512 break;
513
514 case 'x':
515 arg_ephemeral = true;
1b9e5b12
LP
516 break;
517
687d0825 518 case 'u':
2fc09a9c
DM
519 r = free_and_strdup(&arg_user, optarg);
520 if (r < 0)
7027ff61 521 return log_oom();
687d0825 522
f757855e 523 arg_settings_mask |= SETTING_USER;
687d0825
MV
524 break;
525
22b28dfd
LP
526 case ARG_NETWORK_ZONE: {
527 char *j;
528
529 j = strappend("vz-", optarg);
530 if (!j)
531 return log_oom();
532
533 if (!ifname_valid(j)) {
534 log_error("Network zone name not valid: %s", j);
535 free(j);
536 return -EINVAL;
537 }
538
539 free(arg_network_zone);
540 arg_network_zone = j;
541
542 arg_network_veth = true;
543 arg_private_network = true;
544 arg_settings_mask |= SETTING_NETWORK;
545 break;
546 }
547
ab046dde 548 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
549
550 if (!ifname_valid(optarg)) {
551 log_error("Bridge interface name not valid: %s", optarg);
552 return -EINVAL;
553 }
554
f757855e
LP
555 r = free_and_strdup(&arg_network_bridge, optarg);
556 if (r < 0)
557 return log_oom();
ab046dde
TG
558
559 /* fall through */
560
0dfaa006 561 case 'n':
69c79d3c
LP
562 arg_network_veth = true;
563 arg_private_network = true;
f757855e 564 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
565 break;
566
f6d6bad1
LP
567 case ARG_NETWORK_VETH_EXTRA:
568 r = veth_extra_parse(&arg_network_veth_extra, optarg);
569 if (r < 0)
570 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
571
572 arg_private_network = true;
573 arg_settings_mask |= SETTING_NETWORK;
574 break;
575
aa28aefe 576 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
577
578 if (!ifname_valid(optarg)) {
579 log_error("Network interface name not valid: %s", optarg);
580 return -EINVAL;
581 }
582
c74e630d
LP
583 if (strv_extend(&arg_network_interfaces, optarg) < 0)
584 return log_oom();
585
586 arg_private_network = true;
f757855e 587 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
588 break;
589
590 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
591
592 if (!ifname_valid(optarg)) {
593 log_error("MACVLAN network interface name not valid: %s", optarg);
594 return -EINVAL;
595 }
596
c74e630d 597 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
598 return log_oom();
599
4bbfe7ad 600 arg_private_network = true;
f757855e 601 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
602 break;
603
604 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
605
606 if (!ifname_valid(optarg)) {
607 log_error("IPVLAN network interface name not valid: %s", optarg);
608 return -EINVAL;
609 }
610
4bbfe7ad
TG
611 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
612 return log_oom();
613
aa28aefe
LP
614 /* fall through */
615
ff01d048
LP
616 case ARG_PRIVATE_NETWORK:
617 arg_private_network = true;
f757855e 618 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
619 break;
620
0f0dbc46 621 case 'b':
7732f92b
LP
622 if (arg_start_mode == START_PID2) {
623 log_error("--boot and --as-pid2 may not be combined.");
624 return -EINVAL;
625 }
626
627 arg_start_mode = START_BOOT;
628 arg_settings_mask |= SETTING_START_MODE;
629 break;
630
631 case 'a':
632 if (arg_start_mode == START_BOOT) {
633 log_error("--boot and --as-pid2 may not be combined.");
634 return -EINVAL;
635 }
636
637 arg_start_mode = START_PID2;
638 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
639 break;
640
144f0fc0 641 case ARG_UUID:
9444b1f2 642 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
643 if (r < 0)
644 return log_error_errno(r, "Invalid UUID: %s", optarg);
645
646 if (sd_id128_is_null(arg_uuid)) {
647 log_error("Machine UUID may not be all zeroes.");
648 return -EINVAL;
aa96c6cb 649 }
f757855e
LP
650
651 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 652 break;
aa96c6cb 653
9444b1f2 654 case 'S':
c74e630d 655 arg_slice = optarg;
144f0fc0
LP
656 break;
657
7027ff61 658 case 'M':
c1521918 659 if (isempty(optarg))
97b11eed 660 arg_machine = mfree(arg_machine);
c1521918 661 else {
0c3c4284 662 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
663 log_error("Invalid machine name: %s", optarg);
664 return -EINVAL;
665 }
7027ff61 666
0c3c4284
LP
667 r = free_and_strdup(&arg_machine, optarg);
668 if (r < 0)
eb91eb18
LP
669 return log_oom();
670
671 break;
672 }
7027ff61 673
82adf6af
LP
674 case 'Z':
675 arg_selinux_context = optarg;
a8828ed9
DW
676 break;
677
82adf6af
LP
678 case 'L':
679 arg_selinux_apifs_context = optarg;
a8828ed9
DW
680 break;
681
bc2f673e
LP
682 case ARG_READ_ONLY:
683 arg_read_only = true;
f757855e 684 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
685 break;
686
420c7379
LP
687 case ARG_CAPABILITY:
688 case ARG_DROP_CAPABILITY: {
6cbe4ed1 689 p = optarg;
9ed794a3 690 for (;;) {
6cbe4ed1 691 _cleanup_free_ char *t = NULL;
5076f0cc 692
6cbe4ed1
SS
693 r = extract_first_word(&p, &t, ",", 0);
694 if (r < 0)
695 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 696
6cbe4ed1
SS
697 if (r == 0)
698 break;
5076f0cc 699
39ed67d1
LP
700 if (streq(t, "all")) {
701 if (c == ARG_CAPABILITY)
a42c8b54 702 plus = (uint64_t) -1;
39ed67d1 703 else
a42c8b54 704 minus = (uint64_t) -1;
39ed67d1 705 } else {
2822da4f
LP
706 int cap;
707
708 cap = capability_from_name(t);
709 if (cap < 0) {
39ed67d1
LP
710 log_error("Failed to parse capability %s.", t);
711 return -EINVAL;
712 }
713
714 if (c == ARG_CAPABILITY)
a42c8b54 715 plus |= 1ULL << (uint64_t) cap;
39ed67d1 716 else
a42c8b54 717 minus |= 1ULL << (uint64_t) cap;
5076f0cc 718 }
5076f0cc
LP
719 }
720
f757855e 721 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
722 break;
723 }
724
57fb9fb5
LP
725 case 'j':
726 arg_link_journal = LINK_GUEST;
574edc90 727 arg_link_journal_try = true;
57fb9fb5
LP
728 break;
729
730 case ARG_LINK_JOURNAL:
53e438e3 731 if (streq(optarg, "auto")) {
57fb9fb5 732 arg_link_journal = LINK_AUTO;
53e438e3
LP
733 arg_link_journal_try = false;
734 } else if (streq(optarg, "no")) {
57fb9fb5 735 arg_link_journal = LINK_NO;
53e438e3
LP
736 arg_link_journal_try = false;
737 } else if (streq(optarg, "guest")) {
57fb9fb5 738 arg_link_journal = LINK_GUEST;
53e438e3
LP
739 arg_link_journal_try = false;
740 } else if (streq(optarg, "host")) {
57fb9fb5 741 arg_link_journal = LINK_HOST;
53e438e3
LP
742 arg_link_journal_try = false;
743 } else if (streq(optarg, "try-guest")) {
574edc90
MP
744 arg_link_journal = LINK_GUEST;
745 arg_link_journal_try = true;
746 } else if (streq(optarg, "try-host")) {
747 arg_link_journal = LINK_HOST;
748 arg_link_journal_try = true;
749 } else {
57fb9fb5
LP
750 log_error("Failed to parse link journal mode %s", optarg);
751 return -EINVAL;
752 }
753
754 break;
755
17fe0523 756 case ARG_BIND:
f757855e
LP
757 case ARG_BIND_RO:
758 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
759 if (r < 0)
760 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 761
f757855e 762 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 763 break;
06c17c39 764
f757855e
LP
765 case ARG_TMPFS:
766 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
767 if (r < 0)
768 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 769
f757855e 770 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 771 break;
5a8af538
LP
772
773 case ARG_OVERLAY:
ad85779a
LP
774 case ARG_OVERLAY_RO:
775 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
776 if (r == -EADDRNOTAVAIL)
777 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
778 if (r < 0)
779 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 780
f757855e 781 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 782 break;
06c17c39 783
a5f1cb3b 784 case 'E': {
f4889f65
LP
785 char **n;
786
787 if (!env_assignment_is_valid(optarg)) {
788 log_error("Environment variable assignment '%s' is not valid.", optarg);
789 return -EINVAL;
790 }
791
792 n = strv_env_set(arg_setenv, optarg);
793 if (!n)
794 return log_oom();
795
796 strv_free(arg_setenv);
797 arg_setenv = n;
f757855e
LP
798
799 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
800 break;
801 }
802
284c0b91
LP
803 case 'q':
804 arg_quiet = true;
805 break;
806
8a96d94e 807 case ARG_SHARE_SYSTEM:
a6b5216c 808 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
809 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
810 arg_clone_ns_flags = 0;
8a96d94e
LP
811 break;
812
eb91eb18
LP
813 case ARG_REGISTER:
814 r = parse_boolean(optarg);
815 if (r < 0) {
816 log_error("Failed to parse --register= argument: %s", optarg);
817 return r;
818 }
819
820 arg_register = r;
821 break;
822
89f7c846
LP
823 case ARG_KEEP_UNIT:
824 arg_keep_unit = true;
825 break;
826
6afc95b7
LP
827 case ARG_PERSONALITY:
828
ac45f971 829 arg_personality = personality_from_string(optarg);
050f7277 830 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
831 log_error("Unknown or unsupported personality '%s'.", optarg);
832 return -EINVAL;
833 }
834
f757855e 835 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
836 break;
837
4d9f07b4
LP
838 case ARG_VOLATILE:
839
840 if (!optarg)
f757855e 841 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 842 else {
f757855e 843 VolatileMode m;
4d9f07b4 844
f757855e
LP
845 m = volatile_mode_from_string(optarg);
846 if (m < 0) {
847 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 848 return -EINVAL;
f757855e
LP
849 } else
850 arg_volatile_mode = m;
6d0b55c2
LP
851 }
852
f757855e
LP
853 arg_settings_mask |= SETTING_VOLATILE_MODE;
854 break;
6d0b55c2 855
f757855e
LP
856 case 'p':
857 r = expose_port_parse(&arg_expose_ports, optarg);
858 if (r == -EEXIST)
859 return log_error_errno(r, "Duplicate port specification: %s", optarg);
860 if (r < 0)
861 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 862
f757855e 863 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 864 break;
6d0b55c2 865
f36933fe
LP
866 case ARG_PROPERTY:
867 if (strv_extend(&arg_property, optarg) < 0)
868 return log_oom();
869
870 break;
871
ae209204
ZJS
872 case ARG_PRIVATE_USERS: {
873 int boolean = -1;
0de7acce 874
ae209204
ZJS
875 if (!optarg)
876 boolean = true;
877 else if (!in_charset(optarg, DIGITS))
878 /* do *not* parse numbers as booleans */
879 boolean = parse_boolean(optarg);
880
881 if (boolean == false) {
0de7acce
LP
882 /* no: User namespacing off */
883 arg_userns_mode = USER_NAMESPACE_NO;
884 arg_uid_shift = UID_INVALID;
885 arg_uid_range = UINT32_C(0x10000);
ae209204 886 } else if (boolean == true) {
0de7acce
LP
887 /* yes: User namespacing on, UID range is read from root dir */
888 arg_userns_mode = USER_NAMESPACE_FIXED;
889 arg_uid_shift = UID_INVALID;
890 arg_uid_range = UINT32_C(0x10000);
891 } else if (streq(optarg, "pick")) {
892 /* pick: User namespacing on, UID range is picked randomly */
893 arg_userns_mode = USER_NAMESPACE_PICK;
894 arg_uid_shift = UID_INVALID;
895 arg_uid_range = UINT32_C(0x10000);
896 } else {
6c2058b3 897 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
898 const char *range, *shift;
899
0de7acce
LP
900 /* anything else: User namespacing on, UID range is explicitly configured */
901
6dac160c
LP
902 range = strchr(optarg, ':');
903 if (range) {
6c2058b3
ZJS
904 buffer = strndup(optarg, range - optarg);
905 if (!buffer)
906 return log_oom();
907 shift = buffer;
6dac160c
LP
908
909 range++;
bfd292ec
ZJS
910 r = safe_atou32(range, &arg_uid_range);
911 if (r < 0)
be715731 912 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
913 } else
914 shift = optarg;
915
be715731
ZJS
916 r = parse_uid(shift, &arg_uid_shift);
917 if (r < 0)
918 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
919
920 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
921 }
922
be715731
ZJS
923 if (arg_uid_range <= 0) {
924 log_error("UID range cannot be 0.");
925 return -EINVAL;
926 }
927
0de7acce 928 arg_settings_mask |= SETTING_USERNS;
6dac160c 929 break;
ae209204 930 }
6dac160c 931
0de7acce 932 case 'U':
ccabee0d
LP
933 if (userns_supported()) {
934 arg_userns_mode = USER_NAMESPACE_PICK;
935 arg_uid_shift = UID_INVALID;
936 arg_uid_range = UINT32_C(0x10000);
937
938 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
939 }
940
7336138e
LP
941 break;
942
0de7acce 943 case ARG_PRIVATE_USERS_CHOWN:
19aac838 944 arg_userns_chown = true;
0de7acce
LP
945
946 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
947 break;
948
c6c8f6e2
LP
949 case ARG_KILL_SIGNAL:
950 arg_kill_signal = signal_from_string_try_harder(optarg);
951 if (arg_kill_signal < 0) {
952 log_error("Cannot parse signal: %s", optarg);
953 return -EINVAL;
954 }
955
f757855e
LP
956 arg_settings_mask |= SETTING_KILL_SIGNAL;
957 break;
958
959 case ARG_SETTINGS:
960
961 /* no → do not read files
962 * yes → read files, do not override cmdline, trust only subset
963 * override → read files, override cmdline, trust only subset
964 * trusted → read files, do not override cmdline, trust all
965 */
966
967 r = parse_boolean(optarg);
968 if (r < 0) {
969 if (streq(optarg, "trusted")) {
970 mask_all_settings = false;
971 mask_no_settings = false;
972 arg_settings_trusted = true;
973
974 } else if (streq(optarg, "override")) {
975 mask_all_settings = false;
976 mask_no_settings = true;
977 arg_settings_trusted = -1;
978 } else
979 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
980 } else if (r > 0) {
981 /* yes */
982 mask_all_settings = false;
983 mask_no_settings = false;
984 arg_settings_trusted = -1;
985 } else {
986 /* no */
987 mask_all_settings = true;
988 mask_no_settings = false;
989 arg_settings_trusted = false;
990 }
991
c6c8f6e2
LP
992 break;
993
5f932eb9
LP
994 case ARG_CHDIR:
995 if (!path_is_absolute(optarg)) {
996 log_error("Working directory %s is not an absolute path.", optarg);
997 return -EINVAL;
998 }
999
1000 r = free_and_strdup(&arg_chdir, optarg);
1001 if (r < 0)
1002 return log_oom();
1003
1004 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1005 break;
1006
9c1e04d0
AP
1007 case ARG_NOTIFY_READY:
1008 r = parse_boolean(optarg);
1009 if (r < 0) {
1010 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1011 return -EINVAL;
1012 }
1013 arg_notify_ready = r;
1014 arg_settings_mask |= SETTING_NOTIFY_READY;
1015 break;
1016
88213476
LP
1017 case '?':
1018 return -EINVAL;
1019
1020 default:
eb9da376 1021 assert_not_reached("Unhandled option");
88213476 1022 }
88213476 1023
0c582db0
LB
1024 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1025 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1026 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1027 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1028
4f086aab
SU
1029 if (arg_userns_mode != USER_NAMESPACE_NO)
1030 arg_mount_settings |= MOUNT_USE_USERNS;
1031
1032 if (arg_private_network)
1033 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1034
1035 parse_mount_settings_env();
1036
48a8d337
LB
1037 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1038 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1039 arg_register = false;
0c582db0
LB
1040 if (arg_start_mode != START_PID1) {
1041 log_error("--boot cannot be used without namespacing.");
1042 return -EINVAL;
1043 }
1044 }
eb91eb18 1045
0de7acce 1046 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1047 arg_userns_chown = true;
1048
89f7c846
LP
1049 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1050 log_error("--keep-unit may not be used when invoked from a user session.");
1051 return -EINVAL;
1052 }
1053
1b9e5b12
LP
1054 if (arg_directory && arg_image) {
1055 log_error("--directory= and --image= may not be combined.");
1056 return -EINVAL;
1057 }
1058
ec16945e
LP
1059 if (arg_template && arg_image) {
1060 log_error("--template= and --image= may not be combined.");
1061 return -EINVAL;
1062 }
1063
8cd328d8
LP
1064 if (arg_ephemeral && arg_template && !arg_directory) {
1065 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1066 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1067 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1068 * --directory=". */
1069
1070 arg_directory = arg_template;
1071 arg_template = NULL;
1072 }
1073
ec16945e
LP
1074 if (arg_template && !(arg_directory || arg_machine)) {
1075 log_error("--template= needs --directory= or --machine=.");
1076 return -EINVAL;
1077 }
1078
1079 if (arg_ephemeral && arg_template) {
1080 log_error("--ephemeral and --template= may not be combined.");
1081 return -EINVAL;
1082 }
1083
df9a75e4
LP
1084 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1085 log_error("--ephemeral and --link-journal= may not be combined.");
1086 return -EINVAL;
1087 }
1088
ccabee0d 1089 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1090 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1091 return -EOPNOTSUPP;
1092 }
1093
1094 if (arg_userns_chown && arg_read_only) {
1095 log_error("--read-only and --private-users-chown may not be combined.");
1096 return -EINVAL;
1097 }
f757855e 1098
22b28dfd
LP
1099 if (arg_network_bridge && arg_network_zone) {
1100 log_error("--network-bridge= and --network-zone= may not be combined.");
1101 return -EINVAL;
1102 }
1103
f757855e
LP
1104 if (argc > optind) {
1105 arg_parameters = strv_copy(argv + optind);
1106 if (!arg_parameters)
1107 return log_oom();
1108
7732f92b 1109 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1110 }
1111
1112 /* Load all settings from .nspawn files */
1113 if (mask_no_settings)
1114 arg_settings_mask = 0;
1115
1116 /* Don't load any settings from .nspawn files */
1117 if (mask_all_settings)
1118 arg_settings_mask = _SETTINGS_MASK_ALL;
1119
520e0d54 1120 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1121
6aadfa4c
ILG
1122 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1123 if (e)
1124 arg_container_service_name = e;
1125
5a8ff0e6
CB
1126 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1127 if (r < 0)
1128 arg_use_cgns = cg_ns_supported();
1129 else
1130 arg_use_cgns = r;
1131
86c0dd4a
LP
1132 r = custom_mount_check_all();
1133 if (r < 0)
1134 return r;
1135
f757855e
LP
1136 return 1;
1137}
1138
1139static int verify_arguments(void) {
4f086aab
SU
1140 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1141 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1142 return -EINVAL;
1143 }
1144
1145 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1146 log_error("Cannot combine --private-users with read-write mounts.");
1147 return -EINVAL;
1148 }
f757855e
LP
1149
1150 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1151 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1152 return -EINVAL;
1153 }
1154
6d0b55c2
LP
1155 if (arg_expose_ports && !arg_private_network) {
1156 log_error("Cannot use --port= without private networking.");
1157 return -EINVAL;
1158 }
1159
1c1ea217
EV
1160#ifndef HAVE_LIBIPTC
1161 if (arg_expose_ports) {
1162 log_error("--port= is not supported, compiled without libiptc support.");
1163 return -EOPNOTSUPP;
1164 }
1165#endif
1166
7732f92b 1167 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1168 arg_kill_signal = SIGRTMIN+3;
1169
f757855e 1170 return 0;
88213476
LP
1171}
1172
03cfe0d5
LP
1173static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1174 assert(p);
1175
0de7acce 1176 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1177 return 0;
1178
1179 if (uid == UID_INVALID && gid == GID_INVALID)
1180 return 0;
1181
1182 if (uid != UID_INVALID) {
1183 uid += arg_uid_shift;
1184
1185 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1186 return -EOVERFLOW;
1187 }
1188
1189 if (gid != GID_INVALID) {
1190 gid += (gid_t) arg_uid_shift;
1191
1192 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1193 return -EOVERFLOW;
1194 }
1195
1196 if (lchown(p, uid, gid) < 0)
1197 return -errno;
b12afc8c
LP
1198
1199 return 0;
1200}
1201
03cfe0d5
LP
/* Creates directory "path" below "root" with the given mode and, if it was newly created, chowns it via
 * userns_lchown(). A directory that already exists is left untouched and counts as success. Returns 0 on
 * success or a negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1214
e58a1277 1215static int setup_timezone(const char *dest) {
03cfe0d5
LP
1216 _cleanup_free_ char *p = NULL, *q = NULL;
1217 const char *where, *check, *what;
d4036145
LP
1218 char *z, *y;
1219 int r;
f8440af5 1220
e58a1277
LP
1221 assert(dest);
1222
1223 /* Fix the timezone, if possible */
d4036145
LP
1224 r = readlink_malloc("/etc/localtime", &p);
1225 if (r < 0) {
0b493a02
MP
1226 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1227 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1228 * with a symbolic link to a time zone data file.
0b493a02
MP
1229 *
1230 * Example:
21dc0227 1231 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1232 */
d4036145
LP
1233 return 0;
1234 }
1235
1236 z = path_startswith(p, "../usr/share/zoneinfo/");
1237 if (!z)
1238 z = path_startswith(p, "/usr/share/zoneinfo/");
1239 if (!z) {
1240 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1241 return 0;
1242 }
1243
03cfe0d5 1244 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1245 r = readlink_malloc(where, &q);
1246 if (r >= 0) {
1247 y = path_startswith(q, "../usr/share/zoneinfo/");
1248 if (!y)
1249 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1250
d4036145
LP
1251 /* Already pointing to the right place? Then do nothing .. */
1252 if (y && streq(y, z))
1253 return 0;
1254 }
1255
03cfe0d5 1256 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1257 check = prefix_roota(dest, check);
03cfe0d5 1258 if (laccess(check, F_OK) < 0) {
d4036145
LP
1259 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1260 return 0;
1261 }
68fb0892 1262
79d80fc1
TG
1263 r = unlink(where);
1264 if (r < 0 && errno != ENOENT) {
56f64d95 1265 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1266 return 0;
1267 }
4d9f07b4 1268
03cfe0d5 1269 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1270 if (symlink(what, where) < 0) {
56f64d95 1271 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1272 return 0;
1273 }
e58a1277 1274
03cfe0d5
LP
1275 r = userns_lchown(where, 0, 0);
1276 if (r < 0)
1277 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1278
e58a1277 1279 return 0;
88213476
LP
1280}
1281
2547bb41 1282static int setup_resolv_conf(const char *dest) {
03cfe0d5 1283 const char *where = NULL;
79d80fc1 1284 int r;
2547bb41
LP
1285
1286 assert(dest);
1287
1288 if (arg_private_network)
1289 return 0;
1290
1291 /* Fix resolv.conf, if possible */
03cfe0d5 1292 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1293
7debb05d
CH
1294 if (access("/run/systemd/resolve/resolv.conf", F_OK) >= 0 &&
1295 access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
3539724c
LP
1296 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1297 * container, so that the container can use the host's resolver. Given that network namespacing is
1298 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1299 * advantage that the container will be able to follow the host's DNS server configuration changes
1300 * transparently. */
1301
5367354d
FB
1302 (void) touch(where);
1303
60e76d48
ZJS
1304 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1305 if (r >= 0)
1306 return mount_verbose(LOG_ERR, NULL, where, NULL,
1307 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1308 }
1309
1310 /* If that didn't work, let's copy the file */
f2068bcc 1311 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1312 if (r < 0) {
3539724c
LP
1313 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1314 * resolved or something similar runs inside and the symlink points there.
68a313c5 1315 *
3539724c 1316 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1317 */
1318 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1319 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1320 return 0;
1321 }
2547bb41 1322
03cfe0d5
LP
1323 r = userns_lchown(where, 0, 0);
1324 if (r < 0)
3539724c 1325 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1326
2547bb41
LP
1327 return 0;
1328}
1329
04bc4a3f 1330static int setup_boot_id(const char *dest) {
3bbaff3e 1331 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1332 const char *from, *to;
04bc4a3f
LP
1333 int r;
1334
04bc4a3f
LP
1335 /* Generate a new randomized boot ID, so that each boot-up of
1336 * the container gets a new one */
1337
03cfe0d5
LP
1338 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1339 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1340
1341 r = sd_id128_randomize(&rnd);
f647962d
MS
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1344
15b1248a 1345 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1346 if (r < 0)
1347 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1348
60e76d48
ZJS
1349 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1350 if (r >= 0)
1351 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1352 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1353
3bbaff3e 1354 (void) unlink(from);
04bc4a3f
LP
1355 return r;
1356}
1357
e58a1277 1358static int copy_devnodes(const char *dest) {
88213476
LP
1359
1360 static const char devnodes[] =
1361 "null\0"
1362 "zero\0"
1363 "full\0"
1364 "random\0"
1365 "urandom\0"
85614d66
TG
1366 "tty\0"
1367 "net/tun\0";
88213476
LP
1368
1369 const char *d;
e58a1277 1370 int r = 0;
7fd1b19b 1371 _cleanup_umask_ mode_t u;
a258bf26
LP
1372
1373 assert(dest);
124640f1
LP
1374
1375 u = umask(0000);
88213476 1376
03cfe0d5
LP
1377 /* Create /dev/net, so that we can create /dev/net/tun in it */
1378 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1379 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1380
88213476 1381 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1382 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1383 struct stat st;
88213476 1384
7f112f50 1385 from = strappend("/dev/", d);
03cfe0d5 1386 to = prefix_root(dest, from);
88213476
LP
1387
1388 if (stat(from, &st) < 0) {
1389
4a62c710
MS
1390 if (errno != ENOENT)
1391 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1392
a258bf26 1393 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1394
03cfe0d5 1395 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1396 return -EIO;
a258bf26 1397
85614d66 1398 } else {
81f5049b 1399 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1400 /*
1401 * This is some sort of protection too against
1402 * recursive userns chown on shared /dev/
1403 */
1404 if (errno == EEXIST)
1405 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1406 if (errno != EPERM)
1407 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1408
1409 /* Some systems abusively restrict mknod but
1410 * allow bind mounts. */
1411 r = touch(to);
1412 if (r < 0)
1413 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1414 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1415 if (r < 0)
1416 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1417 }
6278cf60 1418
03cfe0d5
LP
1419 r = userns_lchown(to, 0, 0);
1420 if (r < 0)
1421 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1422 }
88213476
LP
1423 }
1424
e58a1277
LP
1425 return r;
1426}
88213476 1427
03cfe0d5
LP
1428static int setup_pts(const char *dest) {
1429 _cleanup_free_ char *options = NULL;
1430 const char *p;
709f6e46 1431 int r;
03cfe0d5
LP
1432
1433#ifdef HAVE_SELINUX
1434 if (arg_selinux_apifs_context)
1435 (void) asprintf(&options,
3dce8915 1436 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1437 arg_uid_shift + TTY_GID,
1438 arg_selinux_apifs_context);
1439 else
1440#endif
1441 (void) asprintf(&options,
3dce8915 1442 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1443 arg_uid_shift + TTY_GID);
f2d88580 1444
03cfe0d5 1445 if (!options)
f2d88580
LP
1446 return log_oom();
1447
03cfe0d5 1448 /* Mount /dev/pts itself */
cc9fce65 1449 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1450 if (mkdir(p, 0755) < 0)
1451 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1452 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1453 if (r < 0)
1454 return r;
709f6e46
MS
1455 r = userns_lchown(p, 0, 0);
1456 if (r < 0)
1457 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1458
1459 /* Create /dev/ptmx symlink */
1460 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1461 if (symlink("pts/ptmx", p) < 0)
1462 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1463 r = userns_lchown(p, 0, 0);
1464 if (r < 0)
1465 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1466
03cfe0d5
LP
1467 /* And fix /dev/pts/ptmx ownership */
1468 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1469 r = userns_lchown(p, 0, 0);
1470 if (r < 0)
1471 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1472
f2d88580
LP
1473 return 0;
1474}
1475
e58a1277 1476static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1477 _cleanup_umask_ mode_t u;
1478 const char *to;
e58a1277 1479 int r;
e58a1277
LP
1480
1481 assert(dest);
1482 assert(console);
1483
1484 u = umask(0000);
1485
03cfe0d5 1486 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1487 if (r < 0)
1488 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1489
a258bf26
LP
1490 /* We need to bind mount the right tty to /dev/console since
1491 * ptys can only exist on pts file systems. To have something
81f5049b 1492 * to bind mount things on we create a empty regular file. */
a258bf26 1493
03cfe0d5 1494 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1495 r = touch(to);
1496 if (r < 0)
1497 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1498
60e76d48 1499 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1500}
1501
1502static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1503 const char *from, *to;
7fd1b19b 1504 _cleanup_umask_ mode_t u;
d9603714 1505 int fd, r;
e58a1277 1506
e58a1277 1507 assert(kmsg_socket >= 0);
a258bf26 1508
e58a1277 1509 u = umask(0000);
a258bf26 1510
03cfe0d5 1511 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1512 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1513 * on the reading side behave very similar to /proc/kmsg,
1514 * their writing side behaves differently from /dev/kmsg in
1515 * that writing blocks when nothing is reading. In order to
1516 * avoid any problems with containers deadlocking due to this
1517 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1518 from = prefix_roota(dest, "/run/kmsg");
1519 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1520
4a62c710 1521 if (mkfifo(from, 0600) < 0)
03cfe0d5 1522 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1523 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1524 if (r < 0)
1525 return r;
e58a1277
LP
1526
1527 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1528 if (fd < 0)
1529 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1530
e58a1277
LP
1531 /* Store away the fd in the socket, so that it stays open as
1532 * long as we run the child */
3ee897d6 1533 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1534 safe_close(fd);
e58a1277 1535
d9603714
DH
1536 if (r < 0)
1537 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1538
03cfe0d5
LP
1539 /* And now make the FIFO unavailable as /run/kmsg... */
1540 (void) unlink(from);
1541
25ea79fe 1542 return 0;
88213476
LP
1543}
1544
1c4baffc 1545static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1546 union in_addr_union *exposed = userdata;
1547
1548 assert(rtnl);
1549 assert(m);
1550 assert(exposed);
1551
7a8f6325 1552 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1553 return 0;
1554}
1555
3a74cea5 1556static int setup_hostname(void) {
3a74cea5 1557
0c582db0 1558 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1559 return 0;
1560
605f81a8 1561 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1562 return -errno;
3a74cea5 1563
7027ff61 1564 return 0;
3a74cea5
LP
1565}
1566
57fb9fb5 1567static int setup_journal(const char *directory) {
e01ff70a 1568 sd_id128_t this_id;
0f5e1382 1569 _cleanup_free_ char *d = NULL;
e01ff70a 1570 const char *p, *q;
8054d749 1571 bool try;
e01ff70a 1572 char id[33];
57fb9fb5
LP
1573 int r;
1574
df9a75e4
LP
1575 /* Don't link journals in ephemeral mode */
1576 if (arg_ephemeral)
1577 return 0;
1578
8054d749
LP
1579 if (arg_link_journal == LINK_NO)
1580 return 0;
1581
1582 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1583
4d680aee 1584 r = sd_id128_get_machine(&this_id);
f647962d
MS
1585 if (r < 0)
1586 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1587
e01ff70a 1588 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1589 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1590 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1591 if (try)
4d680aee 1592 return 0;
df9a75e4 1593 return -EEXIST;
4d680aee
ZJS
1594 }
1595
03cfe0d5
LP
1596 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1597 if (r < 0)
1598 return log_error_errno(r, "Failed to create /var: %m");
1599
1600 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1601 if (r < 0)
1602 return log_error_errno(r, "Failed to create /var/log: %m");
1603
1604 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1605 if (r < 0)
1606 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1607
e01ff70a
MS
1608 (void) sd_id128_to_string(arg_uuid, id);
1609
03cfe0d5
LP
1610 p = strjoina("/var/log/journal/", id);
1611 q = prefix_roota(directory, p);
27407a01 1612
e1873695 1613 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1614 if (try)
1615 return 0;
27407a01 1616
8054d749
LP
1617 log_error("%s: already a mount point, refusing to use for journal", p);
1618 return -EEXIST;
57fb9fb5
LP
1619 }
1620
e1873695 1621 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1622 if (try)
1623 return 0;
57fb9fb5 1624
8054d749
LP
1625 log_error("%s: already a mount point, refusing to use for journal", q);
1626 return -EEXIST;
57fb9fb5
LP
1627 }
1628
1629 r = readlink_and_make_absolute(p, &d);
1630 if (r >= 0) {
1631 if ((arg_link_journal == LINK_GUEST ||
1632 arg_link_journal == LINK_AUTO) &&
1633 path_equal(d, q)) {
1634
03cfe0d5 1635 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1636 if (r < 0)
709f6e46 1637 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1638 return 0;
57fb9fb5
LP
1639 }
1640
4a62c710
MS
1641 if (unlink(p) < 0)
1642 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1643 } else if (r == -EINVAL) {
1644
1645 if (arg_link_journal == LINK_GUEST &&
1646 rmdir(p) < 0) {
1647
27407a01
ZJS
1648 if (errno == ENOTDIR) {
1649 log_error("%s already exists and is neither a symlink nor a directory", p);
1650 return r;
4314d33f
MS
1651 } else
1652 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1653 }
4314d33f
MS
1654 } else if (r != -ENOENT)
1655 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1656
1657 if (arg_link_journal == LINK_GUEST) {
1658
1659 if (symlink(q, p) < 0) {
8054d749 1660 if (try) {
56f64d95 1661 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1662 return 0;
4314d33f
MS
1663 } else
1664 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1665 }
1666
03cfe0d5 1667 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1668 if (r < 0)
709f6e46 1669 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1670 return 0;
57fb9fb5
LP
1671 }
1672
1673 if (arg_link_journal == LINK_HOST) {
ccddd104 1674 /* don't create parents here — if the host doesn't have
574edc90 1675 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1676
1677 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1678 if (try) {
56f64d95 1679 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1680 return 0;
4314d33f
MS
1681 } else
1682 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1683 }
1684
27407a01
ZJS
1685 } else if (access(p, F_OK) < 0)
1686 return 0;
57fb9fb5 1687
cdb2b9d0
LP
1688 if (dir_is_empty(q) == 0)
1689 log_warning("%s is not empty, proceeding anyway.", q);
1690
03cfe0d5 1691 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1692 if (r < 0)
1693 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1694
60e76d48
ZJS
1695 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1696 if (r < 0)
4a62c710 1697 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1698
27407a01 1699 return 0;
57fb9fb5
LP
1700}
1701
88213476 1702static int drop_capabilities(void) {
520e0d54 1703 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1704}
1705
db999e0f
LP
1706static int reset_audit_loginuid(void) {
1707 _cleanup_free_ char *p = NULL;
1708 int r;
1709
0c582db0 1710 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1711 return 0;
1712
1713 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1714 if (r == -ENOENT)
db999e0f 1715 return 0;
f647962d
MS
1716 if (r < 0)
1717 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1718
1719 /* Already reset? */
1720 if (streq(p, "4294967295"))
1721 return 0;
1722
ad118bda 1723 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1724 if (r < 0) {
10a87006
LP
1725 log_error_errno(r,
1726 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1727 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1728 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1729 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1730 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1731
db999e0f 1732 sleep(5);
77b6e194 1733 }
db999e0f
LP
1734
1735 return 0;
77b6e194
LP
1736}
1737
24fb1112 1738
785890ac
LP
1739static int setup_propagate(const char *root) {
1740 const char *p, *q;
709f6e46 1741 int r;
785890ac
LP
1742
1743 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1744 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1745 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1746 (void) mkdir_p(p, 0600);
1747
709f6e46
MS
1748 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1749 if (r < 0)
1750 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1751
709f6e46
MS
1752 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1753 if (r < 0)
1754 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1755
709f6e46
MS
1756 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1759
03cfe0d5 1760 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1761 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1762 if (r < 0)
1763 return r;
785890ac 1764
60e76d48
ZJS
1765 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1766 if (r < 0)
1767 return r;
785890ac 1768
19caffac
AC
1769 /* machined will MS_MOVE into that directory, and that's only
1770 * supported for non-shared mounts. */
60e76d48 1771 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1772}
1773
1b9e5b12
LP
1774static int setup_image(char **device_path, int *loop_nr) {
1775 struct loop_info64 info = {
1776 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1777 };
1778 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1779 _cleanup_free_ char* loopdev = NULL;
1780 struct stat st;
1781 int r, nr;
1782
1783 assert(device_path);
1784 assert(loop_nr);
ec16945e 1785 assert(arg_image);
1b9e5b12
LP
1786
1787 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1788 if (fd < 0)
1789 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1790
4a62c710
MS
1791 if (fstat(fd, &st) < 0)
1792 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1793
1794 if (S_ISBLK(st.st_mode)) {
1795 char *p;
1796
1797 p = strdup(arg_image);
1798 if (!p)
1799 return log_oom();
1800
1801 *device_path = p;
1802
1803 *loop_nr = -1;
1804
1805 r = fd;
1806 fd = -1;
1807
1808 return r;
1809 }
1810
1811 if (!S_ISREG(st.st_mode)) {
070edd97 1812 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1813 return -EINVAL;
1814 }
1815
1816 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1817 if (control < 0)
1818 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1819
1820 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1821 if (nr < 0)
1822 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1823
1824 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1825 return log_oom();
1826
1827 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1828 if (loop < 0)
1829 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1830
4a62c710
MS
1831 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1832 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1833
1834 if (arg_read_only)
1835 info.lo_flags |= LO_FLAGS_READ_ONLY;
1836
4a62c710
MS
1837 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1838 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1839
1840 *device_path = loopdev;
1841 loopdev = NULL;
1842
1843 *loop_nr = nr;
1844
1845 r = loop;
1846 loop = -1;
1847
1848 return r;
1849}
1850
ada4799a
LP
/* Explanatory text appended to the error messages of dissect_image() below. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Probes the partition table of the block device referred to by fd and picks the partitions to
 * mount for the container.
 *
 * On a GPT disk, partitions are selected by type UUID (home, srv, ESP, native root, secondary root,
 * generic Linux); partitions carrying GPT_FLAG_NO_AUTO are ignored, and if several partitions of
 * the same specific type exist the one with the lowest partition number wins. On an MBR disk only
 * partitions of type 0x83 whose flags equal 0x80 (bootable) are considered, and they are treated
 * like generic Linux partitions.
 *
 * The root device is chosen in this order: native root, secondary root, generic partition. A
 * generic partition is only accepted if it is the only candidate.
 *
 * Output parameters (strings are newly allocated, the caller has to free them):
 *   root_device/root_device_rw: always set on success.
 *   home_device/home_device_rw, srv_device/srv_device_rw, esp_device: only written if a matching
 *                               partition was found, otherwise left untouched.
 *   secondary:                  set to true if the secondary architecture's root was picked.
 *
 * Returns 0 on success, a negative errno-style value on failure, or -EOPNOTSUPP if compiled
 * without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                char **esp_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(esp_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as blkid found in the partition
         * table, retrying a limited number of times. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev enumeration is expected to
                 * contain one entry more than blkid reports, hence the "+ 1". */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        } else if (sd_id128_equal(type_id, GPT_ESP)) {

                                if (esp && nr >= esp_nr)
                                        continue;

                                esp_nr = nr;

                                r = free_and_strdup(&esp, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Remember this as the generic candidate (not as "root"), so that
                                 * a second bootable Linux partition is detected by the check
                                 * above and reported below instead of silently replacing this
                                 * one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        if (esp) {
                *esp_device = esp;
                esp = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2244
/* Mounts the block device "what" below "where".
 *
 * what:      path of the block device node to mount.
 * where:     base directory of the mount.
 * directory: optional sub-path that is appended to "where" to form the mount point; if NULL the
 *            device is mounted on "where" itself.
 * rw:        whether to mount writable; forced to false if arg_read_only is set.
 *
 * The file system type is probed with libblkid. LUKS volumes are refused. For btrfs, ext4, vfat
 * and xfs on a /dev/loop* device the "discard" mount option is used.
 *
 * Returns the result of mount_verbose() on success, a negative errno-style value on failure, or
 * -EOPNOTSUPP if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        /* "options" must start out as NULL: it is only assigned in the loopback/discard case below,
         * but is passed to mount_verbose() unconditionally. */
        const char *fstype, *p, *options = NULL;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the result is
         * ambiguous; -1 is a genuine error and is reported with errno below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
         * sparse when possible. */
        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
                const char *l;

                l = path_startswith(what, "/dev");
                if (l && startswith(l, "loop"))
                        options = "discard";
        }

        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2313
317feb4d 2314static int setup_machine_id(const char *directory) {
691675ba
LP
2315 const char *etc_machine_id;
2316 sd_id128_t id;
3bbaff3e 2317 int r;
e01ff70a 2318
317feb4d
LP
2319 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2320 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2321 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2322 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2323 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2324 * container behaves nicely). */
2325
e01ff70a
MS
2326 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2327
691675ba 2328 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2329 if (r < 0) {
2330 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2331 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2332
317feb4d
LP
2333 if (sd_id128_is_null(arg_uuid)) {
2334 r = sd_id128_randomize(&arg_uuid);
2335 if (r < 0)
2336 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2337 }
2338 } else {
2339 if (sd_id128_is_null(id)) {
2340 log_error("Machine ID in container image is zero, refusing.");
2341 return -EINVAL;
2342 }
e01ff70a 2343
317feb4d
LP
2344 arg_uuid = id;
2345 }
691675ba 2346
e01ff70a
MS
2347 return 0;
2348}
2349
7336138e
LP
2350static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2351 int r;
2352
2353 assert(directory);
2354
0de7acce 2355 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2356 return 0;
2357
2358 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2359 if (r == -EOPNOTSUPP)
2360 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2361 if (r == -EBADE)
2362 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2363 if (r < 0)
2364 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2365 if (r == 0)
2366 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2367 else
2368 log_debug("Patched directory tree to match UID/GID range.");
2369
2370 return r;
2371}
2372
727fd4fd
LP
2373static int mount_devices(
2374 const char *where,
2375 const char *root_device, bool root_device_rw,
2376 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2377 const char *srv_device, bool srv_device_rw,
2378 const char *esp_device) {
1b9e5b12
LP
2379 int r;
2380
2381 assert(where);
2382
2383 if (root_device) {
727fd4fd 2384 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2387 }
2388
2389 if (home_device) {
727fd4fd 2390 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2391 if (r < 0)
2392 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2393 }
2394
2395 if (srv_device) {
727fd4fd 2396 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2397 if (r < 0)
2398 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2399 }
2400
a6bc7db9
LP
2401 if (esp_device) {
2402 const char *mp, *x;
2403
2404 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2405
2406 mp = "/efi";
2407 x = strjoina(arg_directory, mp);
2408 r = dir_is_empty(x);
2409 if (r == -ENOENT) {
2410 mp = "/boot";
2411 x = strjoina(arg_directory, mp);
2412 r = dir_is_empty(x);
2413 }
2414
2415 if (r > 0) {
2416 r = mount_device(esp_device, arg_directory, mp, true);
2417 if (r < 0)
2418 return log_error_errno(r, "Failed to mount ESP: %m");
2419 }
2420 }
2421
1b9e5b12
LP
2422 return 0;
2423}
2424
2425static void loop_remove(int nr, int *image_fd) {
2426 _cleanup_close_ int control = -1;
e8c8ddcc 2427 int r;
1b9e5b12
LP
2428
2429 if (nr < 0)
2430 return;
2431
2432 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2433 r = ioctl(*image_fd, LOOP_CLR_FD);
2434 if (r < 0)
5e4074aa 2435 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2436 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2437 }
2438
2439 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2440 if (control < 0) {
56f64d95 2441 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2442 return;
e8c8ddcc 2443 }
1b9e5b12 2444
e8c8ddcc
TG
2445 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2446 if (r < 0)
5e4074aa 2447 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2448}
2449
113cea80 2450/*
6d416b9c
LS
2451 * Return values:
2452 * < 0 : wait_for_terminate() failed to get the state of the
2453 * container, the container was terminated by a signal, or
2454 * failed for an unknown reason. No change is made to the
2455 * container argument.
2456 * > 0 : The program executed in the container terminated with an
2457 * error. The exit code of the program executed in the
919699ec
LP
2458 * container is returned. The container argument has been set
2459 * to CONTAINER_TERMINATED.
6d416b9c
LS
2460 * 0 : The container is being rebooted, has been shut down or exited
2461 * successfully. The container argument has been set to either
2462 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2463 *
6d416b9c
LS
2464 * That is, success is indicated by a return value of zero, and an
2465 * error is indicated by a non-zero value.
113cea80
DH
2466 */
2467static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2468 siginfo_t status;
919699ec 2469 int r;
113cea80
DH
2470
2471 r = wait_for_terminate(pid, &status);
f647962d
MS
2472 if (r < 0)
2473 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2474
2475 switch (status.si_code) {
fddbb89c 2476
113cea80 2477 case CLD_EXITED:
b5a2179b 2478 if (status.si_status == 0)
919699ec 2479 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2480 else
919699ec 2481 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2482
919699ec
LP
2483 *container = CONTAINER_TERMINATED;
2484 return status.si_status;
113cea80
DH
2485
2486 case CLD_KILLED:
2487 if (status.si_status == SIGINT) {
919699ec 2488 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2489 *container = CONTAINER_TERMINATED;
919699ec
LP
2490 return 0;
2491
113cea80 2492 } else if (status.si_status == SIGHUP) {
919699ec 2493 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2494 *container = CONTAINER_REBOOTED;
919699ec 2495 return 0;
113cea80 2496 }
919699ec 2497
113cea80
DH
2498 /* CLD_KILLED fallthrough */
2499
2500 case CLD_DUMPED:
fddbb89c 2501 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2502 return -EIO;
113cea80
DH
2503
2504 default:
fddbb89c 2505 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2506 return -EIO;
113cea80 2507 }
113cea80
DH
2508}
2509
023fb90b
LP
2510static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2511 pid_t pid;
2512
4a0b58c4 2513 pid = PTR_TO_PID(userdata);
023fb90b 2514 if (pid > 0) {
c6c8f6e2 2515 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2516 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2517 sd_event_source_set_userdata(s, NULL);
2518 return 0;
2519 }
2520 }
2521
2522 sd_event_exit(sd_event_source_get_event(s), 0);
2523 return 0;
2524}
2525
ec16945e 2526static int determine_names(void) {
1b9cebf6 2527 int r;
ec16945e 2528
c1521918
LP
2529 if (arg_template && !arg_directory && arg_machine) {
2530
2531 /* If --template= was specified then we should not
2532 * search for a machine, but instead create a new one
2533 * in /var/lib/machine. */
2534
605405c6 2535 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2536 if (!arg_directory)
2537 return log_oom();
2538 }
2539
ec16945e 2540 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2541 if (arg_machine) {
2542 _cleanup_(image_unrefp) Image *i = NULL;
2543
2544 r = image_find(arg_machine, &i);
2545 if (r < 0)
2546 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2547 if (r == 0) {
1b9cebf6
LP
2548 log_error("No image for machine '%s': %m", arg_machine);
2549 return -ENOENT;
2550 }
2551
aceac2f0 2552 if (i->type == IMAGE_RAW)
0f03c2a4 2553 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2554 else
0f03c2a4 2555 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2556 if (r < 0)
0f3be6ca 2557 return log_oom();
1b9cebf6 2558
aee327b8
LP
2559 if (!arg_ephemeral)
2560 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2561 } else
ec16945e
LP
2562 arg_directory = get_current_dir_name();
2563
0f3be6ca 2564 if (!arg_directory && !arg_image) {
1b9cebf6 2565 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2566 return -EINVAL;
2567 }
2568 }
2569
2570 if (!arg_machine) {
b9ba4dab
LP
2571 if (arg_directory && path_equal(arg_directory, "/"))
2572 arg_machine = gethostname_malloc();
2573 else
2574 arg_machine = strdup(basename(arg_image ?: arg_directory));
ec16945e
LP
2575 if (!arg_machine)
2576 return log_oom();
2577
ae691c1d 2578 hostname_cleanup(arg_machine);
ec16945e
LP
2579 if (!machine_name_is_valid(arg_machine)) {
2580 log_error("Failed to determine machine name automatically, please use -M.");
2581 return -EINVAL;
2582 }
b9ba4dab
LP
2583
2584 if (arg_ephemeral) {
2585 char *b;
2586
2587 /* Add a random suffix when this is an
2588 * ephemeral machine, so that we can run many
2589 * instances at once without manually having
2590 * to specify -M each time. */
2591
2592 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2593 return log_oom();
2594
2595 free(arg_machine);
2596 arg_machine = b;
2597 }
ec16945e
LP
2598 }
2599
2600 return 0;
2601}
2602
/* Resolves the path stored in *p with chase_symlinks() and replaces *p with the result, freeing
 * the old string. A NULL *p is left alone. Returns 0 on success, a negative errno-style value if
 * resolving fails (in which case *p is unchanged). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2621
03cfe0d5 2622static int determine_uid_shift(const char *directory) {
6dac160c
LP
2623 int r;
2624
0de7acce 2625 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2626 arg_uid_shift = 0;
6dac160c 2627 return 0;
03cfe0d5 2628 }
6dac160c
LP
2629
2630 if (arg_uid_shift == UID_INVALID) {
2631 struct stat st;
2632
03cfe0d5 2633 r = stat(directory, &st);
6dac160c 2634 if (r < 0)
03cfe0d5 2635 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2636
2637 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2638
2639 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2640 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2641 return -EINVAL;
2642 }
2643
2644 arg_uid_range = UINT32_C(0x10000);
2645 }
2646
2647 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2648 log_error("UID base too high for UID range.");
2649 return -EINVAL;
2650 }
2651
6dac160c
LP
2652 return 0;
2653}
2654
03cfe0d5
LP
2655static int inner_child(
2656 Barrier *barrier,
2657 const char *directory,
2658 bool secondary,
2659 int kmsg_socket,
2660 int rtnl_socket,
f757855e 2661 FDSet *fds) {
69c79d3c 2662
03cfe0d5 2663 _cleanup_free_ char *home = NULL;
e01ff70a 2664 char as_uuid[37];
6aadfa4c 2665 unsigned n_env = 1;
03cfe0d5
LP
2666 const char *envp[] = {
2667 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2668 NULL, /* container */
03cfe0d5
LP
2669 NULL, /* TERM */
2670 NULL, /* HOME */
2671 NULL, /* USER */
2672 NULL, /* LOGNAME */
2673 NULL, /* container_uuid */
2674 NULL, /* LISTEN_FDS */
2675 NULL, /* LISTEN_PID */
9c1e04d0 2676 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2677 NULL
2678 };
88213476 2679
2371271c 2680 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2681 int r;
88213476 2682
03cfe0d5
LP
2683 assert(barrier);
2684 assert(directory);
2685 assert(kmsg_socket >= 0);
88213476 2686
efdb0237
LP
2687 cg_unified_flush();
2688
0de7acce 2689 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2690 /* Tell the parent, that it now can write the UID map. */
2691 (void) barrier_place(barrier); /* #1 */
7027ff61 2692
03cfe0d5
LP
2693 /* Wait until the parent wrote the UID map */
2694 if (!barrier_place_and_sync(barrier)) { /* #2 */
2695 log_error("Parent died too early");
2696 return -ESRCH;
2697 }
88213476
LP
2698 }
2699
6d66bd3b
EV
2700 r = reset_uid_gid();
2701 if (r < 0)
2702 return log_error_errno(r, "Couldn't become new root: %m");
2703
0de7acce 2704 r = mount_all(NULL,
4f086aab 2705 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2706 arg_uid_shift,
2707 arg_uid_range,
2708 arg_selinux_apifs_context);
2709
03cfe0d5
LP
2710 if (r < 0)
2711 return r;
2712
4f086aab 2713 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2714 if (r < 0)
2715 return r;
2716
03cfe0d5
LP
2717 /* Wait until we are cgroup-ified, so that we
2718 * can mount the right cgroup path writable */
2719 if (!barrier_place_and_sync(barrier)) { /* #3 */
2720 log_error("Parent died too early");
2721 return -ESRCH;
88213476
LP
2722 }
2723
5a8ff0e6 2724 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2725 r = unshare(CLONE_NEWCGROUP);
2726 if (r < 0)
2727 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2728 r = mount_cgroups(
2729 "",
2730 arg_unified_cgroup_hierarchy,
2731 arg_userns_mode != USER_NAMESPACE_NO,
2732 arg_uid_shift,
2733 arg_uid_range,
5a8ff0e6 2734 arg_selinux_apifs_context,
ada54120 2735 true);
0996ef00
CB
2736 if (r < 0)
2737 return r;
2738 } else {
2739 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2740 if (r < 0)
2741 return r;
2742 }
ec16945e 2743
03cfe0d5
LP
2744 r = setup_boot_id(NULL);
2745 if (r < 0)
2746 return r;
ec16945e 2747
03cfe0d5
LP
2748 r = setup_kmsg(NULL, kmsg_socket);
2749 if (r < 0)
2750 return r;
2751 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2752
03cfe0d5 2753 umask(0022);
30535c16 2754
03cfe0d5
LP
2755 if (setsid() < 0)
2756 return log_error_errno(errno, "setsid() failed: %m");
2757
2758 if (arg_private_network)
2759 loopback_setup();
2760
7a8f6325
LP
2761 if (arg_expose_ports) {
2762 r = expose_port_send_rtnl(rtnl_socket);
2763 if (r < 0)
2764 return r;
2765 rtnl_socket = safe_close(rtnl_socket);
2766 }
03cfe0d5 2767
709f6e46
MS
2768 r = drop_capabilities();
2769 if (r < 0)
2770 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2771
2772 setup_hostname();
2773
050f7277 2774 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2775 if (personality(arg_personality) < 0)
2776 return log_error_errno(errno, "personality() failed: %m");
2777 } else if (secondary) {
2778 if (personality(PER_LINUX32) < 0)
2779 return log_error_errno(errno, "personality() failed: %m");
2780 }
2781
2782#ifdef HAVE_SELINUX
2783 if (arg_selinux_context)
2ed96880 2784 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2785 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2786#endif
2787
ee645080 2788 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2789 if (r < 0)
2790 return r;
2791
6aadfa4c
ILG
2792 /* LXC sets container=lxc, so follow the scheme here */
2793 envp[n_env++] = strjoina("container=", arg_container_service_name);
2794
03cfe0d5
LP
2795 envp[n_env] = strv_find_prefix(environ, "TERM=");
2796 if (envp[n_env])
313cefa1 2797 n_env++;
03cfe0d5
LP
2798
2799 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2800 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2801 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2802 return log_oom();
2803
3bbaff3e 2804 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2805
691675ba 2806 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2807 return log_oom();
03cfe0d5
LP
2808
2809 if (fdset_size(fds) > 0) {
2810 r = fdset_cloexec(fds, false);
2811 if (r < 0)
2812 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2813
2814 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2815 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2816 return log_oom();
2817 }
9c1e04d0
AP
2818 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2819 return log_oom();
03cfe0d5 2820
2371271c
TG
2821 env_use = strv_env_merge(2, envp, arg_setenv);
2822 if (!env_use)
2823 return log_oom();
03cfe0d5
LP
2824
2825 /* Let the parent know that we are ready and
2826 * wait until the parent is ready with the
2827 * setup, too... */
2828 if (!barrier_place_and_sync(barrier)) { /* #4 */
2829 log_error("Parent died too early");
2830 return -ESRCH;
2831 }
2832
5f932eb9
LP
2833 if (arg_chdir)
2834 if (chdir(arg_chdir) < 0)
2835 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2836
7732f92b
LP
2837 if (arg_start_mode == START_PID2) {
2838 r = stub_pid1();
2839 if (r < 0)
2840 return r;
2841 }
2842
03cfe0d5
LP
2843 /* Now, explicitly close the log, so that we
2844 * then can close all remaining fds. Closing
2845 * the log explicitly first has the benefit
2846 * that the logging subsystem knows about it,
2847 * and is thus ready to be reopened should we
2848 * need it again. Note that the other fds
2849 * closed here are at least the locking and
2850 * barrier fds. */
2851 log_close();
2852 (void) fdset_close_others(fds);
2853
7732f92b 2854 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2855 char **a;
2856 size_t m;
2857
2858 /* Automatically search for the init system */
2859
75f32f04
ZJS
2860 m = strv_length(arg_parameters);
2861 a = newa(char*, m + 2);
2862 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2863 a[1 + m] = NULL;
03cfe0d5
LP
2864
2865 a[0] = (char*) "/usr/lib/systemd/systemd";
2866 execve(a[0], a, env_use);
2867
2868 a[0] = (char*) "/lib/systemd/systemd";
2869 execve(a[0], a, env_use);
2870
2871 a[0] = (char*) "/sbin/init";
2872 execve(a[0], a, env_use);
f757855e
LP
2873 } else if (!strv_isempty(arg_parameters))
2874 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2875 else {
5f932eb9 2876 if (!arg_chdir)
d929b0f9
ZJS
2877 /* If we cannot change the directory, we'll end up in /, that is expected. */
2878 (void) chdir(home ?: "/root");
5f932eb9 2879
03cfe0d5
LP
2880 execle("/bin/bash", "-bash", NULL, env_use);
2881 execle("/bin/sh", "-sh", NULL, env_use);
2882 }
2883
35607a8d 2884 r = -errno;
03cfe0d5 2885 (void) log_open();
35607a8d 2886 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2887}
2888
9c1e04d0
AP
2889static int setup_sd_notify_child(void) {
2890 static const int one = 1;
2891 int fd = -1;
2892 union sockaddr_union sa = {
2893 .sa.sa_family = AF_UNIX,
2894 };
2895 int r;
2896
2897 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2898 if (fd < 0)
2899 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2900
2901 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2902 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2903
2904 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2905 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2906 if (r < 0) {
2907 safe_close(fd);
2908 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2909 }
2910
2911 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2912 if (r < 0) {
2913 safe_close(fd);
2914 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2915 }
2916
2917 return fd;
2918}
2919
03cfe0d5
LP
2920static int outer_child(
2921 Barrier *barrier,
2922 const char *directory,
2923 const char *console,
2924 const char *root_device, bool root_device_rw,
2925 const char *home_device, bool home_device_rw,
2926 const char *srv_device, bool srv_device_rw,
a6bc7db9 2927 const char *esp_device,
03cfe0d5
LP
2928 bool interactive,
2929 bool secondary,
2930 int pid_socket,
e01ff70a 2931 int uuid_socket,
9c1e04d0 2932 int notify_socket,
03cfe0d5
LP
2933 int kmsg_socket,
2934 int rtnl_socket,
825d5287 2935 int uid_shift_socket,
f757855e 2936 FDSet *fds) {
03cfe0d5
LP
2937
2938 pid_t pid;
2939 ssize_t l;
2940 int r;
9c1e04d0 2941 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2942
2943 assert(barrier);
2944 assert(directory);
2945 assert(console);
2946 assert(pid_socket >= 0);
e01ff70a 2947 assert(uuid_socket >= 0);
9c1e04d0 2948 assert(notify_socket >= 0);
03cfe0d5
LP
2949 assert(kmsg_socket >= 0);
2950
efdb0237
LP
2951 cg_unified_flush();
2952
03cfe0d5
LP
2953 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2954 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2955
2956 if (interactive) {
2957 close_nointr(STDIN_FILENO);
2958 close_nointr(STDOUT_FILENO);
2959 close_nointr(STDERR_FILENO);
2960
2961 r = open_terminal(console, O_RDWR);
2962 if (r != STDIN_FILENO) {
2963 if (r >= 0) {
2964 safe_close(r);
2965 r = -EINVAL;
2966 }
2967
2968 return log_error_errno(r, "Failed to open console: %m");
2969 }
2970
2971 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2972 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2973 return log_error_errno(errno, "Failed to duplicate console: %m");
2974 }
2975
2976 r = reset_audit_loginuid();
2977 if (r < 0)
2978 return r;
2979
2980 /* Mark everything as slave, so that we still
2981 * receive mounts from the real root, but don't
2982 * propagate mounts to the real root. */
60e76d48
ZJS
2983 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2984 if (r < 0)
2985 return r;
03cfe0d5
LP
2986
2987 r = mount_devices(directory,
2988 root_device, root_device_rw,
2989 home_device, home_device_rw,
a6bc7db9
LP
2990 srv_device, srv_device_rw,
2991 esp_device);
03cfe0d5
LP
2992 if (r < 0)
2993 return r;
2994
391567f4
LP
2995 r = determine_uid_shift(directory);
2996 if (r < 0)
2997 return r;
2998
0fd9563f
ZJS
2999 r = detect_unified_cgroup_hierarchy(directory);
3000 if (r < 0)
3001 return r;
3002
0de7acce 3003 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3004 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3005 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3006 if (l < 0)
3007 return log_error_errno(errno, "Failed to send UID shift: %m");
3008 if (l != sizeof(arg_uid_shift)) {
3009 log_error("Short write while sending UID shift.");
3010 return -EIO;
3011 }
0e7ac751 3012
0de7acce 3013 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3014 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3015 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3016 * not it will pick a different one, and send it back to us. */
3017
3018 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3019 if (l < 0)
3020 return log_error_errno(errno, "Failed to recv UID shift: %m");
3021 if (l != sizeof(arg_uid_shift)) {
595bfe7d 3022 log_error("Short read while receiving UID shift.");
0e7ac751
LP
3023 return -EIO;
3024 }
3025 }
3026
3027 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3028 }
3029
03cfe0d5 3030 /* Turn directory into bind mount */
60e76d48
ZJS
3031 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3032 if (r < 0)
3033 return r;
03cfe0d5 3034
19caffac
AC
3035 /* Mark everything as shared so our mounts get propagated down. This is
3036 * required to make new bind mounts available in systemd services
3037 * inside the containter that create a new mount namespace.
3038 * See https://github.com/systemd/systemd/issues/3860
3039 * Further submounts (such as /dev) done after this will inherit the
3040 * shared propagation mode.*/
60e76d48
ZJS
3041 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3042 if (r < 0)
3043 return r;
19caffac 3044
7336138e 3045 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
3046 if (r < 0)
3047 return r;
3048
0de7acce
LP
3049 r = setup_volatile(
3050 directory,
3051 arg_volatile_mode,
3052 arg_userns_mode != USER_NAMESPACE_NO,
3053 arg_uid_shift,
3054 arg_uid_range,
3055 arg_selinux_context);
03cfe0d5
LP
3056 if (r < 0)
3057 return r;
3058
0de7acce
LP
3059 r = setup_volatile_state(
3060 directory,
3061 arg_volatile_mode,
3062 arg_userns_mode != USER_NAMESPACE_NO,
3063 arg_uid_shift,
3064 arg_uid_range,
3065 arg_selinux_context);
03cfe0d5
LP
3066 if (r < 0)
3067 return r;
3068
03cfe0d5
LP
3069 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3070 if (r < 0)
3071 return r;
3072
03cfe0d5 3073 if (arg_read_only) {
6b7c9f8b 3074 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3075 if (r < 0)
3076 return log_error_errno(r, "Failed to make tree read-only: %m");
3077 }
3078
0de7acce 3079 r = mount_all(directory,
4f086aab 3080 arg_mount_settings,
0de7acce
LP
3081 arg_uid_shift,
3082 arg_uid_range,
3083 arg_selinux_apifs_context);
03cfe0d5
LP
3084 if (r < 0)
3085 return r;
3086
07fa00f9
LP
3087 r = copy_devnodes(directory);
3088 if (r < 0)
03cfe0d5
LP
3089 return r;
3090
3091 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3092
07fa00f9
LP
3093 r = setup_pts(directory);
3094 if (r < 0)
03cfe0d5
LP
3095 return r;
3096
3097 r = setup_propagate(directory);
3098 if (r < 0)
3099 return r;
3100
3101 r = setup_dev_console(directory, console);
3102 if (r < 0)
3103 return r;
3104
520e0d54 3105 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3106 if (r < 0)
3107 return r;
3108
3109 r = setup_timezone(directory);
3110 if (r < 0)
3111 return r;
3112
3113 r = setup_resolv_conf(directory);
3114 if (r < 0)
3115 return r;
3116
e01ff70a
MS
3117 r = setup_machine_id(directory);
3118 if (r < 0)
3119 return r;
3120
03cfe0d5
LP
3121 r = setup_journal(directory);
3122 if (r < 0)
3123 return r;
3124
0de7acce
LP
3125 r = mount_custom(
3126 directory,
3127 arg_custom_mounts,
3128 arg_n_custom_mounts,
3129 arg_userns_mode != USER_NAMESPACE_NO,
3130 arg_uid_shift,
3131 arg_uid_range,
3132 arg_selinux_apifs_context);
03cfe0d5
LP
3133 if (r < 0)
3134 return r;
3135
5a8ff0e6 3136 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3137 r = mount_cgroups(
3138 directory,
3139 arg_unified_cgroup_hierarchy,
3140 arg_userns_mode != USER_NAMESPACE_NO,
3141 arg_uid_shift,
3142 arg_uid_range,
5a8ff0e6 3143 arg_selinux_apifs_context,
ada54120 3144 false);
0996ef00
CB
3145 if (r < 0)
3146 return r;
3147 }
03cfe0d5
LP
3148
3149 r = mount_move_root(directory);
3150 if (r < 0)
3151 return log_error_errno(r, "Failed to move root directory: %m");
3152
9c1e04d0
AP
3153 fd = setup_sd_notify_child();
3154 if (fd < 0)
3155 return fd;
3156
03cfe0d5 3157 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3158 arg_clone_ns_flags |
03cfe0d5 3159 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3160 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3161 if (pid < 0)
3162 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3163 if (pid == 0) {
3164 pid_socket = safe_close(pid_socket);
e01ff70a 3165 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3166 notify_socket = safe_close(notify_socket);
825d5287 3167 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3168
3169 /* The inner child has all namespaces that are
3170 * requested, so that we all are owned by the user if
3171 * user namespaces are turned on. */
3172
f757855e 3173 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3174 if (r < 0)
3175 _exit(EXIT_FAILURE);
3176
3177 _exit(EXIT_SUCCESS);
3178 }
3179
3180 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3181 if (l < 0)
3182 return log_error_errno(errno, "Failed to send PID: %m");
3183 if (l != sizeof(pid)) {
3184 log_error("Short write while sending PID.");
3185 return -EIO;
3186 }
3187
e01ff70a
MS
3188 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3189 if (l < 0)
3190 return log_error_errno(errno, "Failed to send machine ID: %m");
3191 if (l != sizeof(arg_uuid)) {
3192 log_error("Short write while sending machine ID.");
3193 return -EIO;
3194 }
3195
9c1e04d0
AP
3196 l = send_one_fd(notify_socket, fd, 0);
3197 if (l < 0)
3198 return log_error_errno(errno, "Failed to send notify fd: %m");
3199
03cfe0d5 3200 pid_socket = safe_close(pid_socket);
e01ff70a 3201 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3202 notify_socket = safe_close(notify_socket);
327e26d6
KN
3203 kmsg_socket = safe_close(kmsg_socket);
3204 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3205
3206 return 0;
3207}
3208
0e7ac751
LP
3209static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3210 unsigned n_tries = 100;
3211 uid_t candidate;
3212 int r;
3213
3214 assert(shift);
3215 assert(ret_lock_file);
0de7acce 3216 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3217 assert(arg_uid_range == 0x10000U);
3218
3219 candidate = *shift;
3220
3221 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3222
3223 for (;;) {
3224 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3225 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3226
3227 if (--n_tries <= 0)
3228 return -EBUSY;
3229
3230 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3231 goto next;
3232 if ((candidate & UINT32_C(0xFFFF)) != 0)
3233 goto next;
3234
3235 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3236 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3237 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3238 goto next;
3239 if (r < 0)
3240 return r;
3241
3242 /* Make some superficial checks whether the range is currently known in the user database */
3243 if (getpwuid(candidate))
3244 goto next;
3245 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3246 goto next;
3247 if (getgrgid(candidate))
3248 goto next;
3249 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3250 goto next;
3251
3252 *ret_lock_file = lf;
3253 lf = (struct LockFile) LOCK_FILE_INIT;
3254 *shift = candidate;
3255 return 0;
3256
3257 next:
3258 random_bytes(&candidate, sizeof(candidate));
3259 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3260 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3261 }
3262}
3263
03cfe0d5
LP
3264static int setup_uid_map(pid_t pid) {
3265 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3266 int r;
3267
3268 assert(pid > 1);
3269
3270 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3271 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3272 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3273 if (r < 0)
3274 return log_error_errno(r, "Failed to write UID map: %m");
3275
3276 /* We always assign the same UID and GID ranges */
3277 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3278 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3279 if (r < 0)
3280 return log_error_errno(r, "Failed to write GID map: %m");
3281
3282 return 0;
3283}
3284
9c1e04d0 3285static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3286 char buf[NOTIFY_BUFFER_MAX+1];
3287 char *p = NULL;
3288 struct iovec iovec = {
3289 .iov_base = buf,
3290 .iov_len = sizeof(buf)-1,
3291 };
3292 union {
3293 struct cmsghdr cmsghdr;
3294 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3295 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3296 } control = {};
3297 struct msghdr msghdr = {
3298 .msg_iov = &iovec,
3299 .msg_iovlen = 1,
3300 .msg_control = &control,
3301 .msg_controllen = sizeof(control),
3302 };
3303 struct cmsghdr *cmsg;
3304 struct ucred *ucred = NULL;
3305 ssize_t n;
3306 pid_t inner_child_pid;
3307 _cleanup_strv_free_ char **tags = NULL;
3308
3309 assert(userdata);
3310
3311 inner_child_pid = PTR_TO_PID(userdata);
3312
3313 if (revents != EPOLLIN) {
3314 log_warning("Got unexpected poll event for notify fd.");
3315 return 0;
3316 }
3317
3318 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3319 if (n < 0) {
3320 if (errno == EAGAIN || errno == EINTR)
3321 return 0;
3322
3323 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3324 }
3325 cmsg_close_all(&msghdr);
3326
3327 CMSG_FOREACH(cmsg, &msghdr) {
3328 if (cmsg->cmsg_level == SOL_SOCKET &&
3329 cmsg->cmsg_type == SCM_CREDENTIALS &&
3330 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3331
3332 ucred = (struct ucred*) CMSG_DATA(cmsg);
3333 }
3334 }
3335
3336 if (!ucred || ucred->pid != inner_child_pid) {
3337 log_warning("Received notify message without valid credentials. Ignoring.");
3338 return 0;
3339 }
3340
3341 if ((size_t) n >= sizeof(buf)) {
3342 log_warning("Received notify message exceeded maximum size. Ignoring.");
3343 return 0;
3344 }
3345
3346 buf[n] = 0;
3347 tags = strv_split(buf, "\n\r");
3348 if (!tags)
3349 return log_oom();
3350
3351 if (strv_find(tags, "READY=1"))
3352 sd_notifyf(false, "READY=1\n");
3353
3354 p = strv_find_startswith(tags, "STATUS=");
3355 if (p)
3356 sd_notifyf(false, "STATUS=Container running: %s", p);
3357
3358 return 0;
3359}
3360
3361static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3362 int r;
3363 sd_event_source *notify_event_source;
3364
3365 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3366 if (r < 0)
3367 return log_error_errno(r, "Failed to allocate notify event source: %m");
3368
3369 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3370
3371 return 0;
3372}
3373
f757855e
LP
3374static int load_settings(void) {
3375 _cleanup_(settings_freep) Settings *settings = NULL;
3376 _cleanup_fclose_ FILE *f = NULL;
3377 _cleanup_free_ char *p = NULL;
3378 const char *fn, *i;
3379 int r;
3380
3381 /* If all settings are masked, there's no point in looking for
3382 * the settings file */
3383 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3384 return 0;
3385
3386 fn = strjoina(arg_machine, ".nspawn");
3387
3388 /* We first look in the admin's directories in /etc and /run */
3389 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3390 _cleanup_free_ char *j = NULL;
3391
605405c6 3392 j = strjoin(i, "/", fn);
f757855e
LP
3393 if (!j)
3394 return log_oom();
3395
3396 f = fopen(j, "re");
3397 if (f) {
3398 p = j;
3399 j = NULL;
3400
b938cb90 3401 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3402 if (arg_settings_trusted < 0)
3403 arg_settings_trusted = true;
3404
3405 break;
3406 }
3407
3408 if (errno != ENOENT)
3409 return log_error_errno(errno, "Failed to open %s: %m", j);
3410 }
3411
3412 if (!f) {
3413 /* After that, let's look for a file next to the
3414 * actual image we shall boot. */
3415
3416 if (arg_image) {
3417 p = file_in_same_dir(arg_image, fn);
3418 if (!p)
3419 return log_oom();
3420 } else if (arg_directory) {
3421 p = file_in_same_dir(arg_directory, fn);
3422 if (!p)
3423 return log_oom();
3424 }
3425
3426 if (p) {
3427 f = fopen(p, "re");
3428 if (!f && errno != ENOENT)
3429 return log_error_errno(errno, "Failed to open %s: %m", p);
3430
b938cb90 3431 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3432 if (arg_settings_trusted < 0)
3433 arg_settings_trusted = false;
3434 }
3435 }
3436
3437 if (!f)
3438 return 0;
3439
3440 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3441
3442 r = settings_load(f, p, &settings);
3443 if (r < 0)
3444 return r;
3445
3446 /* Copy over bits from the settings, unless they have been
3447 * explicitly masked by command line switches. */
3448
7732f92b
LP
3449 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3450 settings->start_mode >= 0) {
3451 arg_start_mode = settings->start_mode;
f757855e
LP
3452
3453 strv_free(arg_parameters);
3454 arg_parameters = settings->parameters;
3455 settings->parameters = NULL;
3456 }
3457
5f932eb9
LP
3458 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3459 settings->working_directory) {
3460 free(arg_chdir);
3461 arg_chdir = settings->working_directory;
3462 settings->working_directory = NULL;
3463 }
3464
f757855e
LP
3465 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3466 settings->environment) {
3467 strv_free(arg_setenv);
3468 arg_setenv = settings->environment;
3469 settings->environment = NULL;
3470 }
3471
3472 if ((arg_settings_mask & SETTING_USER) == 0 &&
3473 settings->user) {
3474 free(arg_user);
3475 arg_user = settings->user;
3476 settings->user = NULL;
3477 }
3478
3479 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3480 uint64_t plus;
f757855e 3481
0e265674
LP
3482 plus = settings->capability;
3483 if (settings_private_network(settings))
3484 plus |= (1ULL << CAP_NET_ADMIN);
3485
3486 if (!arg_settings_trusted && plus != 0) {
3487 if (settings->capability != 0)
3488 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3489 } else
520e0d54 3490 arg_caps_retain |= plus;
f757855e 3491
520e0d54 3492 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3493 }
3494
3495 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3496 settings->kill_signal > 0)
3497 arg_kill_signal = settings->kill_signal;
3498
3499 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3500 settings->personality != PERSONALITY_INVALID)
3501 arg_personality = settings->personality;
3502
3503 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3504 !sd_id128_is_null(settings->machine_id)) {
3505
3506 if (!arg_settings_trusted)
3507 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3508 else
3509 arg_uuid = settings->machine_id;
3510 }
3511
3512 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3513 settings->read_only >= 0)
3514 arg_read_only = settings->read_only;
3515
3516 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3517 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3518 arg_volatile_mode = settings->volatile_mode;
3519
3520 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3521 settings->n_custom_mounts > 0) {
3522
3523 if (!arg_settings_trusted)
3524 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3525 else {
3526 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3527 arg_custom_mounts = settings->custom_mounts;
3528 arg_n_custom_mounts = settings->n_custom_mounts;
3529
3530 settings->custom_mounts = NULL;
3531 settings->n_custom_mounts = 0;
3532 }
3533 }
3534
3535 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3536 (settings->private_network >= 0 ||
3537 settings->network_veth >= 0 ||
3538 settings->network_bridge ||
22b28dfd 3539 settings->network_zone ||
f757855e
LP
3540 settings->network_interfaces ||
3541 settings->network_macvlan ||
f6d6bad1
LP
3542 settings->network_ipvlan ||
3543 settings->network_veth_extra)) {
f757855e
LP
3544
3545 if (!arg_settings_trusted)
3546 log_warning("Ignoring network settings, file %s is not trusted.", p);
3547 else {
f6d6bad1 3548 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3549 arg_private_network = settings_private_network(settings);
3550
f757855e
LP
3551 strv_free(arg_network_interfaces);
3552 arg_network_interfaces = settings->network_interfaces;
3553 settings->network_interfaces = NULL;
3554
3555 strv_free(arg_network_macvlan);
3556 arg_network_macvlan = settings->network_macvlan;
3557 settings->network_macvlan = NULL;
3558
3559 strv_free(arg_network_ipvlan);
3560 arg_network_ipvlan = settings->network_ipvlan;
3561 settings->network_ipvlan = NULL;
3562
f6d6bad1
LP
3563 strv_free(arg_network_veth_extra);
3564 arg_network_veth_extra = settings->network_veth_extra;
3565 settings->network_veth_extra = NULL;
3566
f757855e
LP
3567 free(arg_network_bridge);
3568 arg_network_bridge = settings->network_bridge;
3569 settings->network_bridge = NULL;
22b28dfd
LP
3570
3571 free(arg_network_zone);
3572 arg_network_zone = settings->network_zone;
3573 settings->network_zone = NULL;
f757855e
LP
3574 }
3575 }
3576
3577 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3578 settings->expose_ports) {
3579
3580 if (!arg_settings_trusted)
3581 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3582 else {
3583 expose_port_free_all(arg_expose_ports);
3584 arg_expose_ports = settings->expose_ports;
3585 settings->expose_ports = NULL;
3586 }
3587 }
3588
0de7acce
LP
3589 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3590 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3591
3592 if (!arg_settings_trusted)
3593 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3594 else {
3595 arg_userns_mode = settings->userns_mode;
3596 arg_uid_shift = settings->uid_shift;
3597 arg_uid_range = settings->uid_range;
3598 arg_userns_chown = settings->userns_chown;
3599 }
3600 }
3601
9c1e04d0
AP
3602 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3603 arg_notify_ready = settings->notify_ready;
3604
f757855e
LP
3605 return 0;
3606}
3607
b0067625
ZJS
3608static int run(int master,
3609 const char* console,
3610 const char *root_device, bool root_device_rw,
3611 const char *home_device, bool home_device_rw,
3612 const char *srv_device, bool srv_device_rw,
3613 const char *esp_device,
3614 bool interactive,
3615 bool secondary,
3616 FDSet *fds,
3617 char veth_name[IFNAMSIZ], bool *veth_created,
3618 union in_addr_union *exposed,
3619 pid_t *pid, int *ret) {
3620
3621 static const struct sigaction sa = {
3622 .sa_handler = nop_signal_handler,
e28c7cd0 3623 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3624 };
3625
3626 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3627 _cleanup_close_ int etc_passwd_lock = -1;
3628 _cleanup_close_pair_ int
3629 kmsg_socket_pair[2] = { -1, -1 },
3630 rtnl_socket_pair[2] = { -1, -1 },
3631 pid_socket_pair[2] = { -1, -1 },
3632 uuid_socket_pair[2] = { -1, -1 },
3633 notify_socket_pair[2] = { -1, -1 },
3634 uid_shift_socket_pair[2] = { -1, -1 };
3635 _cleanup_close_ int notify_socket= -1;
3636 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3637 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3638 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3639 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3640 ContainerStatus container_status = 0;
3641 char last_char = 0;
3642 int ifi = 0, r;
3643 ssize_t l;
3644 sigset_t mask_chld;
3645
3646 assert_se(sigemptyset(&mask_chld) == 0);
3647 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3648
3649 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3650 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3651 * check with getpwuid() if the specific user already exists. Note that /etc might be
3652 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3653 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3654 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3655 * really ours. */
3656
3657 etc_passwd_lock = take_etc_passwd_lock(NULL);
3658 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3659 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3660 }
3661
3662 r = barrier_create(&barrier);
3663 if (r < 0)
3664 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3665
3666 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3667 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3668
3669 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3670 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3671
3672 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3673 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3674
3675 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3676 return log_error_errno(errno, "Failed to create id socket pair: %m");
3677
3678 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3679 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3680
3681 if (arg_userns_mode != USER_NAMESPACE_NO)
3682 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3683 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3684
3685 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3686 * parent's blocking calls and give it a chance to call wait() and terminate. */
3687 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3688 if (r < 0)
3689 return log_error_errno(errno, "Failed to change the signal mask: %m");
3690
3691 r = sigaction(SIGCHLD, &sa, NULL);
3692 if (r < 0)
3693 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3694
3695 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3696 if (*pid < 0)
3697 return log_error_errno(errno, "clone() failed%s: %m",
3698 errno == EINVAL ?
3699 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3700
3701 if (*pid == 0) {
3702 /* The outer child only has a file system namespace. */
3703 barrier_set_role(&barrier, BARRIER_CHILD);
3704
3705 master = safe_close(master);
3706
3707 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3708 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3709 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3710 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3711 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3712 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3713
3714 (void) reset_all_signal_handlers();
3715 (void) reset_signal_mask();
3716
3717 r = outer_child(&barrier,
3718 arg_directory,
3719 console,
3720 root_device, root_device_rw,
3721 home_device, home_device_rw,
3722 srv_device, srv_device_rw,
3723 esp_device,
3724 interactive,
3725 secondary,
3726 pid_socket_pair[1],
3727 uuid_socket_pair[1],
3728 notify_socket_pair[1],
3729 kmsg_socket_pair[1],
3730 rtnl_socket_pair[1],
3731 uid_shift_socket_pair[1],
3732 fds);
3733 if (r < 0)
3734 _exit(EXIT_FAILURE);
3735
3736 _exit(EXIT_SUCCESS);
3737 }
3738
3739 barrier_set_role(&barrier, BARRIER_PARENT);
3740
3741 fds = fdset_free(fds);
3742
3743 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3744 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3745 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3746 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3747 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3748 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3749
3750 if (arg_userns_mode != USER_NAMESPACE_NO) {
3751 /* The child just let us know the UID shift it might have read from the image. */
3752 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3753 if (l < 0)
3754 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3755 if (l != sizeof arg_uid_shift) {
3756 log_error("Short read while reading UID shift.");
3757 return -EIO;
3758 }
3759
3760 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3761 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3762 * image, but if that's already in use, pick a new one, and report back to the child,
3763 * which one we now picked. */
3764
3765 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3766 if (r < 0)
3767 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3768
3769 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3770 if (l < 0)
3771 return log_error_errno(errno, "Failed to send UID shift: %m");
3772 if (l != sizeof arg_uid_shift) {
3773 log_error("Short write while writing UID shift.");
3774 return -EIO;
3775 }
3776 }
3777 }
3778
3779 /* Wait for the outer child. */
3780 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3781 if (r != 0)
3782 return r < 0 ? r : -EIO;
3783
3784 /* And now retrieve the PID of the inner child. */
3785 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3786 if (l < 0)
3787 return log_error_errno(errno, "Failed to read inner child PID: %m");
3788 if (l != sizeof *pid) {
3789 log_error("Short read while reading inner child PID.");
3790 return -EIO;
3791 }
3792
3793 /* We also retrieve container UUID in case it was generated by outer child */
3794 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3795 if (l < 0)
3796 return log_error_errno(errno, "Failed to read container machine ID: %m");
3797 if (l != sizeof(arg_uuid)) {
3798 log_error("Short read while reading container machined ID.");
3799 return -EIO;
3800 }
3801
3802 /* We also retrieve the socket used for notifications generated by outer child */
3803 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3804 if (notify_socket < 0)
3805 return log_error_errno(notify_socket,
3806 "Failed to receive notification socket from the outer child: %m");
3807
3808 log_debug("Init process invoked as PID "PID_FMT, *pid);
3809
3810 if (arg_userns_mode != USER_NAMESPACE_NO) {
3811 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3812 log_error("Child died too early.");
3813 return -ESRCH;
3814 }
3815
3816 r = setup_uid_map(*pid);
3817 if (r < 0)
3818 return r;
3819
3820 (void) barrier_place(&barrier); /* #2 */
3821 }
3822
3823 if (arg_private_network) {
3824
3825 r = move_network_interfaces(*pid, arg_network_interfaces);
3826 if (r < 0)
3827 return r;
3828
3829 if (arg_network_veth) {
3830 r = setup_veth(arg_machine, *pid, veth_name,
3831 arg_network_bridge || arg_network_zone);
3832 if (r < 0)
3833 return r;
3834 else if (r > 0)
3835 ifi = r;
3836
3837 if (arg_network_bridge) {
3838 /* Add the interface to a bridge */
3839 r = setup_bridge(veth_name, arg_network_bridge, false);
3840 if (r < 0)
3841 return r;
3842 if (r > 0)
3843 ifi = r;
3844 } else if (arg_network_zone) {
3845 /* Add the interface to a bridge, possibly creating it */
3846 r = setup_bridge(veth_name, arg_network_zone, true);
3847 if (r < 0)
3848 return r;
3849 if (r > 0)
3850 ifi = r;
3851 }
3852 }
3853
3854 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3855 if (r < 0)
3856 return r;
3857
3858 /* We created the primary and extra veth links now; let's remember this, so that we know to
3859 remove them later on. Note that we don't bother with removing veth links that were created
3860 here when their setup failed half-way, because in that case the kernel should be able to
3861 remove them on its own, since they cannot be referenced by anything yet. */
3862 *veth_created = true;
3863
3864 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3865 if (r < 0)
3866 return r;
3867
3868 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3869 if (r < 0)
3870 return r;
3871 }
3872
3873 if (arg_register) {
3874 r = register_machine(
3875 arg_machine,
3876 *pid,
3877 arg_directory,
3878 arg_uuid,
3879 ifi,
3880 arg_slice,
3881 arg_custom_mounts, arg_n_custom_mounts,
3882 arg_kill_signal,
3883 arg_property,
3884 arg_keep_unit,
3885 arg_container_service_name);
3886 if (r < 0)
3887 return r;
3888 }
3889
f0bef277 3890 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3891 if (r < 0)
3892 return r;
3893
3894 if (arg_keep_unit) {
3895 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3896 if (r < 0)
3897 return r;
3898 }
3899
3900 r = chown_cgroup(*pid, arg_uid_shift);
3901 if (r < 0)
3902 return r;
3903
3904 /* Notify the child that the parent is ready with all
3905 * its setup (including cgroup-ification), and that
3906 * the child can now hand over control to the code to
3907 * run inside the container. */
3908 (void) barrier_place(&barrier); /* #3 */
3909
3910 /* Block SIGCHLD here, before notifying child.
3911 * process_pty() will handle it with the other signals. */
3912 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3913
3914 /* Reset signal to default */
3915 r = default_signals(SIGCHLD, -1);
3916 if (r < 0)
3917 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3918
3919 r = sd_event_new(&event);
3920 if (r < 0)
3921 return log_error_errno(r, "Failed to get default event source: %m");
3922
3923 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
3924 if (r < 0)
3925 return r;
3926
3927 /* Let the child know that we are ready and wait that the child is completely ready now. */
3928 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3929 log_error("Child died too early.");
3930 return -ESRCH;
3931 }
3932
3933 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3934 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3935 etc_passwd_lock = safe_close(etc_passwd_lock);
3936
3937 sd_notifyf(false,
3938 "STATUS=Container running.\n"
3939 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3940 if (!arg_notify_ready)
3941 sd_notify(false, "READY=1\n");
3942
3943 if (arg_kill_signal > 0) {
3944 /* Try to kill the init system on SIGINT or SIGTERM */
3945 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3946 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3947 } else {
3948 /* Immediately exit */
3949 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3950 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3951 }
3952
3953 /* simply exit on sigchld */
3954 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3955
3956 if (arg_expose_ports) {
3957 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3958 if (r < 0)
3959 return r;
3960
3961 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3962 }
3963
3964 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3965
3966 r = pty_forward_new(event, master,
3967 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3968 &forward);
3969 if (r < 0)
3970 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3971
3972 r = sd_event_loop(event);
3973 if (r < 0)
3974 return log_error_errno(r, "Failed to run event loop: %m");
3975
3976 pty_forward_get_last_char(forward, &last_char);
3977
3978 forward = pty_forward_free(forward);
3979
3980 if (!arg_quiet && last_char != '\n')
3981 putc('\n', stdout);
3982
3983 /* Kill if it is not dead yet anyway */
3984 if (arg_register && !arg_keep_unit)
3985 terminate_machine(*pid);
3986
3987 /* Normally redundant, but better safe than sorry */
c67b0082 3988 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3989
3990 r = wait_for_container(*pid, &container_status);
3991 *pid = 0;
3992
3993 if (r < 0)
3994 /* We failed to wait for the container, or the container exited abnormally. */
3995 return r;
3996 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3997 /* r > 0 → The container exited with a non-zero status.
3998 * As a special case, we need to replace 133 with a different value,
3999 * because 133 is special-cased in the service file to reboot the container.
4000 * otherwise → The container exited with zero status and a reboot was not requested.
4001 */
2a49b612 4002 if (r == EXIT_FORCE_RESTART)
27e29a1e 4003 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4004 *ret = r;
b0067625
ZJS
4005 return 0; /* finito */
4006 }
4007
4008 /* CONTAINER_REBOOTED, loop again */
4009
4010 if (arg_keep_unit) {
4011 /* Special handling if we are running as a service: instead of simply
4012 * restarting the machine we want to restart the entire service, so let's
4013 * inform systemd about this with the special exit code 133. The service
4014 * file uses RestartForceExitStatus=133 so that this results in a full
4015 * nspawn restart. This is necessary since we might have cgroup parameters
4016 * set we want to have flushed out. */
2a49b612
ZJS
4017 *ret = EXIT_FORCE_RESTART;
4018 return 0; /* finito */
b0067625
ZJS
4019 }
4020
4021 expose_port_flush(arg_expose_ports, exposed);
4022
4023 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4024 *veth_created = false;
4025 return 1; /* loop again */
4026}
4027
03cfe0d5
LP
4028int main(int argc, char *argv[]) {
4029
a6bc7db9 4030 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
4031 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4032 _cleanup_close_ int master = -1, image_fd = -1;
4033 _cleanup_fdset_free_ FDSet *fds = NULL;
cfed63f6 4034 int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS;
5aa3eba5 4035 char veth_name[IFNAMSIZ] = "";
17cbb288 4036 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4037 pid_t pid = 0;
03cfe0d5
LP
4038 union in_addr_union exposed = {};
4039 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4040 bool interactive, veth_created = false, remove_tmprootdir = false;
4041 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
03cfe0d5
LP
4042
4043 log_parse_environment();
4044 log_open();
4045
7732f92b
LP
4046 /* Make sure rename_process() in the stub init process can work */
4047 saved_argv = argv;
4048 saved_argc = argc;
4049
03cfe0d5
LP
4050 r = parse_argv(argc, argv);
4051 if (r <= 0)
4052 goto finish;
4053
03cfe0d5
LP
4054 if (geteuid() != 0) {
4055 log_error("Need to be root.");
4056 r = -EPERM;
4057 goto finish;
4058 }
f757855e
LP
4059 r = determine_names();
4060 if (r < 0)
4061 goto finish;
4062
4063 r = load_settings();
4064 if (r < 0)
4065 goto finish;
4066
4067 r = verify_arguments();
4068 if (r < 0)
4069 goto finish;
03cfe0d5
LP
4070
4071 n_fd_passed = sd_listen_fds(false);
4072 if (n_fd_passed > 0) {
4073 r = fdset_new_listen_fds(&fds, false);
4074 if (r < 0) {
4075 log_error_errno(r, "Failed to collect file descriptors: %m");
4076 goto finish;
4077 }
4078 }
4079
4080 if (arg_directory) {
4081 assert(!arg_image);
4082
4083 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4084 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4085 r = -EINVAL;
4086 goto finish;
4087 }
4088
4089 if (arg_ephemeral) {
4090 _cleanup_free_ char *np = NULL;
4091
8d4aa2bb 4092 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4093 if (r < 0)
4094 goto finish;
4095
03cfe0d5
LP
4096 /* If the specified path is a mount point we
4097 * generate the new snapshot immediately
4098 * inside it under a random name. However if
4099 * the specified is not a mount point we
4100 * create the new snapshot in the parent
4101 * directory, just next to it. */
e1873695 4102 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4103 if (r < 0) {
4104 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4105 goto finish;
4106 }
4107 if (r > 0)
770b5ce4 4108 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4109 else
770b5ce4 4110 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4111 if (r < 0) {
0f3be6ca 4112 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4113 goto finish;
4114 }
4115
4116 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4117 if (r < 0) {
4118 log_error_errno(r, "Failed to lock %s: %m", np);
4119 goto finish;
4120 }
4121
17cbb288
LP
4122 r = btrfs_subvol_snapshot(arg_directory, np,
4123 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4124 BTRFS_SNAPSHOT_FALLBACK_COPY |
4125 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4126 BTRFS_SNAPSHOT_RECURSIVE |
4127 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4128 if (r < 0) {
4129 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4130 goto finish;
ec16945e
LP
4131 }
4132
4133 free(arg_directory);
4134 arg_directory = np;
8a16a7b4 4135 np = NULL;
ec16945e 4136
17cbb288 4137 remove_directory = true;
30535c16
LP
4138
4139 } else {
cb638b5e 4140 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4141 if (r < 0)
4142 goto finish;
4143
30535c16
LP
4144 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4145 if (r == -EBUSY) {
4146 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4147 goto finish;
4148 }
4149 if (r < 0) {
4150 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4151 goto finish;
30535c16
LP
4152 }
4153
4154 if (arg_template) {
8d4aa2bb 4155 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4156 if (r < 0)
4157 goto finish;
4158
17cbb288
LP
4159 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4160 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4161 BTRFS_SNAPSHOT_FALLBACK_COPY |
4162 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4163 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4164 BTRFS_SNAPSHOT_RECURSIVE |
4165 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4166 if (r == -EEXIST) {
4167 if (!arg_quiet)
4168 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4169 } else if (r < 0) {
83521414 4170 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4171 goto finish;
4172 } else {
4173 if (!arg_quiet)
4174 log_info("Populated %s from template %s.", arg_directory, arg_template);
4175 }
4176 }
ec16945e
LP
4177 }
4178
7732f92b 4179 if (arg_start_mode == START_BOOT) {
1b9e5b12 4180 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4181 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4182 r = -EINVAL;
1b9e5b12
LP
4183 goto finish;
4184 }
4185 } else {
4186 const char *p;
4187
16fb773e
LP
4188 p = strjoina(arg_directory, "/usr/");
4189 if (laccess(p, F_OK) < 0) {
4190 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 4191 r = -EINVAL;
1b9e5b12 4192 goto finish;
1b9e5b12
LP
4193 }
4194 }
ec16945e 4195
6b9132a9 4196 } else {
ec16945e
LP
4197 assert(arg_image);
4198 assert(!arg_template);
4199
8d4aa2bb 4200 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4201 if (r < 0)
4202 goto finish;
4203
0f3be6ca
LP
4204 if (arg_ephemeral) {
4205 _cleanup_free_ char *np = NULL;
4206
4207 r = tempfn_random(arg_image, "machine.", &np);
4208 if (r < 0) {
4209 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4210 goto finish;
4211 }
4212
4213 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4214 if (r < 0) {
4215 r = log_error_errno(r, "Failed to create image lock: %m");
4216 goto finish;
4217 }
4218
4219 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL);
4220 if (r < 0) {
4221 r = log_error_errno(r, "Failed to copy image file: %m");
4222 goto finish;
4223 }
4224
4225 free(arg_image);
4226 arg_image = np;
4227 np = NULL;
4228
4229 remove_image = true;
4230 } else {
4231 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4232 if (r == -EBUSY) {
4233 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4234 goto finish;
4235 }
4236 if (r < 0) {
4237 r = log_error_errno(r, "Failed to create image lock: %m");
4238 goto finish;
4239 }
30535c16
LP
4240 }
4241
c67b0082 4242 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4243 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4244 goto finish;
1b9e5b12 4245 }
6b9132a9 4246
c67b0082
LP
4247 remove_tmprootdir = true;
4248
4249 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4250 if (!arg_directory) {
4251 r = log_oom();
4252 goto finish;
6b9132a9 4253 }
88213476 4254
1b9e5b12
LP
4255 image_fd = setup_image(&device_path, &loop_nr);
4256 if (image_fd < 0) {
4257 r = image_fd;
842f3b0f
LP
4258 goto finish;
4259 }
1b9e5b12 4260
4d9f07b4
LP
4261 r = dissect_image(image_fd,
4262 &root_device, &root_device_rw,
4263 &home_device, &home_device_rw,
4264 &srv_device, &srv_device_rw,
a6bc7db9 4265 &esp_device,
4d9f07b4 4266 &secondary);
1b9e5b12
LP
4267 if (r < 0)
4268 goto finish;
0f3be6ca
LP
4269
4270 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4271 if (remove_image && unlink(arg_image) >= 0)
4272 remove_image = false;
842f3b0f 4273 }
842f3b0f 4274
86c0dd4a 4275 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4276 if (r < 0)
4277 goto finish;
4278
03cfe0d5
LP
4279 interactive =
4280 isatty(STDIN_FILENO) > 0 &&
4281 isatty(STDOUT_FILENO) > 0;
9c857b9d 4282
db7feb7e
LP
4283 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4284 if (master < 0) {
ec16945e 4285 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4286 goto finish;
4287 }
4288
611b312b
LP
4289 r = ptsname_malloc(master, &console);
4290 if (r < 0) {
4291 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4292 goto finish;
68b02049
DW
4293 }
4294
4295 if (arg_selinux_apifs_context) {
4296 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4297 if (r < 0)
4298 goto finish;
a258bf26
LP
4299 }
4300
a258bf26 4301 if (unlockpt(master) < 0) {
ec16945e 4302 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4303 goto finish;
4304 }
4305
9c857b9d
LP
4306 if (!arg_quiet)
4307 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4308 arg_machine, arg_image ?: arg_directory);
4309
72c0a2c2 4310 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4311
03cfe0d5
LP
4312 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4313 r = log_error_errno(errno, "Failed to become subreaper: %m");
4314 goto finish;
4315 }
4316
d87be9b0 4317 for (;;) {
b0067625
ZJS
4318 r = run(master,
4319 console,
4320 root_device, root_device_rw,
4321 home_device, home_device_rw,
4322 srv_device, srv_device_rw,
4323 esp_device,
4324 interactive, secondary,
4325 fds,
4326 veth_name, &veth_created,
4327 &exposed,
4328 &pid, &ret);
4329 if (r <= 0)
d87be9b0 4330 break;
d87be9b0 4331 }
88213476
LP
4332
4333finish:
af4ec430 4334 sd_notify(false,
2a49b612
ZJS
4335 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4336 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4337
9444b1f2 4338 if (pid > 0)
c67b0082 4339 (void) kill(pid, SIGKILL);
88213476 4340
503546da 4341 /* Try to flush whatever is still queued in the pty */
6a0f896b 4342 if (master >= 0) {
59f448cf 4343 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
6a0f896b
LP
4344 master = safe_close(master);
4345 }
4346
4347 if (pid > 0)
4348 (void) wait_for_terminate(pid, NULL);
503546da 4349
03cfe0d5
LP
4350 loop_remove(loop_nr, &image_fd);
4351
17cbb288 4352 if (remove_directory && arg_directory) {
ec16945e
LP
4353 int k;
4354
17cbb288 4355 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4356 if (k < 0)
17cbb288 4357 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4358 }
4359
0f3be6ca
LP
4360 if (remove_image && arg_image) {
4361 if (unlink(arg_image) < 0)
4362 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4363 }
4364
c67b0082
LP
4365 if (remove_tmprootdir) {
4366 if (rmdir(tmprootdir) < 0)
4367 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4368 }
4369
785890ac
LP
4370 if (arg_machine) {
4371 const char *p;
4372
63c372cb 4373 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4374 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4375 }
4376
7a8f6325 4377 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4378
4379 if (veth_created)
4380 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4381 (void) remove_bridge(arg_network_zone);
f757855e 4382
04d391da 4383 free(arg_directory);
ec16945e
LP
4384 free(arg_template);
4385 free(arg_image);
7027ff61 4386 free(arg_machine);
c74e630d 4387 free(arg_user);
5f932eb9 4388 free(arg_chdir);
c74e630d 4389 strv_free(arg_setenv);
f757855e 4390 free(arg_network_bridge);
c74e630d
LP
4391 strv_free(arg_network_interfaces);
4392 strv_free(arg_network_macvlan);
4bbfe7ad 4393 strv_free(arg_network_ipvlan);
f6d6bad1 4394 strv_free(arg_network_veth_extra);
f757855e
LP
4395 strv_free(arg_parameters);
4396 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4397 expose_port_free_all(arg_expose_ports);
6d0b55c2 4398
ec16945e 4399 return r < 0 ? EXIT_FAILURE : ret;
88213476 4400}