]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #4843 from joukewitteveen/protocol
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
f97b34a6 60#include "format-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
0e7ac751 103/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
0e7ac751
LP
106#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 108
9c1e04d0
AP
/* nspawn listens on the socket whose path is given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the
 * container. The init process in the container can send messages to nspawn through this socket, following the
 * sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
2a49b612
ZJS
114#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* Final state of a container run.
 * NOTE(review): the code producing and consuming these values is outside this excerpt; judging by the names
 * they tell a plain termination apart from a reboot request. Confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
120
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-" variants of the option map to the
 * same values and are told apart by a separate flag, arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto */
        LINK_HOST  = 2,         /* --link-journal=host and try-host */
        LINK_GUEST = 3,         /* --link-journal=guest and try-guest, and -j */
} LinkJournal;
88213476
LP
127
128static char *arg_directory = NULL;
ec16945e 129static char *arg_template = NULL;
5f932eb9 130static char *arg_chdir = NULL;
687d0825 131static char *arg_user = NULL;
9444b1f2 132static sd_id128_t arg_uuid = {};
7027ff61 133static char *arg_machine = NULL;
c74e630d
LP
134static const char *arg_selinux_context = NULL;
135static const char *arg_selinux_apifs_context = NULL;
9444b1f2 136static const char *arg_slice = NULL;
ff01d048 137static bool arg_private_network = false;
bc2f673e 138static bool arg_read_only = false;
7732f92b 139static StartMode arg_start_mode = START_PID1;
ec16945e 140static bool arg_ephemeral = false;
57fb9fb5 141static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 142static bool arg_link_journal_try = false;
520e0d54 143static uint64_t arg_caps_retain =
50b52222
LP
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 155 (1ULL << CAP_MKNOD) |
5076f0cc
LP
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
5076f0cc 159 (1ULL << CAP_SETFCAP) |
50b52222 160 (1ULL << CAP_SETGID) |
5076f0cc
LP
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
50b52222 164 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
50b52222 169 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
170static CustomMount *arg_custom_mounts = NULL;
171static unsigned arg_n_custom_mounts = 0;
f4889f65 172static char **arg_setenv = NULL;
284c0b91 173static bool arg_quiet = false;
eb91eb18 174static bool arg_register = true;
89f7c846 175static bool arg_keep_unit = false;
aa28aefe 176static char **arg_network_interfaces = NULL;
c74e630d 177static char **arg_network_macvlan = NULL;
4bbfe7ad 178static char **arg_network_ipvlan = NULL;
69c79d3c 179static bool arg_network_veth = false;
f6d6bad1 180static char **arg_network_veth_extra = NULL;
f757855e 181static char *arg_network_bridge = NULL;
22b28dfd 182static char *arg_network_zone = NULL;
050f7277 183static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 184static char *arg_image = NULL;
f757855e 185static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 186static ExposePort *arg_expose_ports = NULL;
f36933fe 187static char **arg_property = NULL;
0de7acce 188static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 189static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 190static bool arg_userns_chown = false;
c6c8f6e2 191static int arg_kill_signal = 0;
5da38d07 192static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
193static SettingsMask arg_settings_mask = 0;
194static int arg_settings_trusted = -1;
195static char **arg_parameters = NULL;
6aadfa4c 196static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 197static bool arg_notify_ready = false;
5a8ff0e6 198static bool arg_use_cgns = true;
0c582db0 199static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 200static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
88213476 201
601185b4 202static void help(void) {
88213476
LP
203 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
204 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
205 " -h --help Show this help\n"
206 " --version Print version string\n"
69c79d3c 207 " -q --quiet Do not show status information\n"
1b9e5b12 208 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
209 " --template=PATH Initialize root directory from template directory,\n"
210 " if missing\n"
211 " -x --ephemeral Run container with snapshot of root directory, and\n"
212 " remove it after exit\n"
213 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 214 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 215 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 216 " --chdir=PATH Set working directory in the container\n"
a8828ed9 217 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 218 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 219 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 220 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 221 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 222 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 223 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 224 " Similar, but with user configured UID/GID range\n"
24597ee0 225 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
226 " --private-network Disable network in container\n"
227 " --network-interface=INTERFACE\n"
228 " Assign an existing network interface to the\n"
229 " container\n"
c74e630d
LP
230 " --network-macvlan=INTERFACE\n"
231 " Create a macvlan network interface based on an\n"
232 " existing network interface to the container\n"
4bbfe7ad
TG
233 " --network-ipvlan=INTERFACE\n"
234 " Create a ipvlan network interface based on an\n"
235 " existing network interface to the container\n"
a8eaaee7 236 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 237 " and container\n"
f6d6bad1
LP
238 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
239 " Add an additional virtual Ethernet link between\n"
240 " host and container\n"
ab046dde 241 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
242 " Add a virtual Ethernet connection to the container\n"
243 " and attach it to an existing bridge on the host\n"
244 " --network-zone=NAME Similar, but attach the new interface to an\n"
245 " an automatically managed bridge interface\n"
6d0b55c2 246 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 247 " Expose a container IP port on the host\n"
82adf6af
LP
248 " -Z --selinux-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " processes in the container\n"
251 " -L --selinux-apifs-context=SECLABEL\n"
252 " Set the SELinux security context to be used by\n"
253 " API/tmpfs file systems in the container\n"
a8828ed9
DW
254 " --capability=CAP In addition to the default, retain specified\n"
255 " capability\n"
256 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 257 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
258 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
259 " host, try-guest, try-host\n"
574edc90 260 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 261 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
262 " --bind=PATH[:PATH[:OPTIONS]]\n"
263 " Bind mount a file or directory from the host into\n"
a8828ed9 264 " the container\n"
5e5bfa6e
EY
265 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
266 " Similar, but creates a read-only bind mount\n"
06c17c39 267 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
268 " --overlay=PATH[:PATH...]:PATH\n"
269 " Create an overlay mount from the host to \n"
270 " the container\n"
271 " --overlay-ro=PATH[:PATH...]:PATH\n"
272 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 273 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 274 " --register=BOOLEAN Register container as machine\n"
89f7c846 275 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 276 " the service unit nspawn is running in\n"
6d0b55c2 277 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 278 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 279 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 280 , program_invocation_short_name);
88213476
LP
281}
282
86c0dd4a 283static int custom_mount_check_all(void) {
5a8af538 284 unsigned i;
5a8af538 285
5a8af538
LP
286 for (i = 0; i < arg_n_custom_mounts; i++) {
287 CustomMount *m = &arg_custom_mounts[i];
288
0de7acce 289 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
290
291 if (arg_userns_chown) {
292 log_error("--private-users-chown may not be combined with custom root mounts.");
293 return -EINVAL;
294 } else if (arg_uid_shift == UID_INVALID) {
295 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
296 return -EINVAL;
297 }
825d5287 298 }
5a8af538
LP
299 }
300
301 return 0;
302}
303
0fd9563f 304static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 305 const char *e;
5da38d07
TH
306 int r, all_unified, systemd_unified;
307
efdb0237
LP
308 /* Allow the user to control whether the unified hierarchy is used */
309 e = getenv("UNIFIED_CGROUP_HIERARCHY");
310 if (e) {
311 r = parse_boolean(e);
312 if (r < 0)
313 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
314 if (r > 0)
315 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
316 else
317 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 318
efdb0237
LP
319 return 0;
320 }
321
98afd6af
ZJS
322 all_unified = cg_all_unified();
323 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
324
325 if (all_unified < 0 || systemd_unified < 0)
326 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
327 "Failed to determine whether the unified cgroups hierarchy is used: %m");
328
efdb0237 329 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
330 if (all_unified > 0) {
331 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
332 * routine only detects 231, so we'll have a false negative here for 230. */
333 r = systemd_installation_has_version(directory, 230);
334 if (r < 0)
335 return log_error_errno(r, "Failed to determine systemd version in container: %m");
336 if (r > 0)
337 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
338 else
339 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
340 } else if (systemd_unified > 0) {
341 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
342 r = systemd_installation_has_version(directory, 232);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine systemd version in container: %m");
345 if (r > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
347 else
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
349 } else
5da38d07 350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 351
efdb0237
LP
352 return 0;
353}
354
0c582db0
LB
355static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
356 int r;
357
358 r = getenv_bool(name);
359 if (r == -ENXIO)
360 return;
361 if (r < 0)
362 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
363 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
364}
365
4f086aab
SU
366static void parse_mount_settings_env(void) {
367 int r;
368 const char *e;
369
370 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
371 if (!e)
372 return;
373
374 if (streq(e, "network")) {
375 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
376 return;
377 }
378
379 r = parse_boolean(e);
380 if (r < 0) {
381 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
382 return;
383 } else if (r > 0)
384 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
385 else
386 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
387
388 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
389}
390
88213476
LP
391static int parse_argv(int argc, char *argv[]) {
392
a41fe3a2 393 enum {
acbeb427
ZJS
394 ARG_VERSION = 0x100,
395 ARG_PRIVATE_NETWORK,
bc2f673e 396 ARG_UUID,
5076f0cc 397 ARG_READ_ONLY,
57fb9fb5 398 ARG_CAPABILITY,
420c7379 399 ARG_DROP_CAPABILITY,
17fe0523
LP
400 ARG_LINK_JOURNAL,
401 ARG_BIND,
f4889f65 402 ARG_BIND_RO,
06c17c39 403 ARG_TMPFS,
5a8af538
LP
404 ARG_OVERLAY,
405 ARG_OVERLAY_RO,
eb91eb18 406 ARG_SHARE_SYSTEM,
89f7c846 407 ARG_REGISTER,
aa28aefe 408 ARG_KEEP_UNIT,
69c79d3c 409 ARG_NETWORK_INTERFACE,
c74e630d 410 ARG_NETWORK_MACVLAN,
4bbfe7ad 411 ARG_NETWORK_IPVLAN,
ab046dde 412 ARG_NETWORK_BRIDGE,
22b28dfd 413 ARG_NETWORK_ZONE,
f6d6bad1 414 ARG_NETWORK_VETH_EXTRA,
6afc95b7 415 ARG_PERSONALITY,
4d9f07b4 416 ARG_VOLATILE,
ec16945e 417 ARG_TEMPLATE,
f36933fe 418 ARG_PROPERTY,
6dac160c 419 ARG_PRIVATE_USERS,
c6c8f6e2 420 ARG_KILL_SIGNAL,
f757855e 421 ARG_SETTINGS,
5f932eb9 422 ARG_CHDIR,
7336138e 423 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 424 ARG_NOTIFY_READY,
a41fe3a2
LP
425 };
426
88213476 427 static const struct option options[] = {
27eb8e90
ZJS
428 { "help", no_argument, NULL, 'h' },
429 { "version", no_argument, NULL, ARG_VERSION },
430 { "directory", required_argument, NULL, 'D' },
431 { "template", required_argument, NULL, ARG_TEMPLATE },
432 { "ephemeral", no_argument, NULL, 'x' },
433 { "user", required_argument, NULL, 'u' },
434 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
435 { "as-pid2", no_argument, NULL, 'a' },
436 { "boot", no_argument, NULL, 'b' },
437 { "uuid", required_argument, NULL, ARG_UUID },
438 { "read-only", no_argument, NULL, ARG_READ_ONLY },
439 { "capability", required_argument, NULL, ARG_CAPABILITY },
440 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
441 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
442 { "bind", required_argument, NULL, ARG_BIND },
443 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
444 { "tmpfs", required_argument, NULL, ARG_TMPFS },
445 { "overlay", required_argument, NULL, ARG_OVERLAY },
446 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
447 { "machine", required_argument, NULL, 'M' },
448 { "slice", required_argument, NULL, 'S' },
449 { "setenv", required_argument, NULL, 'E' },
450 { "selinux-context", required_argument, NULL, 'Z' },
451 { "selinux-apifs-context", required_argument, NULL, 'L' },
452 { "quiet", no_argument, NULL, 'q' },
453 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
454 { "register", required_argument, NULL, ARG_REGISTER },
455 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
456 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
457 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
458 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
459 { "network-veth", no_argument, NULL, 'n' },
460 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
461 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
462 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
463 { "personality", required_argument, NULL, ARG_PERSONALITY },
464 { "image", required_argument, NULL, 'i' },
465 { "volatile", optional_argument, NULL, ARG_VOLATILE },
466 { "port", required_argument, NULL, 'p' },
467 { "property", required_argument, NULL, ARG_PROPERTY },
468 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
469 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
470 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
471 { "settings", required_argument, NULL, ARG_SETTINGS },
472 { "chdir", required_argument, NULL, ARG_CHDIR },
473 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 474 {}
88213476
LP
475 };
476
9444b1f2 477 int c, r;
6aadfa4c 478 const char *p, *e;
a42c8b54 479 uint64_t plus = 0, minus = 0;
f757855e 480 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
481
482 assert(argc >= 0);
483 assert(argv);
484
19aac838 485 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
486
487 switch (c) {
488
489 case 'h':
601185b4
ZJS
490 help();
491 return 0;
88213476 492
acbeb427 493 case ARG_VERSION:
3f6fd1ba 494 return version();
acbeb427 495
88213476 496 case 'D':
0f03c2a4 497 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 498 if (r < 0)
0f03c2a4 499 return r;
ec16945e
LP
500 break;
501
502 case ARG_TEMPLATE:
0f03c2a4 503 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 504 if (r < 0)
0f03c2a4 505 return r;
88213476
LP
506 break;
507
1b9e5b12 508 case 'i':
0f03c2a4 509 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 510 if (r < 0)
0f03c2a4 511 return r;
ec16945e
LP
512 break;
513
514 case 'x':
515 arg_ephemeral = true;
1b9e5b12
LP
516 break;
517
687d0825 518 case 'u':
2fc09a9c
DM
519 r = free_and_strdup(&arg_user, optarg);
520 if (r < 0)
7027ff61 521 return log_oom();
687d0825 522
f757855e 523 arg_settings_mask |= SETTING_USER;
687d0825
MV
524 break;
525
22b28dfd
LP
526 case ARG_NETWORK_ZONE: {
527 char *j;
528
529 j = strappend("vz-", optarg);
530 if (!j)
531 return log_oom();
532
533 if (!ifname_valid(j)) {
534 log_error("Network zone name not valid: %s", j);
535 free(j);
536 return -EINVAL;
537 }
538
539 free(arg_network_zone);
540 arg_network_zone = j;
541
542 arg_network_veth = true;
543 arg_private_network = true;
544 arg_settings_mask |= SETTING_NETWORK;
545 break;
546 }
547
ab046dde 548 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
549
550 if (!ifname_valid(optarg)) {
551 log_error("Bridge interface name not valid: %s", optarg);
552 return -EINVAL;
553 }
554
f757855e
LP
555 r = free_and_strdup(&arg_network_bridge, optarg);
556 if (r < 0)
557 return log_oom();
ab046dde
TG
558
559 /* fall through */
560
0dfaa006 561 case 'n':
69c79d3c
LP
562 arg_network_veth = true;
563 arg_private_network = true;
f757855e 564 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
565 break;
566
f6d6bad1
LP
567 case ARG_NETWORK_VETH_EXTRA:
568 r = veth_extra_parse(&arg_network_veth_extra, optarg);
569 if (r < 0)
570 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
571
572 arg_private_network = true;
573 arg_settings_mask |= SETTING_NETWORK;
574 break;
575
aa28aefe 576 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
577
578 if (!ifname_valid(optarg)) {
579 log_error("Network interface name not valid: %s", optarg);
580 return -EINVAL;
581 }
582
c74e630d
LP
583 if (strv_extend(&arg_network_interfaces, optarg) < 0)
584 return log_oom();
585
586 arg_private_network = true;
f757855e 587 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
588 break;
589
590 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
591
592 if (!ifname_valid(optarg)) {
593 log_error("MACVLAN network interface name not valid: %s", optarg);
594 return -EINVAL;
595 }
596
c74e630d 597 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
598 return log_oom();
599
4bbfe7ad 600 arg_private_network = true;
f757855e 601 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
602 break;
603
604 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
605
606 if (!ifname_valid(optarg)) {
607 log_error("IPVLAN network interface name not valid: %s", optarg);
608 return -EINVAL;
609 }
610
4bbfe7ad
TG
611 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
612 return log_oom();
613
aa28aefe
LP
614 /* fall through */
615
ff01d048
LP
616 case ARG_PRIVATE_NETWORK:
617 arg_private_network = true;
f757855e 618 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
619 break;
620
0f0dbc46 621 case 'b':
7732f92b
LP
622 if (arg_start_mode == START_PID2) {
623 log_error("--boot and --as-pid2 may not be combined.");
624 return -EINVAL;
625 }
626
627 arg_start_mode = START_BOOT;
628 arg_settings_mask |= SETTING_START_MODE;
629 break;
630
631 case 'a':
632 if (arg_start_mode == START_BOOT) {
633 log_error("--boot and --as-pid2 may not be combined.");
634 return -EINVAL;
635 }
636
637 arg_start_mode = START_PID2;
638 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
639 break;
640
144f0fc0 641 case ARG_UUID:
9444b1f2 642 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
643 if (r < 0)
644 return log_error_errno(r, "Invalid UUID: %s", optarg);
645
646 if (sd_id128_is_null(arg_uuid)) {
647 log_error("Machine UUID may not be all zeroes.");
648 return -EINVAL;
aa96c6cb 649 }
f757855e
LP
650
651 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 652 break;
aa96c6cb 653
9444b1f2 654 case 'S':
c74e630d 655 arg_slice = optarg;
144f0fc0
LP
656 break;
657
7027ff61 658 case 'M':
c1521918 659 if (isempty(optarg))
97b11eed 660 arg_machine = mfree(arg_machine);
c1521918 661 else {
0c3c4284 662 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
663 log_error("Invalid machine name: %s", optarg);
664 return -EINVAL;
665 }
7027ff61 666
0c3c4284
LP
667 r = free_and_strdup(&arg_machine, optarg);
668 if (r < 0)
eb91eb18
LP
669 return log_oom();
670
671 break;
672 }
7027ff61 673
82adf6af
LP
674 case 'Z':
675 arg_selinux_context = optarg;
a8828ed9
DW
676 break;
677
82adf6af
LP
678 case 'L':
679 arg_selinux_apifs_context = optarg;
a8828ed9
DW
680 break;
681
bc2f673e
LP
682 case ARG_READ_ONLY:
683 arg_read_only = true;
f757855e 684 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
685 break;
686
420c7379
LP
687 case ARG_CAPABILITY:
688 case ARG_DROP_CAPABILITY: {
6cbe4ed1 689 p = optarg;
9ed794a3 690 for (;;) {
6cbe4ed1 691 _cleanup_free_ char *t = NULL;
5076f0cc 692
6cbe4ed1
SS
693 r = extract_first_word(&p, &t, ",", 0);
694 if (r < 0)
695 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 696
6cbe4ed1
SS
697 if (r == 0)
698 break;
5076f0cc 699
39ed67d1
LP
700 if (streq(t, "all")) {
701 if (c == ARG_CAPABILITY)
a42c8b54 702 plus = (uint64_t) -1;
39ed67d1 703 else
a42c8b54 704 minus = (uint64_t) -1;
39ed67d1 705 } else {
2822da4f
LP
706 int cap;
707
708 cap = capability_from_name(t);
709 if (cap < 0) {
39ed67d1
LP
710 log_error("Failed to parse capability %s.", t);
711 return -EINVAL;
712 }
713
714 if (c == ARG_CAPABILITY)
a42c8b54 715 plus |= 1ULL << (uint64_t) cap;
39ed67d1 716 else
a42c8b54 717 minus |= 1ULL << (uint64_t) cap;
5076f0cc 718 }
5076f0cc
LP
719 }
720
f757855e 721 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
722 break;
723 }
724
57fb9fb5
LP
725 case 'j':
726 arg_link_journal = LINK_GUEST;
574edc90 727 arg_link_journal_try = true;
57fb9fb5
LP
728 break;
729
730 case ARG_LINK_JOURNAL:
53e438e3 731 if (streq(optarg, "auto")) {
57fb9fb5 732 arg_link_journal = LINK_AUTO;
53e438e3
LP
733 arg_link_journal_try = false;
734 } else if (streq(optarg, "no")) {
57fb9fb5 735 arg_link_journal = LINK_NO;
53e438e3
LP
736 arg_link_journal_try = false;
737 } else if (streq(optarg, "guest")) {
57fb9fb5 738 arg_link_journal = LINK_GUEST;
53e438e3
LP
739 arg_link_journal_try = false;
740 } else if (streq(optarg, "host")) {
57fb9fb5 741 arg_link_journal = LINK_HOST;
53e438e3
LP
742 arg_link_journal_try = false;
743 } else if (streq(optarg, "try-guest")) {
574edc90
MP
744 arg_link_journal = LINK_GUEST;
745 arg_link_journal_try = true;
746 } else if (streq(optarg, "try-host")) {
747 arg_link_journal = LINK_HOST;
748 arg_link_journal_try = true;
749 } else {
57fb9fb5
LP
750 log_error("Failed to parse link journal mode %s", optarg);
751 return -EINVAL;
752 }
753
754 break;
755
17fe0523 756 case ARG_BIND:
f757855e
LP
757 case ARG_BIND_RO:
758 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
759 if (r < 0)
760 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 761
f757855e 762 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 763 break;
06c17c39 764
f757855e
LP
765 case ARG_TMPFS:
766 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
767 if (r < 0)
768 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 769
f757855e 770 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 771 break;
5a8af538
LP
772
773 case ARG_OVERLAY:
ad85779a
LP
774 case ARG_OVERLAY_RO:
775 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
776 if (r == -EADDRNOTAVAIL)
777 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
778 if (r < 0)
779 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 780
f757855e 781 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 782 break;
06c17c39 783
a5f1cb3b 784 case 'E': {
f4889f65
LP
785 char **n;
786
787 if (!env_assignment_is_valid(optarg)) {
788 log_error("Environment variable assignment '%s' is not valid.", optarg);
789 return -EINVAL;
790 }
791
792 n = strv_env_set(arg_setenv, optarg);
793 if (!n)
794 return log_oom();
795
796 strv_free(arg_setenv);
797 arg_setenv = n;
f757855e
LP
798
799 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
800 break;
801 }
802
284c0b91
LP
803 case 'q':
804 arg_quiet = true;
805 break;
806
8a96d94e 807 case ARG_SHARE_SYSTEM:
a6b5216c 808 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
809 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
810 arg_clone_ns_flags = 0;
8a96d94e
LP
811 break;
812
eb91eb18
LP
813 case ARG_REGISTER:
814 r = parse_boolean(optarg);
815 if (r < 0) {
816 log_error("Failed to parse --register= argument: %s", optarg);
817 return r;
818 }
819
820 arg_register = r;
821 break;
822
89f7c846
LP
823 case ARG_KEEP_UNIT:
824 arg_keep_unit = true;
825 break;
826
6afc95b7
LP
827 case ARG_PERSONALITY:
828
ac45f971 829 arg_personality = personality_from_string(optarg);
050f7277 830 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
831 log_error("Unknown or unsupported personality '%s'.", optarg);
832 return -EINVAL;
833 }
834
f757855e 835 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
836 break;
837
4d9f07b4
LP
838 case ARG_VOLATILE:
839
840 if (!optarg)
f757855e 841 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 842 else {
f757855e 843 VolatileMode m;
4d9f07b4 844
f757855e
LP
845 m = volatile_mode_from_string(optarg);
846 if (m < 0) {
847 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 848 return -EINVAL;
f757855e
LP
849 } else
850 arg_volatile_mode = m;
6d0b55c2
LP
851 }
852
f757855e
LP
853 arg_settings_mask |= SETTING_VOLATILE_MODE;
854 break;
6d0b55c2 855
f757855e
LP
856 case 'p':
857 r = expose_port_parse(&arg_expose_ports, optarg);
858 if (r == -EEXIST)
859 return log_error_errno(r, "Duplicate port specification: %s", optarg);
860 if (r < 0)
861 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 862
f757855e 863 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 864 break;
6d0b55c2 865
f36933fe
LP
866 case ARG_PROPERTY:
867 if (strv_extend(&arg_property, optarg) < 0)
868 return log_oom();
869
870 break;
871
ae209204
ZJS
872 case ARG_PRIVATE_USERS: {
873 int boolean = -1;
0de7acce 874
ae209204
ZJS
875 if (!optarg)
876 boolean = true;
877 else if (!in_charset(optarg, DIGITS))
878 /* do *not* parse numbers as booleans */
879 boolean = parse_boolean(optarg);
880
881 if (boolean == false) {
0de7acce
LP
882 /* no: User namespacing off */
883 arg_userns_mode = USER_NAMESPACE_NO;
884 arg_uid_shift = UID_INVALID;
885 arg_uid_range = UINT32_C(0x10000);
ae209204 886 } else if (boolean == true) {
0de7acce
LP
887 /* yes: User namespacing on, UID range is read from root dir */
888 arg_userns_mode = USER_NAMESPACE_FIXED;
889 arg_uid_shift = UID_INVALID;
890 arg_uid_range = UINT32_C(0x10000);
891 } else if (streq(optarg, "pick")) {
892 /* pick: User namespacing on, UID range is picked randomly */
893 arg_userns_mode = USER_NAMESPACE_PICK;
894 arg_uid_shift = UID_INVALID;
895 arg_uid_range = UINT32_C(0x10000);
896 } else {
6c2058b3 897 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
898 const char *range, *shift;
899
0de7acce
LP
900 /* anything else: User namespacing on, UID range is explicitly configured */
901
6dac160c
LP
902 range = strchr(optarg, ':');
903 if (range) {
6c2058b3
ZJS
904 buffer = strndup(optarg, range - optarg);
905 if (!buffer)
906 return log_oom();
907 shift = buffer;
6dac160c
LP
908
909 range++;
bfd292ec
ZJS
910 r = safe_atou32(range, &arg_uid_range);
911 if (r < 0)
be715731 912 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
913 } else
914 shift = optarg;
915
be715731
ZJS
916 r = parse_uid(shift, &arg_uid_shift);
917 if (r < 0)
918 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
919
920 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
921 }
922
be715731
ZJS
923 if (arg_uid_range <= 0) {
924 log_error("UID range cannot be 0.");
925 return -EINVAL;
926 }
927
0de7acce 928 arg_settings_mask |= SETTING_USERNS;
6dac160c 929 break;
ae209204 930 }
6dac160c 931
0de7acce 932 case 'U':
ccabee0d
LP
933 if (userns_supported()) {
934 arg_userns_mode = USER_NAMESPACE_PICK;
935 arg_uid_shift = UID_INVALID;
936 arg_uid_range = UINT32_C(0x10000);
937
938 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
939 }
940
7336138e
LP
941 break;
942
0de7acce 943 case ARG_PRIVATE_USERS_CHOWN:
19aac838 944 arg_userns_chown = true;
0de7acce
LP
945
946 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
947 break;
948
c6c8f6e2
LP
949 case ARG_KILL_SIGNAL:
950 arg_kill_signal = signal_from_string_try_harder(optarg);
951 if (arg_kill_signal < 0) {
952 log_error("Cannot parse signal: %s", optarg);
953 return -EINVAL;
954 }
955
f757855e
LP
956 arg_settings_mask |= SETTING_KILL_SIGNAL;
957 break;
958
959 case ARG_SETTINGS:
960
961 /* no → do not read files
962 * yes → read files, do not override cmdline, trust only subset
963 * override → read files, override cmdline, trust only subset
964 * trusted → read files, do not override cmdline, trust all
965 */
966
967 r = parse_boolean(optarg);
968 if (r < 0) {
969 if (streq(optarg, "trusted")) {
970 mask_all_settings = false;
971 mask_no_settings = false;
972 arg_settings_trusted = true;
973
974 } else if (streq(optarg, "override")) {
975 mask_all_settings = false;
976 mask_no_settings = true;
977 arg_settings_trusted = -1;
978 } else
979 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
980 } else if (r > 0) {
981 /* yes */
982 mask_all_settings = false;
983 mask_no_settings = false;
984 arg_settings_trusted = -1;
985 } else {
986 /* no */
987 mask_all_settings = true;
988 mask_no_settings = false;
989 arg_settings_trusted = false;
990 }
991
c6c8f6e2
LP
992 break;
993
5f932eb9
LP
994 case ARG_CHDIR:
995 if (!path_is_absolute(optarg)) {
996 log_error("Working directory %s is not an absolute path.", optarg);
997 return -EINVAL;
998 }
999
1000 r = free_and_strdup(&arg_chdir, optarg);
1001 if (r < 0)
1002 return log_oom();
1003
1004 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1005 break;
1006
9c1e04d0
AP
1007 case ARG_NOTIFY_READY:
1008 r = parse_boolean(optarg);
1009 if (r < 0) {
1010 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1011 return -EINVAL;
1012 }
1013 arg_notify_ready = r;
1014 arg_settings_mask |= SETTING_NOTIFY_READY;
1015 break;
1016
88213476
LP
1017 case '?':
1018 return -EINVAL;
1019
1020 default:
eb9da376 1021 assert_not_reached("Unhandled option");
88213476 1022 }
88213476 1023
0c582db0
LB
1024 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1025 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1026 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1027 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1028
4f086aab
SU
1029 if (arg_userns_mode != USER_NAMESPACE_NO)
1030 arg_mount_settings |= MOUNT_USE_USERNS;
1031
1032 if (arg_private_network)
1033 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1034
1035 parse_mount_settings_env();
1036
48a8d337
LB
1037 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1038 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1039 arg_register = false;
0c582db0
LB
1040 if (arg_start_mode != START_PID1) {
1041 log_error("--boot cannot be used without namespacing.");
1042 return -EINVAL;
1043 }
1044 }
eb91eb18 1045
0de7acce 1046 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1047 arg_userns_chown = true;
1048
89f7c846
LP
1049 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1050 log_error("--keep-unit may not be used when invoked from a user session.");
1051 return -EINVAL;
1052 }
1053
1b9e5b12
LP
1054 if (arg_directory && arg_image) {
1055 log_error("--directory= and --image= may not be combined.");
1056 return -EINVAL;
1057 }
1058
ec16945e
LP
1059 if (arg_template && arg_image) {
1060 log_error("--template= and --image= may not be combined.");
1061 return -EINVAL;
1062 }
1063
8cd328d8
LP
1064 if (arg_ephemeral && arg_template && !arg_directory) {
1065 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1066 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1067 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1068 * --directory=". */
1069
1070 arg_directory = arg_template;
1071 arg_template = NULL;
1072 }
1073
ec16945e
LP
1074 if (arg_template && !(arg_directory || arg_machine)) {
1075 log_error("--template= needs --directory= or --machine=.");
1076 return -EINVAL;
1077 }
1078
1079 if (arg_ephemeral && arg_template) {
1080 log_error("--ephemeral and --template= may not be combined.");
1081 return -EINVAL;
1082 }
1083
df9a75e4
LP
1084 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1085 log_error("--ephemeral and --link-journal= may not be combined.");
1086 return -EINVAL;
1087 }
1088
ccabee0d 1089 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1090 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1091 return -EOPNOTSUPP;
1092 }
1093
1094 if (arg_userns_chown && arg_read_only) {
1095 log_error("--read-only and --private-users-chown may not be combined.");
1096 return -EINVAL;
1097 }
f757855e 1098
22b28dfd
LP
1099 if (arg_network_bridge && arg_network_zone) {
1100 log_error("--network-bridge= and --network-zone= may not be combined.");
1101 return -EINVAL;
1102 }
1103
f757855e
LP
1104 if (argc > optind) {
1105 arg_parameters = strv_copy(argv + optind);
1106 if (!arg_parameters)
1107 return log_oom();
1108
7732f92b 1109 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1110 }
1111
1112 /* Load all settings from .nspawn files */
1113 if (mask_no_settings)
1114 arg_settings_mask = 0;
1115
1116 /* Don't load any settings from .nspawn files */
1117 if (mask_all_settings)
1118 arg_settings_mask = _SETTINGS_MASK_ALL;
1119
520e0d54 1120 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1121
6aadfa4c
ILG
1122 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1123 if (e)
1124 arg_container_service_name = e;
1125
5a8ff0e6
CB
1126 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1127 if (r < 0)
1128 arg_use_cgns = cg_ns_supported();
1129 else
1130 arg_use_cgns = r;
1131
86c0dd4a
LP
1132 r = custom_mount_check_all();
1133 if (r < 0)
1134 return r;
1135
f757855e
LP
1136 return 1;
1137}
1138
1139static int verify_arguments(void) {
4f086aab
SU
1140 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1141 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1142 return -EINVAL;
1143 }
1144
1145 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1146 log_error("Cannot combine --private-users with read-write mounts.");
1147 return -EINVAL;
1148 }
f757855e
LP
1149
1150 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1151 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1152 return -EINVAL;
1153 }
1154
6d0b55c2
LP
1155 if (arg_expose_ports && !arg_private_network) {
1156 log_error("Cannot use --port= without private networking.");
1157 return -EINVAL;
1158 }
1159
1c1ea217
EV
1160#ifndef HAVE_LIBIPTC
1161 if (arg_expose_ports) {
1162 log_error("--port= is not supported, compiled without libiptc support.");
1163 return -EOPNOTSUPP;
1164 }
1165#endif
1166
7732f92b 1167 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1168 arg_kill_signal = SIGRTMIN+3;
1169
f757855e 1170 return 0;
88213476
LP
1171}
1172
03cfe0d5
LP
1173static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1174 assert(p);
1175
0de7acce 1176 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1177 return 0;
1178
1179 if (uid == UID_INVALID && gid == GID_INVALID)
1180 return 0;
1181
1182 if (uid != UID_INVALID) {
1183 uid += arg_uid_shift;
1184
1185 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1186 return -EOVERFLOW;
1187 }
1188
1189 if (gid != GID_INVALID) {
1190 gid += (gid_t) arg_uid_shift;
1191
1192 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1193 return -EOVERFLOW;
1194 }
1195
1196 if (lchown(p, uid, gid) < 0)
1197 return -errno;
b12afc8c
LP
1198
1199 return 0;
1200}
1201
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode, and on successful creation hands it
 * to userns_lchown() with the specified uid/gid.
 *
 * A directory that already exists is left alone (no chown) and reported as success. Returns 0 on
 * success, -errno if mkdir() fails for any other reason, or whatever userns_lchown() returns. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1214
e58a1277 1215static int setup_timezone(const char *dest) {
03cfe0d5
LP
1216 _cleanup_free_ char *p = NULL, *q = NULL;
1217 const char *where, *check, *what;
d4036145
LP
1218 char *z, *y;
1219 int r;
f8440af5 1220
e58a1277
LP
1221 assert(dest);
1222
1223 /* Fix the timezone, if possible */
d4036145
LP
1224 r = readlink_malloc("/etc/localtime", &p);
1225 if (r < 0) {
0b493a02
MP
1226 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1227 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1228 * with a symbolic link to a time zone data file.
0b493a02
MP
1229 *
1230 * Example:
21dc0227 1231 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1232 */
d4036145
LP
1233 return 0;
1234 }
1235
1236 z = path_startswith(p, "../usr/share/zoneinfo/");
1237 if (!z)
1238 z = path_startswith(p, "/usr/share/zoneinfo/");
1239 if (!z) {
1240 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1241 return 0;
1242 }
1243
03cfe0d5 1244 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1245 r = readlink_malloc(where, &q);
1246 if (r >= 0) {
1247 y = path_startswith(q, "../usr/share/zoneinfo/");
1248 if (!y)
1249 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1250
d4036145
LP
1251 /* Already pointing to the right place? Then do nothing .. */
1252 if (y && streq(y, z))
1253 return 0;
1254 }
1255
03cfe0d5 1256 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1257 check = prefix_roota(dest, check);
03cfe0d5 1258 if (laccess(check, F_OK) < 0) {
d4036145
LP
1259 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1260 return 0;
1261 }
68fb0892 1262
79d80fc1
TG
1263 r = unlink(where);
1264 if (r < 0 && errno != ENOENT) {
56f64d95 1265 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1266 return 0;
1267 }
4d9f07b4 1268
03cfe0d5 1269 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1270 if (symlink(what, where) < 0) {
56f64d95 1271 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1272 return 0;
1273 }
e58a1277 1274
03cfe0d5
LP
1275 r = userns_lchown(where, 0, 0);
1276 if (r < 0)
1277 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1278
e58a1277 1279 return 0;
88213476
LP
1280}
1281
2547bb41 1282static int setup_resolv_conf(const char *dest) {
03cfe0d5 1283 const char *where = NULL;
79d80fc1 1284 int r;
2547bb41
LP
1285
1286 assert(dest);
1287
1288 if (arg_private_network)
1289 return 0;
1290
1291 /* Fix resolv.conf, if possible */
03cfe0d5 1292 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1293
7debb05d
CH
1294 if (access("/run/systemd/resolve/resolv.conf", F_OK) >= 0 &&
1295 access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
3539724c
LP
1296 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1297 * container, so that the container can use the host's resolver. Given that network namespacing is
1298 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1299 * advantage that the container will be able to follow the host's DNS server configuration changes
1300 * transparently. */
1301
60e76d48
ZJS
1302 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1303 if (r >= 0)
1304 return mount_verbose(LOG_ERR, NULL, where, NULL,
1305 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1306 }
1307
1308 /* If that didn't work, let's copy the file */
f2068bcc 1309 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1310 if (r < 0) {
3539724c
LP
1311 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1312 * resolved or something similar runs inside and the symlink points there.
68a313c5 1313 *
3539724c 1314 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1315 */
1316 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1317 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1318 return 0;
1319 }
2547bb41 1320
03cfe0d5
LP
1321 r = userns_lchown(where, 0, 0);
1322 if (r < 0)
3539724c 1323 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1324
2547bb41
LP
1325 return 0;
1326}
1327
04bc4a3f 1328static int setup_boot_id(const char *dest) {
3bbaff3e 1329 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1330 const char *from, *to;
04bc4a3f
LP
1331 int r;
1332
04bc4a3f
LP
1333 /* Generate a new randomized boot ID, so that each boot-up of
1334 * the container gets a new one */
1335
03cfe0d5
LP
1336 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1337 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1338
1339 r = sd_id128_randomize(&rnd);
f647962d
MS
1340 if (r < 0)
1341 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1342
15b1248a 1343 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1344 if (r < 0)
1345 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1346
60e76d48
ZJS
1347 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1348 if (r >= 0)
1349 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1350 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1351
3bbaff3e 1352 (void) unlink(from);
04bc4a3f
LP
1353 return r;
1354}
1355
e58a1277 1356static int copy_devnodes(const char *dest) {
88213476
LP
1357
1358 static const char devnodes[] =
1359 "null\0"
1360 "zero\0"
1361 "full\0"
1362 "random\0"
1363 "urandom\0"
85614d66
TG
1364 "tty\0"
1365 "net/tun\0";
88213476
LP
1366
1367 const char *d;
e58a1277 1368 int r = 0;
7fd1b19b 1369 _cleanup_umask_ mode_t u;
a258bf26
LP
1370
1371 assert(dest);
124640f1
LP
1372
1373 u = umask(0000);
88213476 1374
03cfe0d5
LP
1375 /* Create /dev/net, so that we can create /dev/net/tun in it */
1376 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1377 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1378
88213476 1379 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1380 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1381 struct stat st;
88213476 1382
7f112f50 1383 from = strappend("/dev/", d);
03cfe0d5 1384 to = prefix_root(dest, from);
88213476
LP
1385
1386 if (stat(from, &st) < 0) {
1387
4a62c710
MS
1388 if (errno != ENOENT)
1389 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1390
a258bf26 1391 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1392
03cfe0d5 1393 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1394 return -EIO;
a258bf26 1395
85614d66 1396 } else {
81f5049b 1397 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1398 /*
1399 * This is some sort of protection too against
1400 * recursive userns chown on shared /dev/
1401 */
1402 if (errno == EEXIST)
1403 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1404 if (errno != EPERM)
1405 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1406
1407 /* Some systems abusively restrict mknod but
1408 * allow bind mounts. */
1409 r = touch(to);
1410 if (r < 0)
1411 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1412 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1413 if (r < 0)
1414 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1415 }
6278cf60 1416
03cfe0d5
LP
1417 r = userns_lchown(to, 0, 0);
1418 if (r < 0)
1419 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1420 }
88213476
LP
1421 }
1422
e58a1277
LP
1423 return r;
1424}
88213476 1425
03cfe0d5
LP
1426static int setup_pts(const char *dest) {
1427 _cleanup_free_ char *options = NULL;
1428 const char *p;
709f6e46 1429 int r;
03cfe0d5
LP
1430
1431#ifdef HAVE_SELINUX
1432 if (arg_selinux_apifs_context)
1433 (void) asprintf(&options,
3dce8915 1434 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1435 arg_uid_shift + TTY_GID,
1436 arg_selinux_apifs_context);
1437 else
1438#endif
1439 (void) asprintf(&options,
3dce8915 1440 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1441 arg_uid_shift + TTY_GID);
f2d88580 1442
03cfe0d5 1443 if (!options)
f2d88580
LP
1444 return log_oom();
1445
03cfe0d5 1446 /* Mount /dev/pts itself */
cc9fce65 1447 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1448 if (mkdir(p, 0755) < 0)
1449 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1450 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1451 if (r < 0)
1452 return r;
709f6e46
MS
1453 r = userns_lchown(p, 0, 0);
1454 if (r < 0)
1455 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1456
1457 /* Create /dev/ptmx symlink */
1458 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1459 if (symlink("pts/ptmx", p) < 0)
1460 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1461 r = userns_lchown(p, 0, 0);
1462 if (r < 0)
1463 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1464
03cfe0d5
LP
1465 /* And fix /dev/pts/ptmx ownership */
1466 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1467 r = userns_lchown(p, 0, 0);
1468 if (r < 0)
1469 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1470
f2d88580
LP
1471 return 0;
1472}
1473
e58a1277 1474static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1475 _cleanup_umask_ mode_t u;
1476 const char *to;
e58a1277 1477 int r;
e58a1277
LP
1478
1479 assert(dest);
1480 assert(console);
1481
1482 u = umask(0000);
1483
03cfe0d5 1484 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1485 if (r < 0)
1486 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1487
a258bf26
LP
1488 /* We need to bind mount the right tty to /dev/console since
1489 * ptys can only exist on pts file systems. To have something
81f5049b 1490 * to bind mount things on we create a empty regular file. */
a258bf26 1491
03cfe0d5 1492 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1493 r = touch(to);
1494 if (r < 0)
1495 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1496
60e76d48 1497 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1498}
1499
1500static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1501 const char *from, *to;
7fd1b19b 1502 _cleanup_umask_ mode_t u;
d9603714 1503 int fd, r;
e58a1277 1504
e58a1277 1505 assert(kmsg_socket >= 0);
a258bf26 1506
e58a1277 1507 u = umask(0000);
a258bf26 1508
03cfe0d5 1509 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1510 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1511 * on the reading side behave very similar to /proc/kmsg,
1512 * their writing side behaves differently from /dev/kmsg in
1513 * that writing blocks when nothing is reading. In order to
1514 * avoid any problems with containers deadlocking due to this
1515 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1516 from = prefix_roota(dest, "/run/kmsg");
1517 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1518
4a62c710 1519 if (mkfifo(from, 0600) < 0)
03cfe0d5 1520 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1521 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1522 if (r < 0)
1523 return r;
e58a1277
LP
1524
1525 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1526 if (fd < 0)
1527 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1528
e58a1277
LP
1529 /* Store away the fd in the socket, so that it stays open as
1530 * long as we run the child */
3ee897d6 1531 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1532 safe_close(fd);
e58a1277 1533
d9603714
DH
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1536
03cfe0d5
LP
1537 /* And now make the FIFO unavailable as /run/kmsg... */
1538 (void) unlink(from);
1539
25ea79fe 1540 return 0;
88213476
LP
1541}
1542
1c4baffc 1543static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1544 union in_addr_union *exposed = userdata;
1545
1546 assert(rtnl);
1547 assert(m);
1548 assert(exposed);
1549
7a8f6325 1550 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1551 return 0;
1552}
1553
3a74cea5 1554static int setup_hostname(void) {
3a74cea5 1555
0c582db0 1556 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1557 return 0;
1558
605f81a8 1559 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1560 return -errno;
3a74cea5 1561
7027ff61 1562 return 0;
3a74cea5
LP
1563}
1564
57fb9fb5 1565static int setup_journal(const char *directory) {
e01ff70a 1566 sd_id128_t this_id;
0f5e1382 1567 _cleanup_free_ char *d = NULL;
e01ff70a 1568 const char *p, *q;
8054d749 1569 bool try;
e01ff70a 1570 char id[33];
57fb9fb5
LP
1571 int r;
1572
df9a75e4
LP
1573 /* Don't link journals in ephemeral mode */
1574 if (arg_ephemeral)
1575 return 0;
1576
8054d749
LP
1577 if (arg_link_journal == LINK_NO)
1578 return 0;
1579
1580 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1581
4d680aee 1582 r = sd_id128_get_machine(&this_id);
f647962d
MS
1583 if (r < 0)
1584 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1585
e01ff70a 1586 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1587 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1588 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1589 if (try)
4d680aee 1590 return 0;
df9a75e4 1591 return -EEXIST;
4d680aee
ZJS
1592 }
1593
03cfe0d5
LP
1594 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1595 if (r < 0)
1596 return log_error_errno(r, "Failed to create /var: %m");
1597
1598 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1599 if (r < 0)
1600 return log_error_errno(r, "Failed to create /var/log: %m");
1601
1602 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1603 if (r < 0)
1604 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1605
e01ff70a
MS
1606 (void) sd_id128_to_string(arg_uuid, id);
1607
03cfe0d5
LP
1608 p = strjoina("/var/log/journal/", id);
1609 q = prefix_roota(directory, p);
27407a01 1610
e1873695 1611 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1612 if (try)
1613 return 0;
27407a01 1614
8054d749
LP
1615 log_error("%s: already a mount point, refusing to use for journal", p);
1616 return -EEXIST;
57fb9fb5
LP
1617 }
1618
e1873695 1619 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1620 if (try)
1621 return 0;
57fb9fb5 1622
8054d749
LP
1623 log_error("%s: already a mount point, refusing to use for journal", q);
1624 return -EEXIST;
57fb9fb5
LP
1625 }
1626
1627 r = readlink_and_make_absolute(p, &d);
1628 if (r >= 0) {
1629 if ((arg_link_journal == LINK_GUEST ||
1630 arg_link_journal == LINK_AUTO) &&
1631 path_equal(d, q)) {
1632
03cfe0d5 1633 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1634 if (r < 0)
709f6e46 1635 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1636 return 0;
57fb9fb5
LP
1637 }
1638
4a62c710
MS
1639 if (unlink(p) < 0)
1640 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1641 } else if (r == -EINVAL) {
1642
1643 if (arg_link_journal == LINK_GUEST &&
1644 rmdir(p) < 0) {
1645
27407a01
ZJS
1646 if (errno == ENOTDIR) {
1647 log_error("%s already exists and is neither a symlink nor a directory", p);
1648 return r;
4314d33f
MS
1649 } else
1650 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1651 }
4314d33f
MS
1652 } else if (r != -ENOENT)
1653 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1654
1655 if (arg_link_journal == LINK_GUEST) {
1656
1657 if (symlink(q, p) < 0) {
8054d749 1658 if (try) {
56f64d95 1659 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1660 return 0;
4314d33f
MS
1661 } else
1662 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1663 }
1664
03cfe0d5 1665 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1666 if (r < 0)
709f6e46 1667 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1668 return 0;
57fb9fb5
LP
1669 }
1670
1671 if (arg_link_journal == LINK_HOST) {
ccddd104 1672 /* don't create parents here — if the host doesn't have
574edc90 1673 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1674
1675 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1676 if (try) {
56f64d95 1677 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1678 return 0;
4314d33f
MS
1679 } else
1680 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1681 }
1682
27407a01
ZJS
1683 } else if (access(p, F_OK) < 0)
1684 return 0;
57fb9fb5 1685
cdb2b9d0
LP
1686 if (dir_is_empty(q) == 0)
1687 log_warning("%s is not empty, proceeding anyway.", q);
1688
03cfe0d5 1689 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1690 if (r < 0)
1691 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1692
60e76d48
ZJS
1693 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1694 if (r < 0)
4a62c710 1695 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1696
27407a01 1697 return 0;
57fb9fb5
LP
1698}
1699
88213476 1700static int drop_capabilities(void) {
520e0d54 1701 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1702}
1703
db999e0f
LP
1704static int reset_audit_loginuid(void) {
1705 _cleanup_free_ char *p = NULL;
1706 int r;
1707
0c582db0 1708 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1709 return 0;
1710
1711 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1712 if (r == -ENOENT)
db999e0f 1713 return 0;
f647962d
MS
1714 if (r < 0)
1715 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1716
1717 /* Already reset? */
1718 if (streq(p, "4294967295"))
1719 return 0;
1720
ad118bda 1721 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1722 if (r < 0) {
10a87006
LP
1723 log_error_errno(r,
1724 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1725 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1726 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1727 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1728 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1729
db999e0f 1730 sleep(5);
77b6e194 1731 }
db999e0f
LP
1732
1733 return 0;
77b6e194
LP
1734}
1735
24fb1112 1736
785890ac
LP
1737static int setup_propagate(const char *root) {
1738 const char *p, *q;
709f6e46 1739 int r;
785890ac
LP
1740
1741 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1742 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1743 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1744 (void) mkdir_p(p, 0600);
1745
709f6e46
MS
1746 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1747 if (r < 0)
1748 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1749
709f6e46
MS
1750 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1753
709f6e46
MS
1754 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1755 if (r < 0)
1756 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1757
03cfe0d5 1758 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1759 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1760 if (r < 0)
1761 return r;
785890ac 1762
60e76d48
ZJS
1763 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1764 if (r < 0)
1765 return r;
785890ac 1766
19caffac
AC
1767 /* machined will MS_MOVE into that directory, and that's only
1768 * supported for non-shared mounts. */
60e76d48 1769 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1770}
1771
1b9e5b12
LP
1772static int setup_image(char **device_path, int *loop_nr) {
1773 struct loop_info64 info = {
1774 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1775 };
1776 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1777 _cleanup_free_ char* loopdev = NULL;
1778 struct stat st;
1779 int r, nr;
1780
1781 assert(device_path);
1782 assert(loop_nr);
ec16945e 1783 assert(arg_image);
1b9e5b12
LP
1784
1785 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1786 if (fd < 0)
1787 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1788
4a62c710
MS
1789 if (fstat(fd, &st) < 0)
1790 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1791
1792 if (S_ISBLK(st.st_mode)) {
1793 char *p;
1794
1795 p = strdup(arg_image);
1796 if (!p)
1797 return log_oom();
1798
1799 *device_path = p;
1800
1801 *loop_nr = -1;
1802
1803 r = fd;
1804 fd = -1;
1805
1806 return r;
1807 }
1808
1809 if (!S_ISREG(st.st_mode)) {
070edd97 1810 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1811 return -EINVAL;
1812 }
1813
1814 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1815 if (control < 0)
1816 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1817
1818 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1819 if (nr < 0)
1820 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1821
1822 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1823 return log_oom();
1824
1825 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1826 if (loop < 0)
1827 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1828
4a62c710
MS
1829 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1830 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1831
1832 if (arg_read_only)
1833 info.lo_flags |= LO_FLAGS_READ_ONLY;
1834
4a62c710
MS
1835 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1836 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1837
1838 *device_path = loopdev;
1839 loopdev = NULL;
1840
1841 *loop_nr = nr;
1842
1843 r = loop;
1844 loop = -1;
1845
1846 return r;
1847}
1848
ada4799a
LP
/* Explanatory trailer appended to the partition table related error messages of dissect_image(). */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Probes the partition table of the block device open as 'fd' and picks the partitions to mount.
 *
 * For GPT tables partitions are classified by type UUID (home, srv, ESP, native root, secondary
 * root, generic Linux); partitions carrying GPT_FLAG_NO_AUTO are ignored, and if several partitions
 * of the same specific type exist the one with the lowest partition number wins. For MBR tables
 * only partitions of type 0x83 whose flags equal 0x80 (bootable) are considered, and they are
 * treated as "generic" candidates.
 *
 * The root device is chosen in this order: native root, secondary root, generic. A generic
 * partition is only accepted if it is the only one.
 *
 * On success returns 0, and stores newly allocated device node paths in *root_device and, where
 * found, *home_device, *srv_device, *esp_device (the output for anything not found is left
 * untouched). The matching *_rw booleans reflect the GPT read-only flag. *secondary is set to true
 * if the secondary root partition was selected. Returns a negative errno-style value on failure,
 * -EOPNOTSUPP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                char **esp_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(esp_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Pass errno to the logging call directly, instead of reading it back after
                 * logging, when it might have been overwritten already. */
                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel lists as many partition devices as blkid found in the table. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The comparison is against m + 1 since
                 * the enumeration matched on the parent also contains the whole-disk device itself
                 * (it is skipped again further down by comparing against st.st_rdev). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates, the lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        } else if (sd_id128_equal(type_id, GPT_ESP)) {

                                if (esp && nr >= esp_nr)
                                        continue;

                                esp_nr = nr;

                                r = free_and_strdup(&esp, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Track MBR candidates as "generic" partitions, so that a second bootable
                         * Linux partition is detected and refused below, instead of silently
                         * replacing the first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        if (esp) {
                *esp_device = esp;
                esp = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2242
/* Probes the file system type of block device 'what' and mounts it on 'where' (or on 'where' with
 * 'directory' appended, if 'directory' is non-NULL).
 *
 * The mount is done with MS_NODEV, and read-only if 'rw' is false or the global arg_read_only is
 * set. For btrfs, ext4, vfat and xfs on a /dev/loop* device the "discard" mount option is passed.
 *
 * Returns the result of mount_verbose() on success paths, or a negative errno-style value:
 * -EINVAL if no (unambiguous) file system was detected, -EOPNOTSUPP for LUKS or when compiled
 * without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        /* 'options' must start out as NULL: it is only assigned in the loopback/discard case below,
         * but is passed to mount_verbose() unconditionally. */
        const char *fstype = NULL, *p, *options = NULL;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* Same interpretation as in dissect_image(): -2 is an ambiguous result, 1 means nothing was
         * detected; everything else that is non-zero is a genuine probing error. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                /* Hand errno to the logging call, rather than reading it back afterwards */
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
         * sparse when possible. */
        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
                const char *l;

                l = path_startswith(what, "/dev");
                if (l && startswith(l, "loop"))
                        options = "discard";
        }

        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2311
317feb4d 2312static int setup_machine_id(const char *directory) {
691675ba
LP
2313 const char *etc_machine_id;
2314 sd_id128_t id;
3bbaff3e 2315 int r;
e01ff70a 2316
317feb4d
LP
2317 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2318 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2319 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2320 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2321 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2322 * container behaves nicely). */
2323
e01ff70a
MS
2324 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2325
691675ba 2326 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2327 if (r < 0) {
2328 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2329 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2330
317feb4d
LP
2331 if (sd_id128_is_null(arg_uuid)) {
2332 r = sd_id128_randomize(&arg_uuid);
2333 if (r < 0)
2334 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2335 }
2336 } else {
2337 if (sd_id128_is_null(id)) {
2338 log_error("Machine ID in container image is zero, refusing.");
2339 return -EINVAL;
2340 }
e01ff70a 2341
317feb4d
LP
2342 arg_uuid = id;
2343 }
691675ba 2344
e01ff70a
MS
2345 return 0;
2346}
2347
7336138e
LP
2348static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2349 int r;
2350
2351 assert(directory);
2352
0de7acce 2353 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2354 return 0;
2355
2356 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2357 if (r == -EOPNOTSUPP)
2358 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2359 if (r == -EBADE)
2360 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2361 if (r < 0)
2362 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2363 if (r == 0)
2364 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2365 else
2366 log_debug("Patched directory tree to match UID/GID range.");
2367
2368 return r;
2369}
2370
727fd4fd
LP
2371static int mount_devices(
2372 const char *where,
2373 const char *root_device, bool root_device_rw,
2374 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2375 const char *srv_device, bool srv_device_rw,
2376 const char *esp_device) {
1b9e5b12
LP
2377 int r;
2378
2379 assert(where);
2380
2381 if (root_device) {
727fd4fd 2382 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2383 if (r < 0)
2384 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2385 }
2386
2387 if (home_device) {
727fd4fd 2388 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2389 if (r < 0)
2390 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2391 }
2392
2393 if (srv_device) {
727fd4fd 2394 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2395 if (r < 0)
2396 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2397 }
2398
a6bc7db9
LP
2399 if (esp_device) {
2400 const char *mp, *x;
2401
2402 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2403
2404 mp = "/efi";
2405 x = strjoina(arg_directory, mp);
2406 r = dir_is_empty(x);
2407 if (r == -ENOENT) {
2408 mp = "/boot";
2409 x = strjoina(arg_directory, mp);
2410 r = dir_is_empty(x);
2411 }
2412
2413 if (r > 0) {
2414 r = mount_device(esp_device, arg_directory, mp, true);
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to mount ESP: %m");
2417 }
2418 }
2419
1b9e5b12
LP
2420 return 0;
2421}
2422
2423static void loop_remove(int nr, int *image_fd) {
2424 _cleanup_close_ int control = -1;
e8c8ddcc 2425 int r;
1b9e5b12
LP
2426
2427 if (nr < 0)
2428 return;
2429
2430 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2431 r = ioctl(*image_fd, LOOP_CLR_FD);
2432 if (r < 0)
5e4074aa 2433 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2434 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2435 }
2436
2437 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2438 if (control < 0) {
56f64d95 2439 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2440 return;
e8c8ddcc 2441 }
1b9e5b12 2442
e8c8ddcc
TG
2443 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2444 if (r < 0)
5e4074aa 2445 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2446}
2447
113cea80 2448/*
6d416b9c
LS
2449 * Return values:
2450 * < 0 : wait_for_terminate() failed to get the state of the
2451 * container, the container was terminated by a signal, or
2452 * failed for an unknown reason. No change is made to the
2453 * container argument.
2454 * > 0 : The program executed in the container terminated with an
2455 * error. The exit code of the program executed in the
919699ec
LP
2456 * container is returned. The container argument has been set
2457 * to CONTAINER_TERMINATED.
6d416b9c
LS
2458 * 0 : The container is being rebooted, has been shut down or exited
2459 * successfully. The container argument has been set to either
2460 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2461 *
6d416b9c
LS
2462 * That is, success is indicated by a return value of zero, and an
2463 * error is indicated by a non-zero value.
113cea80
DH
2464 */
2465static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2466 siginfo_t status;
919699ec 2467 int r;
113cea80
DH
2468
2469 r = wait_for_terminate(pid, &status);
f647962d
MS
2470 if (r < 0)
2471 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2472
2473 switch (status.si_code) {
fddbb89c 2474
113cea80 2475 case CLD_EXITED:
b5a2179b 2476 if (status.si_status == 0)
919699ec 2477 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2478 else
919699ec 2479 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2480
919699ec
LP
2481 *container = CONTAINER_TERMINATED;
2482 return status.si_status;
113cea80
DH
2483
2484 case CLD_KILLED:
2485 if (status.si_status == SIGINT) {
919699ec 2486 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2487 *container = CONTAINER_TERMINATED;
919699ec
LP
2488 return 0;
2489
113cea80 2490 } else if (status.si_status == SIGHUP) {
919699ec 2491 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2492 *container = CONTAINER_REBOOTED;
919699ec 2493 return 0;
113cea80 2494 }
919699ec 2495
113cea80
DH
2496 /* CLD_KILLED fallthrough */
2497
2498 case CLD_DUMPED:
fddbb89c 2499 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2500 return -EIO;
113cea80
DH
2501
2502 default:
fddbb89c 2503 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2504 return -EIO;
113cea80 2505 }
113cea80
DH
2506}
2507
023fb90b
LP
2508static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2509 pid_t pid;
2510
4a0b58c4 2511 pid = PTR_TO_PID(userdata);
023fb90b 2512 if (pid > 0) {
c6c8f6e2 2513 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2514 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2515 sd_event_source_set_userdata(s, NULL);
2516 return 0;
2517 }
2518 }
2519
2520 sd_event_exit(sd_event_source_get_event(s), 0);
2521 return 0;
2522}
2523
ec16945e 2524static int determine_names(void) {
1b9cebf6 2525 int r;
ec16945e 2526
c1521918
LP
2527 if (arg_template && !arg_directory && arg_machine) {
2528
2529 /* If --template= was specified then we should not
2530 * search for a machine, but instead create a new one
2531 * in /var/lib/machine. */
2532
605405c6 2533 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2534 if (!arg_directory)
2535 return log_oom();
2536 }
2537
ec16945e 2538 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2539 if (arg_machine) {
2540 _cleanup_(image_unrefp) Image *i = NULL;
2541
2542 r = image_find(arg_machine, &i);
2543 if (r < 0)
2544 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2545 if (r == 0) {
1b9cebf6
LP
2546 log_error("No image for machine '%s': %m", arg_machine);
2547 return -ENOENT;
2548 }
2549
aceac2f0 2550 if (i->type == IMAGE_RAW)
0f03c2a4 2551 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2552 else
0f03c2a4 2553 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2554 if (r < 0)
0f3be6ca 2555 return log_oom();
1b9cebf6 2556
aee327b8
LP
2557 if (!arg_ephemeral)
2558 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2559 } else
ec16945e
LP
2560 arg_directory = get_current_dir_name();
2561
0f3be6ca 2562 if (!arg_directory && !arg_image) {
1b9cebf6 2563 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2564 return -EINVAL;
2565 }
2566 }
2567
2568 if (!arg_machine) {
b9ba4dab
LP
2569 if (arg_directory && path_equal(arg_directory, "/"))
2570 arg_machine = gethostname_malloc();
2571 else
2572 arg_machine = strdup(basename(arg_image ?: arg_directory));
ec16945e
LP
2573 if (!arg_machine)
2574 return log_oom();
2575
ae691c1d 2576 hostname_cleanup(arg_machine);
ec16945e
LP
2577 if (!machine_name_is_valid(arg_machine)) {
2578 log_error("Failed to determine machine name automatically, please use -M.");
2579 return -EINVAL;
2580 }
b9ba4dab
LP
2581
2582 if (arg_ephemeral) {
2583 char *b;
2584
2585 /* Add a random suffix when this is an
2586 * ephemeral machine, so that we can run many
2587 * instances at once without manually having
2588 * to specify -M each time. */
2589
2590 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2591 return log_oom();
2592
2593 free(arg_machine);
2594 arg_machine = b;
2595 }
ec16945e
LP
2596 }
2597
2598 return 0;
2599}
2600
/* Resolves the path stored in *p with chase_symlinks() (passing 'flags' through) and replaces *p
 * with the result, freeing the old string. A NULL *p is left alone. Returns 0 on success, or a
 * negative errno-style value if resolution failed, in which case *p is unchanged. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2619
03cfe0d5 2620static int determine_uid_shift(const char *directory) {
6dac160c
LP
2621 int r;
2622
0de7acce 2623 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2624 arg_uid_shift = 0;
6dac160c 2625 return 0;
03cfe0d5 2626 }
6dac160c
LP
2627
2628 if (arg_uid_shift == UID_INVALID) {
2629 struct stat st;
2630
03cfe0d5 2631 r = stat(directory, &st);
6dac160c 2632 if (r < 0)
03cfe0d5 2633 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2634
2635 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2636
2637 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2638 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2639 return -EINVAL;
2640 }
2641
2642 arg_uid_range = UINT32_C(0x10000);
2643 }
2644
2645 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2646 log_error("UID base too high for UID range.");
2647 return -EINVAL;
2648 }
2649
6dac160c
LP
2650 return 0;
2651}
2652
03cfe0d5
LP
2653static int inner_child(
2654 Barrier *barrier,
2655 const char *directory,
2656 bool secondary,
2657 int kmsg_socket,
2658 int rtnl_socket,
f757855e 2659 FDSet *fds) {
69c79d3c 2660
03cfe0d5 2661 _cleanup_free_ char *home = NULL;
e01ff70a 2662 char as_uuid[37];
6aadfa4c 2663 unsigned n_env = 1;
03cfe0d5
LP
2664 const char *envp[] = {
2665 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2666 NULL, /* container */
03cfe0d5
LP
2667 NULL, /* TERM */
2668 NULL, /* HOME */
2669 NULL, /* USER */
2670 NULL, /* LOGNAME */
2671 NULL, /* container_uuid */
2672 NULL, /* LISTEN_FDS */
2673 NULL, /* LISTEN_PID */
9c1e04d0 2674 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2675 NULL
2676 };
88213476 2677
2371271c 2678 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2679 int r;
88213476 2680
03cfe0d5
LP
2681 assert(barrier);
2682 assert(directory);
2683 assert(kmsg_socket >= 0);
88213476 2684
efdb0237
LP
2685 cg_unified_flush();
2686
0de7acce 2687 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2688 /* Tell the parent, that it now can write the UID map. */
2689 (void) barrier_place(barrier); /* #1 */
7027ff61 2690
03cfe0d5
LP
2691 /* Wait until the parent wrote the UID map */
2692 if (!barrier_place_and_sync(barrier)) { /* #2 */
2693 log_error("Parent died too early");
2694 return -ESRCH;
2695 }
88213476
LP
2696 }
2697
6d66bd3b
EV
2698 r = reset_uid_gid();
2699 if (r < 0)
2700 return log_error_errno(r, "Couldn't become new root: %m");
2701
0de7acce 2702 r = mount_all(NULL,
4f086aab 2703 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2704 arg_uid_shift,
2705 arg_uid_range,
2706 arg_selinux_apifs_context);
2707
03cfe0d5
LP
2708 if (r < 0)
2709 return r;
2710
4f086aab 2711 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2712 if (r < 0)
2713 return r;
2714
03cfe0d5
LP
2715 /* Wait until we are cgroup-ified, so that we
2716 * can mount the right cgroup path writable */
2717 if (!barrier_place_and_sync(barrier)) { /* #3 */
2718 log_error("Parent died too early");
2719 return -ESRCH;
88213476
LP
2720 }
2721
5a8ff0e6 2722 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2723 r = unshare(CLONE_NEWCGROUP);
2724 if (r < 0)
2725 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2726 r = mount_cgroups(
2727 "",
2728 arg_unified_cgroup_hierarchy,
2729 arg_userns_mode != USER_NAMESPACE_NO,
2730 arg_uid_shift,
2731 arg_uid_range,
5a8ff0e6 2732 arg_selinux_apifs_context,
ada54120 2733 true);
0996ef00
CB
2734 if (r < 0)
2735 return r;
2736 } else {
2737 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2738 if (r < 0)
2739 return r;
2740 }
ec16945e 2741
03cfe0d5
LP
2742 r = setup_boot_id(NULL);
2743 if (r < 0)
2744 return r;
ec16945e 2745
03cfe0d5
LP
2746 r = setup_kmsg(NULL, kmsg_socket);
2747 if (r < 0)
2748 return r;
2749 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2750
03cfe0d5 2751 umask(0022);
30535c16 2752
03cfe0d5
LP
2753 if (setsid() < 0)
2754 return log_error_errno(errno, "setsid() failed: %m");
2755
2756 if (arg_private_network)
2757 loopback_setup();
2758
7a8f6325
LP
2759 if (arg_expose_ports) {
2760 r = expose_port_send_rtnl(rtnl_socket);
2761 if (r < 0)
2762 return r;
2763 rtnl_socket = safe_close(rtnl_socket);
2764 }
03cfe0d5 2765
709f6e46
MS
2766 r = drop_capabilities();
2767 if (r < 0)
2768 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2769
2770 setup_hostname();
2771
050f7277 2772 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2773 if (personality(arg_personality) < 0)
2774 return log_error_errno(errno, "personality() failed: %m");
2775 } else if (secondary) {
2776 if (personality(PER_LINUX32) < 0)
2777 return log_error_errno(errno, "personality() failed: %m");
2778 }
2779
2780#ifdef HAVE_SELINUX
2781 if (arg_selinux_context)
2ed96880 2782 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2783 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2784#endif
2785
ee645080 2786 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2787 if (r < 0)
2788 return r;
2789
6aadfa4c
ILG
2790 /* LXC sets container=lxc, so follow the scheme here */
2791 envp[n_env++] = strjoina("container=", arg_container_service_name);
2792
03cfe0d5
LP
2793 envp[n_env] = strv_find_prefix(environ, "TERM=");
2794 if (envp[n_env])
313cefa1 2795 n_env++;
03cfe0d5
LP
2796
2797 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2798 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2799 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2800 return log_oom();
2801
3bbaff3e 2802 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2803
691675ba 2804 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2805 return log_oom();
03cfe0d5
LP
2806
2807 if (fdset_size(fds) > 0) {
2808 r = fdset_cloexec(fds, false);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2811
2812 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2813 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2814 return log_oom();
2815 }
9c1e04d0
AP
2816 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2817 return log_oom();
03cfe0d5 2818
2371271c
TG
2819 env_use = strv_env_merge(2, envp, arg_setenv);
2820 if (!env_use)
2821 return log_oom();
03cfe0d5
LP
2822
2823 /* Let the parent know that we are ready and
2824 * wait until the parent is ready with the
2825 * setup, too... */
2826 if (!barrier_place_and_sync(barrier)) { /* #4 */
2827 log_error("Parent died too early");
2828 return -ESRCH;
2829 }
2830
5f932eb9
LP
2831 if (arg_chdir)
2832 if (chdir(arg_chdir) < 0)
2833 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2834
7732f92b
LP
2835 if (arg_start_mode == START_PID2) {
2836 r = stub_pid1();
2837 if (r < 0)
2838 return r;
2839 }
2840
03cfe0d5
LP
2841 /* Now, explicitly close the log, so that we
2842 * then can close all remaining fds. Closing
2843 * the log explicitly first has the benefit
2844 * that the logging subsystem knows about it,
2845 * and is thus ready to be reopened should we
2846 * need it again. Note that the other fds
2847 * closed here are at least the locking and
2848 * barrier fds. */
2849 log_close();
2850 (void) fdset_close_others(fds);
2851
7732f92b 2852 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2853 char **a;
2854 size_t m;
2855
2856 /* Automatically search for the init system */
2857
75f32f04
ZJS
2858 m = strv_length(arg_parameters);
2859 a = newa(char*, m + 2);
2860 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2861 a[1 + m] = NULL;
03cfe0d5
LP
2862
2863 a[0] = (char*) "/usr/lib/systemd/systemd";
2864 execve(a[0], a, env_use);
2865
2866 a[0] = (char*) "/lib/systemd/systemd";
2867 execve(a[0], a, env_use);
2868
2869 a[0] = (char*) "/sbin/init";
2870 execve(a[0], a, env_use);
f757855e
LP
2871 } else if (!strv_isempty(arg_parameters))
2872 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2873 else {
5f932eb9 2874 if (!arg_chdir)
d929b0f9
ZJS
2875 /* If we cannot change the directory, we'll end up in /, that is expected. */
2876 (void) chdir(home ?: "/root");
5f932eb9 2877
03cfe0d5
LP
2878 execle("/bin/bash", "-bash", NULL, env_use);
2879 execle("/bin/sh", "-sh", NULL, env_use);
2880 }
2881
35607a8d 2882 r = -errno;
03cfe0d5 2883 (void) log_open();
35607a8d 2884 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2885}
2886
9c1e04d0
AP
2887static int setup_sd_notify_child(void) {
2888 static const int one = 1;
2889 int fd = -1;
2890 union sockaddr_union sa = {
2891 .sa.sa_family = AF_UNIX,
2892 };
2893 int r;
2894
2895 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2896 if (fd < 0)
2897 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2898
2899 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2900 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2901
2902 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2903 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2904 if (r < 0) {
2905 safe_close(fd);
2906 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2907 }
2908
2909 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2910 if (r < 0) {
2911 safe_close(fd);
2912 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2913 }
2914
2915 return fd;
2916}
2917
03cfe0d5
LP
2918static int outer_child(
2919 Barrier *barrier,
2920 const char *directory,
2921 const char *console,
2922 const char *root_device, bool root_device_rw,
2923 const char *home_device, bool home_device_rw,
2924 const char *srv_device, bool srv_device_rw,
a6bc7db9 2925 const char *esp_device,
03cfe0d5
LP
2926 bool interactive,
2927 bool secondary,
2928 int pid_socket,
e01ff70a 2929 int uuid_socket,
9c1e04d0 2930 int notify_socket,
03cfe0d5
LP
2931 int kmsg_socket,
2932 int rtnl_socket,
825d5287 2933 int uid_shift_socket,
f757855e 2934 FDSet *fds) {
03cfe0d5
LP
2935
2936 pid_t pid;
2937 ssize_t l;
2938 int r;
9c1e04d0 2939 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2940
2941 assert(barrier);
2942 assert(directory);
2943 assert(console);
2944 assert(pid_socket >= 0);
e01ff70a 2945 assert(uuid_socket >= 0);
9c1e04d0 2946 assert(notify_socket >= 0);
03cfe0d5
LP
2947 assert(kmsg_socket >= 0);
2948
efdb0237
LP
2949 cg_unified_flush();
2950
03cfe0d5
LP
2951 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2952 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2953
2954 if (interactive) {
2955 close_nointr(STDIN_FILENO);
2956 close_nointr(STDOUT_FILENO);
2957 close_nointr(STDERR_FILENO);
2958
2959 r = open_terminal(console, O_RDWR);
2960 if (r != STDIN_FILENO) {
2961 if (r >= 0) {
2962 safe_close(r);
2963 r = -EINVAL;
2964 }
2965
2966 return log_error_errno(r, "Failed to open console: %m");
2967 }
2968
2969 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2970 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2971 return log_error_errno(errno, "Failed to duplicate console: %m");
2972 }
2973
2974 r = reset_audit_loginuid();
2975 if (r < 0)
2976 return r;
2977
2978 /* Mark everything as slave, so that we still
2979 * receive mounts from the real root, but don't
2980 * propagate mounts to the real root. */
60e76d48
ZJS
2981 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2982 if (r < 0)
2983 return r;
03cfe0d5
LP
2984
2985 r = mount_devices(directory,
2986 root_device, root_device_rw,
2987 home_device, home_device_rw,
a6bc7db9
LP
2988 srv_device, srv_device_rw,
2989 esp_device);
03cfe0d5
LP
2990 if (r < 0)
2991 return r;
2992
391567f4
LP
2993 r = determine_uid_shift(directory);
2994 if (r < 0)
2995 return r;
2996
0fd9563f
ZJS
2997 r = detect_unified_cgroup_hierarchy(directory);
2998 if (r < 0)
2999 return r;
3000
0de7acce 3001 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3002 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3003 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3004 if (l < 0)
3005 return log_error_errno(errno, "Failed to send UID shift: %m");
3006 if (l != sizeof(arg_uid_shift)) {
3007 log_error("Short write while sending UID shift.");
3008 return -EIO;
3009 }
0e7ac751 3010
0de7acce 3011 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3012 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3013 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3014 * not it will pick a different one, and send it back to us. */
3015
3016 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3017 if (l < 0)
3018 return log_error_errno(errno, "Failed to recv UID shift: %m");
3019 if (l != sizeof(arg_uid_shift)) {
595bfe7d 3020 log_error("Short read while receiving UID shift.");
0e7ac751
LP
3021 return -EIO;
3022 }
3023 }
3024
3025 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3026 }
3027
03cfe0d5 3028 /* Turn directory into bind mount */
60e76d48
ZJS
3029 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3030 if (r < 0)
3031 return r;
03cfe0d5 3032
19caffac
AC
3033 /* Mark everything as shared so our mounts get propagated down. This is
3034 * required to make new bind mounts available in systemd services
3035 * inside the containter that create a new mount namespace.
3036 * See https://github.com/systemd/systemd/issues/3860
3037 * Further submounts (such as /dev) done after this will inherit the
3038 * shared propagation mode.*/
60e76d48
ZJS
3039 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3040 if (r < 0)
3041 return r;
19caffac 3042
7336138e 3043 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
3044 if (r < 0)
3045 return r;
3046
0de7acce
LP
3047 r = setup_volatile(
3048 directory,
3049 arg_volatile_mode,
3050 arg_userns_mode != USER_NAMESPACE_NO,
3051 arg_uid_shift,
3052 arg_uid_range,
3053 arg_selinux_context);
03cfe0d5
LP
3054 if (r < 0)
3055 return r;
3056
0de7acce
LP
3057 r = setup_volatile_state(
3058 directory,
3059 arg_volatile_mode,
3060 arg_userns_mode != USER_NAMESPACE_NO,
3061 arg_uid_shift,
3062 arg_uid_range,
3063 arg_selinux_context);
03cfe0d5
LP
3064 if (r < 0)
3065 return r;
3066
03cfe0d5
LP
3067 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3068 if (r < 0)
3069 return r;
3070
03cfe0d5 3071 if (arg_read_only) {
6b7c9f8b 3072 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3073 if (r < 0)
3074 return log_error_errno(r, "Failed to make tree read-only: %m");
3075 }
3076
0de7acce 3077 r = mount_all(directory,
4f086aab 3078 arg_mount_settings,
0de7acce
LP
3079 arg_uid_shift,
3080 arg_uid_range,
3081 arg_selinux_apifs_context);
03cfe0d5
LP
3082 if (r < 0)
3083 return r;
3084
07fa00f9
LP
3085 r = copy_devnodes(directory);
3086 if (r < 0)
03cfe0d5
LP
3087 return r;
3088
3089 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3090
07fa00f9
LP
3091 r = setup_pts(directory);
3092 if (r < 0)
03cfe0d5
LP
3093 return r;
3094
3095 r = setup_propagate(directory);
3096 if (r < 0)
3097 return r;
3098
3099 r = setup_dev_console(directory, console);
3100 if (r < 0)
3101 return r;
3102
520e0d54 3103 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3104 if (r < 0)
3105 return r;
3106
3107 r = setup_timezone(directory);
3108 if (r < 0)
3109 return r;
3110
3111 r = setup_resolv_conf(directory);
3112 if (r < 0)
3113 return r;
3114
e01ff70a
MS
3115 r = setup_machine_id(directory);
3116 if (r < 0)
3117 return r;
3118
03cfe0d5
LP
3119 r = setup_journal(directory);
3120 if (r < 0)
3121 return r;
3122
0de7acce
LP
3123 r = mount_custom(
3124 directory,
3125 arg_custom_mounts,
3126 arg_n_custom_mounts,
3127 arg_userns_mode != USER_NAMESPACE_NO,
3128 arg_uid_shift,
3129 arg_uid_range,
3130 arg_selinux_apifs_context);
03cfe0d5
LP
3131 if (r < 0)
3132 return r;
3133
5a8ff0e6 3134 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3135 r = mount_cgroups(
3136 directory,
3137 arg_unified_cgroup_hierarchy,
3138 arg_userns_mode != USER_NAMESPACE_NO,
3139 arg_uid_shift,
3140 arg_uid_range,
5a8ff0e6 3141 arg_selinux_apifs_context,
ada54120 3142 false);
0996ef00
CB
3143 if (r < 0)
3144 return r;
3145 }
03cfe0d5
LP
3146
3147 r = mount_move_root(directory);
3148 if (r < 0)
3149 return log_error_errno(r, "Failed to move root directory: %m");
3150
9c1e04d0
AP
3151 fd = setup_sd_notify_child();
3152 if (fd < 0)
3153 return fd;
3154
03cfe0d5 3155 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3156 arg_clone_ns_flags |
03cfe0d5 3157 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3158 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3159 if (pid < 0)
3160 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3161 if (pid == 0) {
3162 pid_socket = safe_close(pid_socket);
e01ff70a 3163 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3164 notify_socket = safe_close(notify_socket);
825d5287 3165 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3166
3167 /* The inner child has all namespaces that are
3168 * requested, so that we all are owned by the user if
3169 * user namespaces are turned on. */
3170
f757855e 3171 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3172 if (r < 0)
3173 _exit(EXIT_FAILURE);
3174
3175 _exit(EXIT_SUCCESS);
3176 }
3177
3178 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3179 if (l < 0)
3180 return log_error_errno(errno, "Failed to send PID: %m");
3181 if (l != sizeof(pid)) {
3182 log_error("Short write while sending PID.");
3183 return -EIO;
3184 }
3185
e01ff70a
MS
3186 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3187 if (l < 0)
3188 return log_error_errno(errno, "Failed to send machine ID: %m");
3189 if (l != sizeof(arg_uuid)) {
3190 log_error("Short write while sending machine ID.");
3191 return -EIO;
3192 }
3193
9c1e04d0
AP
3194 l = send_one_fd(notify_socket, fd, 0);
3195 if (l < 0)
3196 return log_error_errno(errno, "Failed to send notify fd: %m");
3197
03cfe0d5 3198 pid_socket = safe_close(pid_socket);
e01ff70a 3199 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3200 notify_socket = safe_close(notify_socket);
327e26d6
KN
3201 kmsg_socket = safe_close(kmsg_socket);
3202 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3203
3204 return 0;
3205}
3206
0e7ac751
LP
3207static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3208 unsigned n_tries = 100;
3209 uid_t candidate;
3210 int r;
3211
3212 assert(shift);
3213 assert(ret_lock_file);
0de7acce 3214 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3215 assert(arg_uid_range == 0x10000U);
3216
3217 candidate = *shift;
3218
3219 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3220
3221 for (;;) {
3222 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3223 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3224
3225 if (--n_tries <= 0)
3226 return -EBUSY;
3227
3228 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3229 goto next;
3230 if ((candidate & UINT32_C(0xFFFF)) != 0)
3231 goto next;
3232
3233 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3234 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3235 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3236 goto next;
3237 if (r < 0)
3238 return r;
3239
3240 /* Make some superficial checks whether the range is currently known in the user database */
3241 if (getpwuid(candidate))
3242 goto next;
3243 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3244 goto next;
3245 if (getgrgid(candidate))
3246 goto next;
3247 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3248 goto next;
3249
3250 *ret_lock_file = lf;
3251 lf = (struct LockFile) LOCK_FILE_INIT;
3252 *shift = candidate;
3253 return 0;
3254
3255 next:
3256 random_bytes(&candidate, sizeof(candidate));
3257 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3258 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3259 }
3260}
3261
03cfe0d5
LP
3262static int setup_uid_map(pid_t pid) {
3263 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3264 int r;
3265
3266 assert(pid > 1);
3267
3268 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3269 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3270 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3271 if (r < 0)
3272 return log_error_errno(r, "Failed to write UID map: %m");
3273
3274 /* We always assign the same UID and GID ranges */
3275 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3276 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3277 if (r < 0)
3278 return log_error_errno(r, "Failed to write GID map: %m");
3279
3280 return 0;
3281}
3282
9c1e04d0 3283static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3284 char buf[NOTIFY_BUFFER_MAX+1];
3285 char *p = NULL;
3286 struct iovec iovec = {
3287 .iov_base = buf,
3288 .iov_len = sizeof(buf)-1,
3289 };
3290 union {
3291 struct cmsghdr cmsghdr;
3292 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3293 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3294 } control = {};
3295 struct msghdr msghdr = {
3296 .msg_iov = &iovec,
3297 .msg_iovlen = 1,
3298 .msg_control = &control,
3299 .msg_controllen = sizeof(control),
3300 };
3301 struct cmsghdr *cmsg;
3302 struct ucred *ucred = NULL;
3303 ssize_t n;
3304 pid_t inner_child_pid;
3305 _cleanup_strv_free_ char **tags = NULL;
3306
3307 assert(userdata);
3308
3309 inner_child_pid = PTR_TO_PID(userdata);
3310
3311 if (revents != EPOLLIN) {
3312 log_warning("Got unexpected poll event for notify fd.");
3313 return 0;
3314 }
3315
3316 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3317 if (n < 0) {
3318 if (errno == EAGAIN || errno == EINTR)
3319 return 0;
3320
3321 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3322 }
3323 cmsg_close_all(&msghdr);
3324
3325 CMSG_FOREACH(cmsg, &msghdr) {
3326 if (cmsg->cmsg_level == SOL_SOCKET &&
3327 cmsg->cmsg_type == SCM_CREDENTIALS &&
3328 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3329
3330 ucred = (struct ucred*) CMSG_DATA(cmsg);
3331 }
3332 }
3333
3334 if (!ucred || ucred->pid != inner_child_pid) {
3335 log_warning("Received notify message without valid credentials. Ignoring.");
3336 return 0;
3337 }
3338
3339 if ((size_t) n >= sizeof(buf)) {
3340 log_warning("Received notify message exceeded maximum size. Ignoring.");
3341 return 0;
3342 }
3343
3344 buf[n] = 0;
3345 tags = strv_split(buf, "\n\r");
3346 if (!tags)
3347 return log_oom();
3348
3349 if (strv_find(tags, "READY=1"))
3350 sd_notifyf(false, "READY=1\n");
3351
3352 p = strv_find_startswith(tags, "STATUS=");
3353 if (p)
3354 sd_notifyf(false, "STATUS=Container running: %s", p);
3355
3356 return 0;
3357}
3358
3359static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3360 int r;
3361 sd_event_source *notify_event_source;
3362
3363 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3364 if (r < 0)
3365 return log_error_errno(r, "Failed to allocate notify event source: %m");
3366
3367 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3368
3369 return 0;
3370}
3371
f757855e
LP
3372static int load_settings(void) {
3373 _cleanup_(settings_freep) Settings *settings = NULL;
3374 _cleanup_fclose_ FILE *f = NULL;
3375 _cleanup_free_ char *p = NULL;
3376 const char *fn, *i;
3377 int r;
3378
3379 /* If all settings are masked, there's no point in looking for
3380 * the settings file */
3381 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3382 return 0;
3383
3384 fn = strjoina(arg_machine, ".nspawn");
3385
3386 /* We first look in the admin's directories in /etc and /run */
3387 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3388 _cleanup_free_ char *j = NULL;
3389
605405c6 3390 j = strjoin(i, "/", fn);
f757855e
LP
3391 if (!j)
3392 return log_oom();
3393
3394 f = fopen(j, "re");
3395 if (f) {
3396 p = j;
3397 j = NULL;
3398
b938cb90 3399 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3400 if (arg_settings_trusted < 0)
3401 arg_settings_trusted = true;
3402
3403 break;
3404 }
3405
3406 if (errno != ENOENT)
3407 return log_error_errno(errno, "Failed to open %s: %m", j);
3408 }
3409
3410 if (!f) {
3411 /* After that, let's look for a file next to the
3412 * actual image we shall boot. */
3413
3414 if (arg_image) {
3415 p = file_in_same_dir(arg_image, fn);
3416 if (!p)
3417 return log_oom();
3418 } else if (arg_directory) {
3419 p = file_in_same_dir(arg_directory, fn);
3420 if (!p)
3421 return log_oom();
3422 }
3423
3424 if (p) {
3425 f = fopen(p, "re");
3426 if (!f && errno != ENOENT)
3427 return log_error_errno(errno, "Failed to open %s: %m", p);
3428
b938cb90 3429 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3430 if (arg_settings_trusted < 0)
3431 arg_settings_trusted = false;
3432 }
3433 }
3434
3435 if (!f)
3436 return 0;
3437
3438 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3439
3440 r = settings_load(f, p, &settings);
3441 if (r < 0)
3442 return r;
3443
3444 /* Copy over bits from the settings, unless they have been
3445 * explicitly masked by command line switches. */
3446
7732f92b
LP
3447 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3448 settings->start_mode >= 0) {
3449 arg_start_mode = settings->start_mode;
f757855e
LP
3450
3451 strv_free(arg_parameters);
3452 arg_parameters = settings->parameters;
3453 settings->parameters = NULL;
3454 }
3455
5f932eb9
LP
3456 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3457 settings->working_directory) {
3458 free(arg_chdir);
3459 arg_chdir = settings->working_directory;
3460 settings->working_directory = NULL;
3461 }
3462
f757855e
LP
3463 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3464 settings->environment) {
3465 strv_free(arg_setenv);
3466 arg_setenv = settings->environment;
3467 settings->environment = NULL;
3468 }
3469
3470 if ((arg_settings_mask & SETTING_USER) == 0 &&
3471 settings->user) {
3472 free(arg_user);
3473 arg_user = settings->user;
3474 settings->user = NULL;
3475 }
3476
3477 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3478 uint64_t plus;
f757855e 3479
0e265674
LP
3480 plus = settings->capability;
3481 if (settings_private_network(settings))
3482 plus |= (1ULL << CAP_NET_ADMIN);
3483
3484 if (!arg_settings_trusted && plus != 0) {
3485 if (settings->capability != 0)
3486 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3487 } else
520e0d54 3488 arg_caps_retain |= plus;
f757855e 3489
520e0d54 3490 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3491 }
3492
3493 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3494 settings->kill_signal > 0)
3495 arg_kill_signal = settings->kill_signal;
3496
3497 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3498 settings->personality != PERSONALITY_INVALID)
3499 arg_personality = settings->personality;
3500
3501 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3502 !sd_id128_is_null(settings->machine_id)) {
3503
3504 if (!arg_settings_trusted)
3505 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3506 else
3507 arg_uuid = settings->machine_id;
3508 }
3509
3510 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3511 settings->read_only >= 0)
3512 arg_read_only = settings->read_only;
3513
3514 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3515 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3516 arg_volatile_mode = settings->volatile_mode;
3517
3518 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3519 settings->n_custom_mounts > 0) {
3520
3521 if (!arg_settings_trusted)
3522 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3523 else {
3524 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3525 arg_custom_mounts = settings->custom_mounts;
3526 arg_n_custom_mounts = settings->n_custom_mounts;
3527
3528 settings->custom_mounts = NULL;
3529 settings->n_custom_mounts = 0;
3530 }
3531 }
3532
3533 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3534 (settings->private_network >= 0 ||
3535 settings->network_veth >= 0 ||
3536 settings->network_bridge ||
22b28dfd 3537 settings->network_zone ||
f757855e
LP
3538 settings->network_interfaces ||
3539 settings->network_macvlan ||
f6d6bad1
LP
3540 settings->network_ipvlan ||
3541 settings->network_veth_extra)) {
f757855e
LP
3542
3543 if (!arg_settings_trusted)
3544 log_warning("Ignoring network settings, file %s is not trusted.", p);
3545 else {
f6d6bad1 3546 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3547 arg_private_network = settings_private_network(settings);
3548
f757855e
LP
3549 strv_free(arg_network_interfaces);
3550 arg_network_interfaces = settings->network_interfaces;
3551 settings->network_interfaces = NULL;
3552
3553 strv_free(arg_network_macvlan);
3554 arg_network_macvlan = settings->network_macvlan;
3555 settings->network_macvlan = NULL;
3556
3557 strv_free(arg_network_ipvlan);
3558 arg_network_ipvlan = settings->network_ipvlan;
3559 settings->network_ipvlan = NULL;
3560
f6d6bad1
LP
3561 strv_free(arg_network_veth_extra);
3562 arg_network_veth_extra = settings->network_veth_extra;
3563 settings->network_veth_extra = NULL;
3564
f757855e
LP
3565 free(arg_network_bridge);
3566 arg_network_bridge = settings->network_bridge;
3567 settings->network_bridge = NULL;
22b28dfd
LP
3568
3569 free(arg_network_zone);
3570 arg_network_zone = settings->network_zone;
3571 settings->network_zone = NULL;
f757855e
LP
3572 }
3573 }
3574
3575 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3576 settings->expose_ports) {
3577
3578 if (!arg_settings_trusted)
3579 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3580 else {
3581 expose_port_free_all(arg_expose_ports);
3582 arg_expose_ports = settings->expose_ports;
3583 settings->expose_ports = NULL;
3584 }
3585 }
3586
0de7acce
LP
3587 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3588 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3589
3590 if (!arg_settings_trusted)
3591 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3592 else {
3593 arg_userns_mode = settings->userns_mode;
3594 arg_uid_shift = settings->uid_shift;
3595 arg_uid_range = settings->uid_range;
3596 arg_userns_chown = settings->userns_chown;
3597 }
3598 }
3599
9c1e04d0
AP
3600 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3601 arg_notify_ready = settings->notify_ready;
3602
f757855e
LP
3603 return 0;
3604}
3605
b0067625
ZJS
3606static int run(int master,
3607 const char* console,
3608 const char *root_device, bool root_device_rw,
3609 const char *home_device, bool home_device_rw,
3610 const char *srv_device, bool srv_device_rw,
3611 const char *esp_device,
3612 bool interactive,
3613 bool secondary,
3614 FDSet *fds,
3615 char veth_name[IFNAMSIZ], bool *veth_created,
3616 union in_addr_union *exposed,
3617 pid_t *pid, int *ret) {
3618
3619 static const struct sigaction sa = {
3620 .sa_handler = nop_signal_handler,
e28c7cd0 3621 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3622 };
3623
3624 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3625 _cleanup_close_ int etc_passwd_lock = -1;
3626 _cleanup_close_pair_ int
3627 kmsg_socket_pair[2] = { -1, -1 },
3628 rtnl_socket_pair[2] = { -1, -1 },
3629 pid_socket_pair[2] = { -1, -1 },
3630 uuid_socket_pair[2] = { -1, -1 },
3631 notify_socket_pair[2] = { -1, -1 },
3632 uid_shift_socket_pair[2] = { -1, -1 };
3633 _cleanup_close_ int notify_socket= -1;
3634 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3635 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3636 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3637 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3638 ContainerStatus container_status = 0;
3639 char last_char = 0;
3640 int ifi = 0, r;
3641 ssize_t l;
3642 sigset_t mask_chld;
3643
3644 assert_se(sigemptyset(&mask_chld) == 0);
3645 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3646
3647 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3648 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3649 * check with getpwuid() if the specific user already exists. Note that /etc might be
3650 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3651 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3652 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3653 * really ours. */
3654
3655 etc_passwd_lock = take_etc_passwd_lock(NULL);
3656 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3657 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3658 }
3659
3660 r = barrier_create(&barrier);
3661 if (r < 0)
3662 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3663
3664 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3665 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3666
3667 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3668 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3669
3670 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3671 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3672
3673 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3674 return log_error_errno(errno, "Failed to create id socket pair: %m");
3675
3676 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3677 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3678
3679 if (arg_userns_mode != USER_NAMESPACE_NO)
3680 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3681 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3682
3683 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3684 * parent's blocking calls and give it a chance to call wait() and terminate. */
3685 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3686 if (r < 0)
3687 return log_error_errno(errno, "Failed to change the signal mask: %m");
3688
3689 r = sigaction(SIGCHLD, &sa, NULL);
3690 if (r < 0)
3691 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3692
3693 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3694 if (*pid < 0)
3695 return log_error_errno(errno, "clone() failed%s: %m",
3696 errno == EINVAL ?
3697 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3698
3699 if (*pid == 0) {
3700 /* The outer child only has a file system namespace. */
3701 barrier_set_role(&barrier, BARRIER_CHILD);
3702
3703 master = safe_close(master);
3704
3705 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3706 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3707 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3708 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3709 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3710 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3711
3712 (void) reset_all_signal_handlers();
3713 (void) reset_signal_mask();
3714
3715 r = outer_child(&barrier,
3716 arg_directory,
3717 console,
3718 root_device, root_device_rw,
3719 home_device, home_device_rw,
3720 srv_device, srv_device_rw,
3721 esp_device,
3722 interactive,
3723 secondary,
3724 pid_socket_pair[1],
3725 uuid_socket_pair[1],
3726 notify_socket_pair[1],
3727 kmsg_socket_pair[1],
3728 rtnl_socket_pair[1],
3729 uid_shift_socket_pair[1],
3730 fds);
3731 if (r < 0)
3732 _exit(EXIT_FAILURE);
3733
3734 _exit(EXIT_SUCCESS);
3735 }
3736
3737 barrier_set_role(&barrier, BARRIER_PARENT);
3738
3739 fds = fdset_free(fds);
3740
3741 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3742 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3743 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3744 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3745 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3746 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3747
3748 if (arg_userns_mode != USER_NAMESPACE_NO) {
3749 /* The child just let us know the UID shift it might have read from the image. */
3750 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3751 if (l < 0)
3752 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3753 if (l != sizeof arg_uid_shift) {
3754 log_error("Short read while reading UID shift.");
3755 return -EIO;
3756 }
3757
3758 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3759 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3760 * image, but if that's already in use, pick a new one, and report back to the child,
3761 * which one we now picked. */
3762
3763 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3764 if (r < 0)
3765 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3766
3767 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3768 if (l < 0)
3769 return log_error_errno(errno, "Failed to send UID shift: %m");
3770 if (l != sizeof arg_uid_shift) {
3771 log_error("Short write while writing UID shift.");
3772 return -EIO;
3773 }
3774 }
3775 }
3776
3777 /* Wait for the outer child. */
3778 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3779 if (r != 0)
3780 return r < 0 ? r : -EIO;
3781
3782 /* And now retrieve the PID of the inner child. */
3783 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3784 if (l < 0)
3785 return log_error_errno(errno, "Failed to read inner child PID: %m");
3786 if (l != sizeof *pid) {
3787 log_error("Short read while reading inner child PID.");
3788 return -EIO;
3789 }
3790
3791 /* We also retrieve container UUID in case it was generated by outer child */
3792 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3793 if (l < 0)
3794 return log_error_errno(errno, "Failed to read container machine ID: %m");
3795 if (l != sizeof(arg_uuid)) {
3796 log_error("Short read while reading container machined ID.");
3797 return -EIO;
3798 }
3799
3800 /* We also retrieve the socket used for notifications generated by outer child */
3801 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3802 if (notify_socket < 0)
3803 return log_error_errno(notify_socket,
3804 "Failed to receive notification socket from the outer child: %m");
3805
3806 log_debug("Init process invoked as PID "PID_FMT, *pid);
3807
3808 if (arg_userns_mode != USER_NAMESPACE_NO) {
3809 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3810 log_error("Child died too early.");
3811 return -ESRCH;
3812 }
3813
3814 r = setup_uid_map(*pid);
3815 if (r < 0)
3816 return r;
3817
3818 (void) barrier_place(&barrier); /* #2 */
3819 }
3820
3821 if (arg_private_network) {
3822
3823 r = move_network_interfaces(*pid, arg_network_interfaces);
3824 if (r < 0)
3825 return r;
3826
3827 if (arg_network_veth) {
3828 r = setup_veth(arg_machine, *pid, veth_name,
3829 arg_network_bridge || arg_network_zone);
3830 if (r < 0)
3831 return r;
3832 else if (r > 0)
3833 ifi = r;
3834
3835 if (arg_network_bridge) {
3836 /* Add the interface to a bridge */
3837 r = setup_bridge(veth_name, arg_network_bridge, false);
3838 if (r < 0)
3839 return r;
3840 if (r > 0)
3841 ifi = r;
3842 } else if (arg_network_zone) {
3843 /* Add the interface to a bridge, possibly creating it */
3844 r = setup_bridge(veth_name, arg_network_zone, true);
3845 if (r < 0)
3846 return r;
3847 if (r > 0)
3848 ifi = r;
3849 }
3850 }
3851
3852 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3853 if (r < 0)
3854 return r;
3855
3856 /* We created the primary and extra veth links now; let's remember this, so that we know to
3857 remove them later on. Note that we don't bother with removing veth links that were created
3858 here when their setup failed half-way, because in that case the kernel should be able to
3859 remove them on its own, since they cannot be referenced by anything yet. */
3860 *veth_created = true;
3861
3862 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3863 if (r < 0)
3864 return r;
3865
3866 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3867 if (r < 0)
3868 return r;
3869 }
3870
3871 if (arg_register) {
3872 r = register_machine(
3873 arg_machine,
3874 *pid,
3875 arg_directory,
3876 arg_uuid,
3877 ifi,
3878 arg_slice,
3879 arg_custom_mounts, arg_n_custom_mounts,
3880 arg_kill_signal,
3881 arg_property,
3882 arg_keep_unit,
3883 arg_container_service_name);
3884 if (r < 0)
3885 return r;
3886 }
3887
f0bef277 3888 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3889 if (r < 0)
3890 return r;
3891
3892 if (arg_keep_unit) {
3893 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3894 if (r < 0)
3895 return r;
3896 }
3897
3898 r = chown_cgroup(*pid, arg_uid_shift);
3899 if (r < 0)
3900 return r;
3901
3902 /* Notify the child that the parent is ready with all
3903 * its setup (including cgroup-ification), and that
3904 * the child can now hand over control to the code to
3905 * run inside the container. */
3906 (void) barrier_place(&barrier); /* #3 */
3907
3908 /* Block SIGCHLD here, before notifying child.
3909 * process_pty() will handle it with the other signals. */
3910 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3911
3912 /* Reset signal to default */
3913 r = default_signals(SIGCHLD, -1);
3914 if (r < 0)
3915 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3916
3917 r = sd_event_new(&event);
3918 if (r < 0)
3919 return log_error_errno(r, "Failed to get default event source: %m");
3920
3921 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
3922 if (r < 0)
3923 return r;
3924
3925 /* Let the child know that we are ready and wait that the child is completely ready now. */
3926 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3927 log_error("Child died too early.");
3928 return -ESRCH;
3929 }
3930
3931 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3932 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3933 etc_passwd_lock = safe_close(etc_passwd_lock);
3934
3935 sd_notifyf(false,
3936 "STATUS=Container running.\n"
3937 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3938 if (!arg_notify_ready)
3939 sd_notify(false, "READY=1\n");
3940
3941 if (arg_kill_signal > 0) {
3942 /* Try to kill the init system on SIGINT or SIGTERM */
3943 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3944 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3945 } else {
3946 /* Immediately exit */
3947 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3948 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3949 }
3950
3951 /* simply exit on sigchld */
3952 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3953
3954 if (arg_expose_ports) {
3955 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3956 if (r < 0)
3957 return r;
3958
3959 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3960 }
3961
3962 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3963
3964 r = pty_forward_new(event, master,
3965 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3966 &forward);
3967 if (r < 0)
3968 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3969
3970 r = sd_event_loop(event);
3971 if (r < 0)
3972 return log_error_errno(r, "Failed to run event loop: %m");
3973
3974 pty_forward_get_last_char(forward, &last_char);
3975
3976 forward = pty_forward_free(forward);
3977
3978 if (!arg_quiet && last_char != '\n')
3979 putc('\n', stdout);
3980
3981 /* Kill if it is not dead yet anyway */
3982 if (arg_register && !arg_keep_unit)
3983 terminate_machine(*pid);
3984
3985 /* Normally redundant, but better safe than sorry */
c67b0082 3986 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3987
3988 r = wait_for_container(*pid, &container_status);
3989 *pid = 0;
3990
3991 if (r < 0)
3992 /* We failed to wait for the container, or the container exited abnormally. */
3993 return r;
3994 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3995 /* r > 0 → The container exited with a non-zero status.
3996 * As a special case, we need to replace 133 with a different value,
3997 * because 133 is special-cased in the service file to reboot the container.
3998 * otherwise → The container exited with zero status and a reboot was not requested.
3999 */
2a49b612 4000 if (r == EXIT_FORCE_RESTART)
27e29a1e 4001 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4002 *ret = r;
b0067625
ZJS
4003 return 0; /* finito */
4004 }
4005
4006 /* CONTAINER_REBOOTED, loop again */
4007
4008 if (arg_keep_unit) {
4009 /* Special handling if we are running as a service: instead of simply
4010 * restarting the machine we want to restart the entire service, so let's
4011 * inform systemd about this with the special exit code 133. The service
4012 * file uses RestartForceExitStatus=133 so that this results in a full
4013 * nspawn restart. This is necessary since we might have cgroup parameters
4014 * set we want to have flushed out. */
2a49b612
ZJS
4015 *ret = EXIT_FORCE_RESTART;
4016 return 0; /* finito */
b0067625
ZJS
4017 }
4018
4019 expose_port_flush(arg_expose_ports, exposed);
4020
4021 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4022 *veth_created = false;
4023 return 1; /* loop again */
4024}
4025
03cfe0d5
LP
/* Program entry point.
 *
 * Parses and verifies the arguments, prepares the container root (a plain directory, a
 * throw-away snapshot of one, or a disk image attached below a temporary directory),
 * allocates the pty used as container console and then calls run() repeatedly until it
 * returns <= 0. All exits go through the "finish" label, which undoes whatever was set up
 * so far, guided by the remove_* / veth_created flags and by pid.
 *
 * Returns EXIT_FAILURE if r ended up negative, otherwise the exit status stored in ret
 * (EXIT_SUCCESS unless run() changed it). */
4026int main(int argc, char *argv[]) {
4027
a6bc7db9 4028 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
4029 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4030 _cleanup_close_ int master = -1, image_fd = -1;
4031 _cleanup_fdset_free_ FDSet *fds = NULL;
cfed63f6 4032 int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS;
5aa3eba5 4033 char veth_name[IFNAMSIZ] = "";
17cbb288 4034 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4035 pid_t pid = 0;
03cfe0d5
LP
4036 union in_addr_union exposed = {};
4037 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4038 bool interactive, veth_created = false, remove_tmprootdir = false;
4039 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
03cfe0d5
LP
4040
4041 log_parse_environment();
4042 log_open();
4043
7732f92b
LP
4044 /* Make sure rename_process() in the stub init process can work */
4045 saved_argv = argv;
4046 saved_argc = argc;
4047
03cfe0d5
LP
/* r == 0 also leaves via finish; as only r < 0 is mapped to EXIT_FAILURE at the end, that
 * case exits with the initial ret, i.e. EXIT_SUCCESS. */
4048 r = parse_argv(argc, argv);
4049 if (r <= 0)
4050 goto finish;
4051
03cfe0d5
LP
4052 if (geteuid() != 0) {
4053 log_error("Need to be root.");
4054 r = -EPERM;
4055 goto finish;
4056 }
f757855e
LP
4057 r = determine_names();
4058 if (r < 0)
4059 goto finish;
4060
4061 r = load_settings();
4062 if (r < 0)
4063 goto finish;
4064
4065 r = verify_arguments();
4066 if (r < 0)
4067 goto finish;
03cfe0d5
LP
4068
/* If file descriptors were passed to us, collect them in fds; the set is handed to run()
 * further down. */
4069 n_fd_passed = sd_listen_fds(false);
4070 if (n_fd_passed > 0) {
4071 r = fdset_new_listen_fds(&fds, false);
4072 if (r < 0) {
4073 log_error_errno(r, "Failed to collect file descriptors: %m");
4074 goto finish;
4075 }
4076 }
4077
/* Directory mode. arg_image must not be set at the same time (asserted below); the else
 * branch of this conditional handles image mode. */
4078 if (arg_directory) {
4079 assert(!arg_image);
4080
4081 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4082 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4083 r = -EINVAL;
4084 goto finish;
4085 }
4086
/* Ephemeral: run from a snapshot of arg_directory created under a random name. Once it
 * exists arg_directory is switched over to it and remove_directory is set, so that the
 * finish path deletes the snapshot again. */
4087 if (arg_ephemeral) {
4088 _cleanup_free_ char *np = NULL;
4089
8d4aa2bb 4090 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
4091 if (r < 0)
4092 goto finish;
4093
03cfe0d5
LP
4094 /* If the specified path is a mount point we
4095 * generate the new snapshot immediately
4096 * inside it under a random name. However if
4097 * the specified is not a mount point we
4098 * create the new snapshot in the parent
4099 * directory, just next to it. */
e1873695 4100 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
4101 if (r < 0) {
4102 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4103 goto finish;
4104 }
4105 if (r > 0)
770b5ce4 4106 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4107 else
770b5ce4 4108 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4109 if (r < 0) {
0f3be6ca 4110 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4111 goto finish;
4112 }
4113
/* The lock is taken on the new name np, before the snapshot is created there. */
4114 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4115 if (r < 0) {
4116 log_error_errno(r, "Failed to lock %s: %m", np);
4117 goto finish;
4118 }
4119
17cbb288
LP
4120 r = btrfs_subvol_snapshot(arg_directory, np,
4121 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4122 BTRFS_SNAPSHOT_FALLBACK_COPY |
4123 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4124 BTRFS_SNAPSHOT_RECURSIVE |
4125 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4126 if (r < 0) {
4127 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4128 goto finish;
ec16945e
LP
4129 }
4130
/* Hand ownership of np over to arg_directory; np is reset to NULL so that its
 * _cleanup_free_ handler does not free the string we just stored. */
4131 free(arg_directory);
4132 arg_directory = np;
8a16a7b4 4133 np = NULL;
ec16945e 4134
17cbb288 4135 remove_directory = true;
30535c16
LP
4136
4137 } else {
/* Non-ephemeral: operate on the directory itself. With a template the directory is
 * allowed to not exist yet (CHASE_NONEXISTENT), since it may get populated below. */
cb638b5e 4138 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
4139 if (r < 0)
4140 goto finish;
4141
30535c16
LP
/* -EBUSY gets its own message; every other lock failure uses the generic one. */
4142 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4143 if (r == -EBUSY) {
4144 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4145 goto finish;
4146 }
4147 if (r < 0) {
4148 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4149 goto finish;
30535c16
LP
4150 }
4151
4152 if (arg_template) {
8d4aa2bb 4153 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
4154 if (r < 0)
4155 goto finish;
4156
17cbb288
LP
4157 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4158 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4159 BTRFS_SNAPSHOT_FALLBACK_COPY |
4160 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4161 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4162 BTRFS_SNAPSHOT_RECURSIVE |
4163 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
/* -EEXIST is not an error here: an already existing directory is used as it is. */
4164 if (r == -EEXIST) {
4165 if (!arg_quiet)
4166 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4167 } else if (r < 0) {
83521414 4168 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4169 goto finish;
4170 } else {
4171 if (!arg_quiet)
4172 log_info("Populated %s from template %s.", arg_directory, arg_template);
4173 }
4174 }
ec16945e
LP
4175 }
4176
/* Sanity check of the tree: in START_BOOT mode path_is_os_tree() has to return > 0
 * (zero and errors are both refused with -EINVAL); in the other start modes it is
 * enough that <directory>/usr/ is accessible. */
7732f92b 4177 if (arg_start_mode == START_BOOT) {
1b9e5b12 4178 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4179 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4180 r = -EINVAL;
1b9e5b12
LP
4181 goto finish;
4182 }
4183 } else {
4184 const char *p;
4185
16fb773e
LP
4186 p = strjoina(arg_directory, "/usr/");
4187 if (laccess(p, F_OK) < 0) {
4188 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 4189 r = -EINVAL;
1b9e5b12 4190 goto finish;
1b9e5b12
LP
4191 }
4192 }
ec16945e 4193
6b9132a9 4194 } else {
ec16945e
LP
/* Image mode: arg_image has to be set and templates are not allowed (both asserted). */
4195 assert(arg_image);
4196 assert(!arg_template);
4197
8d4aa2bb 4198 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
4199 if (r < 0)
4200 goto finish;
4201
0f3be6ca
LP
/* Ephemeral image: work on a private copy of the image file under a random name.
 * remove_image makes sure the copy gets unlinked again, either early right after
 * dissect_image() below or at finish. */
4202 if (arg_ephemeral) {
4203 _cleanup_free_ char *np = NULL;
4204
4205 r = tempfn_random(arg_image, "machine.", &np);
4206 if (r < 0) {
4207 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4208 goto finish;
4209 }
4210
4211 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4212 if (r < 0) {
4213 r = log_error_errno(r, "Failed to create image lock: %m");
4214 goto finish;
4215 }
4216
4217 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL);
4218 if (r < 0) {
4219 r = log_error_errno(r, "Failed to copy image file: %m");
4220 goto finish;
4221 }
4222
/* As in directory mode: ownership of np moves to arg_image, np is cleared so that it is
 * not freed on scope exit. */
4223 free(arg_image);
4224 arg_image = np;
4225 np = NULL;
4226
4227 remove_image = true;
4228 } else {
4229 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4230 if (r == -EBUSY) {
4231 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4232 goto finish;
4233 }
4234 if (r < 0) {
4235 r = log_error_errno(r, "Failed to create image lock: %m");
4236 goto finish;
4237 }
30535c16
LP
4238 }
4239
/* mkdtemp() replaces the XXXXXX part of tmprootdir in place. The directory becomes
 * arg_directory for the rest of the function and is rmdir()ed at finish. */
c67b0082 4240 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4241 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4242 goto finish;
1b9e5b12 4243 }
6b9132a9 4244
c67b0082
LP
4245 remove_tmprootdir = true;
4246
4247 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4248 if (!arg_directory) {
4249 r = log_oom();
4250 goto finish;
6b9132a9 4251 }
88213476 4252
1b9e5b12
LP
/* A negative return value of setup_image() is an error code. On success image_fd and
 * loop_nr are what loop_remove() is given at finish.
 * NOTE(review): presumably this attaches the image to a loop device and device_path names
 * it; confirm in setup_image(). */
4253 image_fd = setup_image(&device_path, &loop_nr);
4254 if (image_fd < 0) {
4255 r = image_fd;
842f3b0f
LP
4256 goto finish;
4257 }
1b9e5b12 4258
4d9f07b4
LP
/* Collects the root/home/srv/esp device paths, their read-write flags and the "secondary"
 * flag; all of these are only passed on to run() below. */
4259 r = dissect_image(image_fd,
4260 &root_device, &root_device_rw,
4261 &home_device, &home_device_rw,
4262 &srv_device, &srv_device_rw,
a6bc7db9 4263 &esp_device,
4d9f07b4 4264 &secondary);
1b9e5b12
LP
4265 if (r < 0)
4266 goto finish;
0f3be6ca
LP
4267
4268 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
/* If the early unlink succeeds remove_image is cleared, so finish will not try again; if
 * it fails the flag stays set and finish retries (and warns on failure). */
4269 if (remove_image && unlink(arg_image) >= 0)
4270 remove_image = false;
842f3b0f 4271 }
842f3b0f 4272
86c0dd4a 4273 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
4274 if (r < 0)
4275 goto finish;
4276
03cfe0d5
LP
/* Only considered interactive if both stdin and stdout are ttys. Note that interactive
 * has no initializer; it is first assigned here and only read in the run() call below. */
4277 interactive =
4278 isatty(STDIN_FILENO) > 0 &&
4279 isatty(STDOUT_FILENO) > 0;
9c857b9d 4280
db7feb7e
LP
/* Allocate the pty master. console receives the tty name looked up by ptsname_malloc();
 * both are handed to run(). */
4281 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4282 if (master < 0) {
ec16945e 4283 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4284 goto finish;
4285 }
4286
611b312b
LP
4287 r = ptsname_malloc(master, &console);
4288 if (r < 0) {
4289 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4290 goto finish;
68b02049
DW
4291 }
4292
/* No message is logged here on failure. NOTE(review): presumably mac_selinux_apply() logs
 * on its own; confirm. */
4293 if (arg_selinux_apifs_context) {
4294 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4295 if (r < 0)
4296 goto finish;
a258bf26
LP
4297 }
4298
a258bf26 4299 if (unlockpt(master) < 0) {
ec16945e 4300 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4301 goto finish;
4302 }
4303
9c857b9d
LP
4304 if (!arg_quiet)
4305 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4306 arg_machine, arg_image ?: arg_directory);
4307
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT before anything is spawned. */
72c0a2c2 4308 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4309
03cfe0d5
LP
4310 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4311 r = log_error_errno(errno, "Failed to become subreaper: %m");
4312 goto finish;
4313 }
4314
/* Main loop: run() returns > 0 when it wants to be called again ("loop again" at its
 * end), 0 when it is done and ret holds the exit status, and < 0 on error. */
d87be9b0 4315 for (;;) {
b0067625
ZJS
4316 r = run(master,
4317 console,
4318 root_device, root_device_rw,
4319 home_device, home_device_rw,
4320 srv_device, srv_device_rw,
4321 esp_device,
4322 interactive, secondary,
4323 fds,
4324 veth_name, &veth_created,
4325 &exposed,
4326 &pid, &ret);
4327 if (r <= 0)
d87be9b0 4328 break;
d87be9b0 4329 }
88213476
LP
4330
/* Common exit path, reached both after the loop and from every early failure above;
 * everything below therefore has to cope with partially completed setup. */
4331finish:
/* "Restarting..." is only reported when run() finished cleanly (r == 0) and left
 * EXIT_FORCE_RESTART in ret; all other cases report "Terminating...". */
af4ec430 4332 sd_notify(false,
2a49b612
ZJS
4333 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4334 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4335
/* pid is reset to 0 by run() once it has waited for the container, so it is only still
 * set here if run() returned before getting that far. */
9444b1f2 4336 if (pid > 0)
c67b0082 4337 (void) kill(pid, SIGKILL);
88213476 4338
503546da 4339 /* Try to flush whatever is still queued in the pty */
6a0f896b 4340 if (master >= 0) {
59f448cf 4341 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
6a0f896b
LP
4342 master = safe_close(master);
4343 }
4344
4345 if (pid > 0)
4346 (void) wait_for_terminate(pid, NULL);
503546da 4347
03cfe0d5
LP
/* Called unconditionally; loop_nr is still -1 and image_fd still -1 if no image was set
 * up. NOTE(review): presumably loop_remove() treats that as nothing to do; confirm. */
4348 loop_remove(loop_nr, &image_fd);
4349
/* Delete the ephemeral directory snapshot created above; failure is only a warning. */
17cbb288 4350 if (remove_directory && arg_directory) {
ec16945e
LP
4351 int k;
4352
17cbb288 4353 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4354 if (k < 0)
17cbb288 4355 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4356 }
4357
0f3be6ca
LP
/* Only still set if the ephemeral image copy could not be unlinked earlier, or if we
 * bailed out before that point. */
4358 if (remove_image && arg_image) {
4359 if (unlink(arg_image) < 0)
4360 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4361 }
4362
c67b0082
LP
4363 if (remove_tmprootdir) {
4364 if (rmdir(tmprootdir) < 0)
4365 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4366 }
4367
785890ac
LP
4368 if (arg_machine) {
4369 const char *p;
4370
63c372cb 4371 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4372 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4373 }
4374
7a8f6325 4375 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4376
/* The veth links are only removed if run() reported them as created; remove_bridge() by
 * contrast is called unconditionally.
 * NOTE(review): presumably remove_bridge() copes with an unset arg_network_zone; confirm. */
4377 if (veth_created)
4378 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4379 (void) remove_bridge(arg_network_zone);
f757855e 4380
/* Release the memory held by the global argument variables. */
04d391da 4381 free(arg_directory);
ec16945e
LP
4382 free(arg_template);
4383 free(arg_image);
7027ff61 4384 free(arg_machine);
c74e630d 4385 free(arg_user);
5f932eb9 4386 free(arg_chdir);
c74e630d 4387 strv_free(arg_setenv);
f757855e 4388 free(arg_network_bridge);
c74e630d
LP
4389 strv_free(arg_network_interfaces);
4390 strv_free(arg_network_macvlan);
4bbfe7ad 4391 strv_free(arg_network_ipvlan);
f6d6bad1 4392 strv_free(arg_network_veth_extra);
f757855e
LP
4393 strv_free(arg_parameters);
4394 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4395 expose_port_free_all(arg_expose_ports);
6d0b55c2 4396
/* Any negative r becomes EXIT_FAILURE; otherwise exit with the status stored in ret. */
ec16945e 4397 return r < 0 ? EXIT_FAILURE : ret;
88213476 4398}