]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: check env var first, detect second
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
8fe0087e 60#include "formats-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
0e7ac751 103/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
0e7ac751
LP
106#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 108
9c1e04d0
AP
/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the
 * container, and the init process inside the container can send messages to nspawn over it, following the
 * sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
113cea80
DH
/* How a container run ended. NOTE(review): the consumers of this enum are outside this part of the file;
 * the per-value descriptions below are inferred from the names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* presumably: the container exited for good */
        CONTAINER_REBOOTED      /* presumably: the container requested a reboot */
} ContainerStatus;
118
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-guest"/"try-host" spellings map to
 * LINK_GUEST/LINK_HOST as well and are distinguished by the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host or try-host */
        LINK_GUEST      /* --link-journal=guest or try-guest, also -j */
} LinkJournal;
88213476
LP
125
126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
687d0825 129static char *arg_user = NULL;
9444b1f2 130static sd_id128_t arg_uuid = {};
7027ff61 131static char *arg_machine = NULL;
c74e630d
LP
132static const char *arg_selinux_context = NULL;
133static const char *arg_selinux_apifs_context = NULL;
9444b1f2 134static const char *arg_slice = NULL;
ff01d048 135static bool arg_private_network = false;
bc2f673e 136static bool arg_read_only = false;
7732f92b 137static StartMode arg_start_mode = START_PID1;
ec16945e 138static bool arg_ephemeral = false;
57fb9fb5 139static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 140static bool arg_link_journal_try = false;
520e0d54 141static uint64_t arg_caps_retain =
50b52222
LP
142 (1ULL << CAP_AUDIT_CONTROL) |
143 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
144 (1ULL << CAP_CHOWN) |
145 (1ULL << CAP_DAC_OVERRIDE) |
146 (1ULL << CAP_DAC_READ_SEARCH) |
147 (1ULL << CAP_FOWNER) |
148 (1ULL << CAP_FSETID) |
149 (1ULL << CAP_IPC_OWNER) |
150 (1ULL << CAP_KILL) |
151 (1ULL << CAP_LEASE) |
152 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 153 (1ULL << CAP_MKNOD) |
5076f0cc
LP
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
5076f0cc 157 (1ULL << CAP_SETFCAP) |
50b52222 158 (1ULL << CAP_SETGID) |
5076f0cc
LP
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
50b52222 162 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
163 (1ULL << CAP_SYS_CHROOT) |
164 (1ULL << CAP_SYS_NICE) |
165 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 166 (1ULL << CAP_SYS_RESOURCE) |
50b52222 167 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
168static CustomMount *arg_custom_mounts = NULL;
169static unsigned arg_n_custom_mounts = 0;
f4889f65 170static char **arg_setenv = NULL;
284c0b91 171static bool arg_quiet = false;
eb91eb18 172static bool arg_register = true;
89f7c846 173static bool arg_keep_unit = false;
aa28aefe 174static char **arg_network_interfaces = NULL;
c74e630d 175static char **arg_network_macvlan = NULL;
4bbfe7ad 176static char **arg_network_ipvlan = NULL;
69c79d3c 177static bool arg_network_veth = false;
f6d6bad1 178static char **arg_network_veth_extra = NULL;
f757855e 179static char *arg_network_bridge = NULL;
22b28dfd 180static char *arg_network_zone = NULL;
050f7277 181static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 182static char *arg_image = NULL;
f757855e 183static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 184static ExposePort *arg_expose_ports = NULL;
f36933fe 185static char **arg_property = NULL;
0de7acce 186static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 187static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 188static bool arg_userns_chown = false;
c6c8f6e2 189static int arg_kill_signal = 0;
5da38d07 190static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
191static SettingsMask arg_settings_mask = 0;
192static int arg_settings_trusted = -1;
193static char **arg_parameters = NULL;
6aadfa4c 194static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 195static bool arg_notify_ready = false;
5a8ff0e6 196static bool arg_use_cgns = true;
0c582db0 197static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
88213476 198
/* Prints the usage summary to stdout, prefixed with the name this program was invoked under
 * (program_invocation_short_name). Option descriptions are aligned at a fixed column; switches whose
 * spelling is too long for that get their description on the following line(s). */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -a --as-pid2              Maintain a stub init as PID1, invoke binary as PID2\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "     --chdir=PATH           Set working directory in the container\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "  -U --private-users=pick   Run within user namespace, autoselect UID/GID range\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Similar, but with user configured UID/GID range\n"
               /* Must match the long option name registered in parse_argv(): "private-users-chown" */
               "     --private-users-chown  Adjust OS tree ownership to private UID/GID range\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create a ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual Ethernet connection between host\n"
               "                            and container\n"
               "     --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               "                            Add an additional virtual Ethernet link between\n"
               "                            host and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual Ethernet connection to the container\n"
               "                            and attach it to an existing bridge on the host\n"
               "     --network-zone=NAME    Similar, but attach the new interface to an\n"
               "                            automatically managed bridge interface\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest,\n"
               "                            host, try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH[:OPTIONS]]\n"
               "                            Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               "                            Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --overlay=PATH[:PATH...]:PATH\n"
               "                            Create an overlay mount from the host to\n"
               "                            the container\n"
               "     --overlay-ro=PATH[:PATH...]:PATH\n"
               "                            Similar, but creates a read-only overlay mount\n"
               "  -E --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               "     --settings=BOOLEAN     Load additional settings from .nspawn file\n"
               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
               , program_invocation_short_name);
}
279
5a8af538
LP
280static int custom_mounts_prepare(void) {
281 unsigned i;
282 int r;
283
284 /* Ensure the mounts are applied prefix first. */
285 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
286
287 /* Allocate working directories for the overlay file systems that need it */
288 for (i = 0; i < arg_n_custom_mounts; i++) {
289 CustomMount *m = &arg_custom_mounts[i];
290
0de7acce 291 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
292
293 if (arg_userns_chown) {
294 log_error("--private-users-chown may not be combined with custom root mounts.");
295 return -EINVAL;
296 } else if (arg_uid_shift == UID_INVALID) {
297 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
298 return -EINVAL;
299 }
825d5287
RM
300 }
301
5a8af538
LP
302 if (m->type != CUSTOM_MOUNT_OVERLAY)
303 continue;
304
305 if (m->work_dir)
306 continue;
307
308 if (m->read_only)
309 continue;
310
14bcf25c 311 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
312 if (r < 0)
313 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
314 }
315
316 return 0;
317}
318
efdb0237
LP
319static int detect_unified_cgroup_hierarchy(void) {
320 const char *e;
5da38d07
TH
321 int r, all_unified, systemd_unified;
322
efdb0237
LP
323 /* Allow the user to control whether the unified hierarchy is used */
324 e = getenv("UNIFIED_CGROUP_HIERARCHY");
325 if (e) {
326 r = parse_boolean(e);
327 if (r < 0)
328 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
329 if (r > 0)
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
331 else
332 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 333
efdb0237
LP
334 return 0;
335 }
336
98afd6af
ZJS
337 all_unified = cg_all_unified();
338 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
339
340 if (all_unified < 0 || systemd_unified < 0)
341 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
342 "Failed to determine whether the unified cgroups hierarchy is used: %m");
343
efdb0237 344 /* Otherwise inherit the default from the host system */
5da38d07
TH
345 if (all_unified > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
347 else if (systemd_unified > 0)
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
349 else
350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 351
efdb0237
LP
352 return 0;
353}
354
0c582db0
LB
355static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
356 int r;
357
358 r = getenv_bool(name);
359 if (r == -ENXIO)
360 return;
361 if (r < 0)
362 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
363 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
364}
365
88213476
LP
366static int parse_argv(int argc, char *argv[]) {
367
a41fe3a2 368 enum {
acbeb427
ZJS
369 ARG_VERSION = 0x100,
370 ARG_PRIVATE_NETWORK,
bc2f673e 371 ARG_UUID,
5076f0cc 372 ARG_READ_ONLY,
57fb9fb5 373 ARG_CAPABILITY,
420c7379 374 ARG_DROP_CAPABILITY,
17fe0523
LP
375 ARG_LINK_JOURNAL,
376 ARG_BIND,
f4889f65 377 ARG_BIND_RO,
06c17c39 378 ARG_TMPFS,
5a8af538
LP
379 ARG_OVERLAY,
380 ARG_OVERLAY_RO,
eb91eb18 381 ARG_SHARE_SYSTEM,
89f7c846 382 ARG_REGISTER,
aa28aefe 383 ARG_KEEP_UNIT,
69c79d3c 384 ARG_NETWORK_INTERFACE,
c74e630d 385 ARG_NETWORK_MACVLAN,
4bbfe7ad 386 ARG_NETWORK_IPVLAN,
ab046dde 387 ARG_NETWORK_BRIDGE,
22b28dfd 388 ARG_NETWORK_ZONE,
f6d6bad1 389 ARG_NETWORK_VETH_EXTRA,
6afc95b7 390 ARG_PERSONALITY,
4d9f07b4 391 ARG_VOLATILE,
ec16945e 392 ARG_TEMPLATE,
f36933fe 393 ARG_PROPERTY,
6dac160c 394 ARG_PRIVATE_USERS,
c6c8f6e2 395 ARG_KILL_SIGNAL,
f757855e 396 ARG_SETTINGS,
5f932eb9 397 ARG_CHDIR,
7336138e 398 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 399 ARG_NOTIFY_READY,
a41fe3a2
LP
400 };
401
88213476 402 static const struct option options[] = {
aa28aefe
LP
403 { "help", no_argument, NULL, 'h' },
404 { "version", no_argument, NULL, ARG_VERSION },
405 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
406 { "template", required_argument, NULL, ARG_TEMPLATE },
407 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
408 { "user", required_argument, NULL, 'u' },
409 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
7732f92b 410 { "as-pid2", no_argument, NULL, 'a' },
aa28aefe
LP
411 { "boot", no_argument, NULL, 'b' },
412 { "uuid", required_argument, NULL, ARG_UUID },
413 { "read-only", no_argument, NULL, ARG_READ_ONLY },
414 { "capability", required_argument, NULL, ARG_CAPABILITY },
415 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
416 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
417 { "bind", required_argument, NULL, ARG_BIND },
418 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 419 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
420 { "overlay", required_argument, NULL, ARG_OVERLAY },
421 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
422 { "machine", required_argument, NULL, 'M' },
423 { "slice", required_argument, NULL, 'S' },
a5f1cb3b 424 { "setenv", required_argument, NULL, 'E' },
aa28aefe
LP
425 { "selinux-context", required_argument, NULL, 'Z' },
426 { "selinux-apifs-context", required_argument, NULL, 'L' },
427 { "quiet", no_argument, NULL, 'q' },
a6b5216c 428 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
aa28aefe
LP
429 { "register", required_argument, NULL, ARG_REGISTER },
430 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
431 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 432 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 433 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 434 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 435 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 436 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
22b28dfd 437 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
6afc95b7 438 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 439 { "image", required_argument, NULL, 'i' },
4d9f07b4 440 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 441 { "port", required_argument, NULL, 'p' },
f36933fe 442 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 443 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
7336138e 444 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
c6c8f6e2 445 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 446 { "settings", required_argument, NULL, ARG_SETTINGS },
5f932eb9 447 { "chdir", required_argument, NULL, ARG_CHDIR },
9c1e04d0 448 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 449 {}
88213476
LP
450 };
451
9444b1f2 452 int c, r;
6aadfa4c 453 const char *p, *e;
a42c8b54 454 uint64_t plus = 0, minus = 0;
f757855e 455 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
456
457 assert(argc >= 0);
458 assert(argv);
459
19aac838 460 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
461
462 switch (c) {
463
464 case 'h':
601185b4
ZJS
465 help();
466 return 0;
88213476 467
acbeb427 468 case ARG_VERSION:
3f6fd1ba 469 return version();
acbeb427 470
88213476 471 case 'D':
0f03c2a4 472 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 473 if (r < 0)
0f03c2a4 474 return r;
ec16945e
LP
475 break;
476
477 case ARG_TEMPLATE:
0f03c2a4 478 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 479 if (r < 0)
0f03c2a4 480 return r;
88213476
LP
481 break;
482
1b9e5b12 483 case 'i':
0f03c2a4 484 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 485 if (r < 0)
0f03c2a4 486 return r;
ec16945e
LP
487 break;
488
489 case 'x':
490 arg_ephemeral = true;
1b9e5b12
LP
491 break;
492
687d0825 493 case 'u':
2fc09a9c
DM
494 r = free_and_strdup(&arg_user, optarg);
495 if (r < 0)
7027ff61 496 return log_oom();
687d0825 497
f757855e 498 arg_settings_mask |= SETTING_USER;
687d0825
MV
499 break;
500
22b28dfd
LP
501 case ARG_NETWORK_ZONE: {
502 char *j;
503
504 j = strappend("vz-", optarg);
505 if (!j)
506 return log_oom();
507
508 if (!ifname_valid(j)) {
509 log_error("Network zone name not valid: %s", j);
510 free(j);
511 return -EINVAL;
512 }
513
514 free(arg_network_zone);
515 arg_network_zone = j;
516
517 arg_network_veth = true;
518 arg_private_network = true;
519 arg_settings_mask |= SETTING_NETWORK;
520 break;
521 }
522
ab046dde 523 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
524
525 if (!ifname_valid(optarg)) {
526 log_error("Bridge interface name not valid: %s", optarg);
527 return -EINVAL;
528 }
529
f757855e
LP
530 r = free_and_strdup(&arg_network_bridge, optarg);
531 if (r < 0)
532 return log_oom();
ab046dde
TG
533
534 /* fall through */
535
0dfaa006 536 case 'n':
69c79d3c
LP
537 arg_network_veth = true;
538 arg_private_network = true;
f757855e 539 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
540 break;
541
f6d6bad1
LP
542 case ARG_NETWORK_VETH_EXTRA:
543 r = veth_extra_parse(&arg_network_veth_extra, optarg);
544 if (r < 0)
545 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
546
547 arg_private_network = true;
548 arg_settings_mask |= SETTING_NETWORK;
549 break;
550
aa28aefe 551 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
552
553 if (!ifname_valid(optarg)) {
554 log_error("Network interface name not valid: %s", optarg);
555 return -EINVAL;
556 }
557
c74e630d
LP
558 if (strv_extend(&arg_network_interfaces, optarg) < 0)
559 return log_oom();
560
561 arg_private_network = true;
f757855e 562 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
563 break;
564
565 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
566
567 if (!ifname_valid(optarg)) {
568 log_error("MACVLAN network interface name not valid: %s", optarg);
569 return -EINVAL;
570 }
571
c74e630d 572 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
573 return log_oom();
574
4bbfe7ad 575 arg_private_network = true;
f757855e 576 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
577 break;
578
579 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
580
581 if (!ifname_valid(optarg)) {
582 log_error("IPVLAN network interface name not valid: %s", optarg);
583 return -EINVAL;
584 }
585
4bbfe7ad
TG
586 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
587 return log_oom();
588
aa28aefe
LP
589 /* fall through */
590
ff01d048
LP
591 case ARG_PRIVATE_NETWORK:
592 arg_private_network = true;
f757855e 593 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
594 break;
595
0f0dbc46 596 case 'b':
7732f92b
LP
597 if (arg_start_mode == START_PID2) {
598 log_error("--boot and --as-pid2 may not be combined.");
599 return -EINVAL;
600 }
601
602 arg_start_mode = START_BOOT;
603 arg_settings_mask |= SETTING_START_MODE;
604 break;
605
606 case 'a':
607 if (arg_start_mode == START_BOOT) {
608 log_error("--boot and --as-pid2 may not be combined.");
609 return -EINVAL;
610 }
611
612 arg_start_mode = START_PID2;
613 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
614 break;
615
144f0fc0 616 case ARG_UUID:
9444b1f2 617 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
618 if (r < 0)
619 return log_error_errno(r, "Invalid UUID: %s", optarg);
620
621 if (sd_id128_is_null(arg_uuid)) {
622 log_error("Machine UUID may not be all zeroes.");
623 return -EINVAL;
aa96c6cb 624 }
f757855e
LP
625
626 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 627 break;
aa96c6cb 628
9444b1f2 629 case 'S':
c74e630d 630 arg_slice = optarg;
144f0fc0
LP
631 break;
632
7027ff61 633 case 'M':
c1521918 634 if (isempty(optarg))
97b11eed 635 arg_machine = mfree(arg_machine);
c1521918 636 else {
0c3c4284 637 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
638 log_error("Invalid machine name: %s", optarg);
639 return -EINVAL;
640 }
7027ff61 641
0c3c4284
LP
642 r = free_and_strdup(&arg_machine, optarg);
643 if (r < 0)
eb91eb18
LP
644 return log_oom();
645
646 break;
647 }
7027ff61 648
82adf6af
LP
649 case 'Z':
650 arg_selinux_context = optarg;
a8828ed9
DW
651 break;
652
82adf6af
LP
653 case 'L':
654 arg_selinux_apifs_context = optarg;
a8828ed9
DW
655 break;
656
bc2f673e
LP
657 case ARG_READ_ONLY:
658 arg_read_only = true;
f757855e 659 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
660 break;
661
420c7379
LP
662 case ARG_CAPABILITY:
663 case ARG_DROP_CAPABILITY: {
6cbe4ed1 664 p = optarg;
9ed794a3 665 for (;;) {
6cbe4ed1 666 _cleanup_free_ char *t = NULL;
5076f0cc 667
6cbe4ed1
SS
668 r = extract_first_word(&p, &t, ",", 0);
669 if (r < 0)
670 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 671
6cbe4ed1
SS
672 if (r == 0)
673 break;
5076f0cc 674
39ed67d1
LP
675 if (streq(t, "all")) {
676 if (c == ARG_CAPABILITY)
a42c8b54 677 plus = (uint64_t) -1;
39ed67d1 678 else
a42c8b54 679 minus = (uint64_t) -1;
39ed67d1 680 } else {
2822da4f
LP
681 int cap;
682
683 cap = capability_from_name(t);
684 if (cap < 0) {
39ed67d1
LP
685 log_error("Failed to parse capability %s.", t);
686 return -EINVAL;
687 }
688
689 if (c == ARG_CAPABILITY)
a42c8b54 690 plus |= 1ULL << (uint64_t) cap;
39ed67d1 691 else
a42c8b54 692 minus |= 1ULL << (uint64_t) cap;
5076f0cc 693 }
5076f0cc
LP
694 }
695
f757855e 696 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
697 break;
698 }
699
57fb9fb5
LP
700 case 'j':
701 arg_link_journal = LINK_GUEST;
574edc90 702 arg_link_journal_try = true;
57fb9fb5
LP
703 break;
704
705 case ARG_LINK_JOURNAL:
53e438e3 706 if (streq(optarg, "auto")) {
57fb9fb5 707 arg_link_journal = LINK_AUTO;
53e438e3
LP
708 arg_link_journal_try = false;
709 } else if (streq(optarg, "no")) {
57fb9fb5 710 arg_link_journal = LINK_NO;
53e438e3
LP
711 arg_link_journal_try = false;
712 } else if (streq(optarg, "guest")) {
57fb9fb5 713 arg_link_journal = LINK_GUEST;
53e438e3
LP
714 arg_link_journal_try = false;
715 } else if (streq(optarg, "host")) {
57fb9fb5 716 arg_link_journal = LINK_HOST;
53e438e3
LP
717 arg_link_journal_try = false;
718 } else if (streq(optarg, "try-guest")) {
574edc90
MP
719 arg_link_journal = LINK_GUEST;
720 arg_link_journal_try = true;
721 } else if (streq(optarg, "try-host")) {
722 arg_link_journal = LINK_HOST;
723 arg_link_journal_try = true;
724 } else {
57fb9fb5
LP
725 log_error("Failed to parse link journal mode %s", optarg);
726 return -EINVAL;
727 }
728
729 break;
730
17fe0523 731 case ARG_BIND:
f757855e
LP
732 case ARG_BIND_RO:
733 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
734 if (r < 0)
735 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 736
f757855e 737 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 738 break;
06c17c39 739
f757855e
LP
740 case ARG_TMPFS:
741 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
742 if (r < 0)
743 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 744
f757855e 745 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 746 break;
5a8af538
LP
747
748 case ARG_OVERLAY:
749 case ARG_OVERLAY_RO: {
750 _cleanup_free_ char *upper = NULL, *destination = NULL;
751 _cleanup_strv_free_ char **lower = NULL;
752 CustomMount *m;
753 unsigned n = 0;
754 char **i;
755
62f9f39a
RM
756 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
757 if (r == -ENOMEM)
06c17c39 758 return log_oom();
62f9f39a
RM
759 else if (r < 0) {
760 log_error("Invalid overlay specification: %s", optarg);
761 return r;
762 }
06c17c39 763
5a8af538
LP
764 STRV_FOREACH(i, lower) {
765 if (!path_is_absolute(*i)) {
766 log_error("Overlay path %s is not absolute.", *i);
767 return -EINVAL;
768 }
769
770 n++;
771 }
772
773 if (n < 2) {
774 log_error("--overlay= needs at least two colon-separated directories specified.");
775 return -EINVAL;
776 }
777
778 if (n == 2) {
779 /* If two parameters are specified,
780 * the first one is the lower, the
781 * second one the upper directory. And
af86c440
ZJS
782 * we'll also define the destination
783 * mount point the same as the upper. */
5a8af538
LP
784 upper = lower[1];
785 lower[1] = NULL;
786
787 destination = strdup(upper);
788 if (!destination)
789 return log_oom();
790
791 } else {
792 upper = lower[n - 2];
793 destination = lower[n - 1];
794 lower[n - 2] = NULL;
795 }
796
f757855e 797 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
798 if (!m)
799 return log_oom();
800
801 m->destination = destination;
802 m->source = upper;
803 m->lower = lower;
804 m->read_only = c == ARG_OVERLAY_RO;
805
806 upper = destination = NULL;
807 lower = NULL;
06c17c39 808
f757855e 809 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
810 break;
811 }
812
a5f1cb3b 813 case 'E': {
f4889f65
LP
814 char **n;
815
816 if (!env_assignment_is_valid(optarg)) {
817 log_error("Environment variable assignment '%s' is not valid.", optarg);
818 return -EINVAL;
819 }
820
821 n = strv_env_set(arg_setenv, optarg);
822 if (!n)
823 return log_oom();
824
825 strv_free(arg_setenv);
826 arg_setenv = n;
f757855e
LP
827
828 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
829 break;
830 }
831
284c0b91
LP
832 case 'q':
833 arg_quiet = true;
834 break;
835
8a96d94e 836 case ARG_SHARE_SYSTEM:
a6b5216c 837 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
838 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
839 arg_clone_ns_flags = 0;
8a96d94e
LP
840 break;
841
eb91eb18
LP
842 case ARG_REGISTER:
843 r = parse_boolean(optarg);
844 if (r < 0) {
845 log_error("Failed to parse --register= argument: %s", optarg);
846 return r;
847 }
848
849 arg_register = r;
850 break;
851
89f7c846
LP
852 case ARG_KEEP_UNIT:
853 arg_keep_unit = true;
854 break;
855
6afc95b7
LP
856 case ARG_PERSONALITY:
857
ac45f971 858 arg_personality = personality_from_string(optarg);
050f7277 859 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
860 log_error("Unknown or unsupported personality '%s'.", optarg);
861 return -EINVAL;
862 }
863
f757855e 864 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
865 break;
866
4d9f07b4
LP
867 case ARG_VOLATILE:
868
869 if (!optarg)
f757855e 870 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 871 else {
f757855e 872 VolatileMode m;
4d9f07b4 873
f757855e
LP
874 m = volatile_mode_from_string(optarg);
875 if (m < 0) {
876 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 877 return -EINVAL;
f757855e
LP
878 } else
879 arg_volatile_mode = m;
6d0b55c2
LP
880 }
881
f757855e
LP
882 arg_settings_mask |= SETTING_VOLATILE_MODE;
883 break;
6d0b55c2 884
f757855e
LP
885 case 'p':
886 r = expose_port_parse(&arg_expose_ports, optarg);
887 if (r == -EEXIST)
888 return log_error_errno(r, "Duplicate port specification: %s", optarg);
889 if (r < 0)
890 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 891
f757855e 892 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 893 break;
6d0b55c2 894
f36933fe
LP
895 case ARG_PROPERTY:
896 if (strv_extend(&arg_property, optarg) < 0)
897 return log_oom();
898
899 break;
900
6dac160c 901 case ARG_PRIVATE_USERS:
0de7acce
LP
902
903 r = optarg ? parse_boolean(optarg) : 1;
904 if (r == 0) {
905 /* no: User namespacing off */
906 arg_userns_mode = USER_NAMESPACE_NO;
907 arg_uid_shift = UID_INVALID;
908 arg_uid_range = UINT32_C(0x10000);
909 } else if (r > 0) {
910 /* yes: User namespacing on, UID range is read from root dir */
911 arg_userns_mode = USER_NAMESPACE_FIXED;
912 arg_uid_shift = UID_INVALID;
913 arg_uid_range = UINT32_C(0x10000);
914 } else if (streq(optarg, "pick")) {
915 /* pick: User namespacing on, UID range is picked randomly */
916 arg_userns_mode = USER_NAMESPACE_PICK;
917 arg_uid_shift = UID_INVALID;
918 arg_uid_range = UINT32_C(0x10000);
919 } else {
6dac160c
LP
920 _cleanup_free_ char *buffer = NULL;
921 const char *range, *shift;
922
0de7acce
LP
923 /* anything else: User namespacing on, UID range is explicitly configured */
924
6dac160c
LP
925 range = strchr(optarg, ':');
926 if (range) {
927 buffer = strndup(optarg, range - optarg);
928 if (!buffer)
929 return log_oom();
930 shift = buffer;
931
932 range++;
933 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
934 log_error("Failed to parse UID range: %s", range);
935 return -EINVAL;
936 }
937 } else
938 shift = optarg;
939
940 if (parse_uid(shift, &arg_uid_shift) < 0) {
941 log_error("Failed to parse UID: %s", optarg);
942 return -EINVAL;
943 }
0de7acce
LP
944
945 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
946 }
947
0de7acce 948 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
949 break;
950
0de7acce 951 case 'U':
ccabee0d
LP
952 if (userns_supported()) {
953 arg_userns_mode = USER_NAMESPACE_PICK;
954 arg_uid_shift = UID_INVALID;
955 arg_uid_range = UINT32_C(0x10000);
956
957 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
958 }
959
7336138e
LP
960 break;
961
0de7acce 962 case ARG_PRIVATE_USERS_CHOWN:
19aac838 963 arg_userns_chown = true;
0de7acce
LP
964
965 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
966 break;
967
c6c8f6e2
LP
968 case ARG_KILL_SIGNAL:
969 arg_kill_signal = signal_from_string_try_harder(optarg);
970 if (arg_kill_signal < 0) {
971 log_error("Cannot parse signal: %s", optarg);
972 return -EINVAL;
973 }
974
f757855e
LP
975 arg_settings_mask |= SETTING_KILL_SIGNAL;
976 break;
977
978 case ARG_SETTINGS:
979
980 /* no → do not read files
981 * yes → read files, do not override cmdline, trust only subset
982 * override → read files, override cmdline, trust only subset
983 * trusted → read files, do not override cmdline, trust all
984 */
985
986 r = parse_boolean(optarg);
987 if (r < 0) {
988 if (streq(optarg, "trusted")) {
989 mask_all_settings = false;
990 mask_no_settings = false;
991 arg_settings_trusted = true;
992
993 } else if (streq(optarg, "override")) {
994 mask_all_settings = false;
995 mask_no_settings = true;
996 arg_settings_trusted = -1;
997 } else
998 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
999 } else if (r > 0) {
1000 /* yes */
1001 mask_all_settings = false;
1002 mask_no_settings = false;
1003 arg_settings_trusted = -1;
1004 } else {
1005 /* no */
1006 mask_all_settings = true;
1007 mask_no_settings = false;
1008 arg_settings_trusted = false;
1009 }
1010
c6c8f6e2
LP
1011 break;
1012
5f932eb9
LP
1013 case ARG_CHDIR:
1014 if (!path_is_absolute(optarg)) {
1015 log_error("Working directory %s is not an absolute path.", optarg);
1016 return -EINVAL;
1017 }
1018
1019 r = free_and_strdup(&arg_chdir, optarg);
1020 if (r < 0)
1021 return log_oom();
1022
1023 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1024 break;
1025
9c1e04d0
AP
1026 case ARG_NOTIFY_READY:
1027 r = parse_boolean(optarg);
1028 if (r < 0) {
1029 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1030 return -EINVAL;
1031 }
1032 arg_notify_ready = r;
1033 arg_settings_mask |= SETTING_NOTIFY_READY;
1034 break;
1035
88213476
LP
1036 case '?':
1037 return -EINVAL;
1038
1039 default:
eb9da376 1040 assert_not_reached("Unhandled option");
88213476 1041 }
88213476 1042
0c582db0
LB
1043 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1044 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1045 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1046 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1047
48a8d337
LB
1048 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1049 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1050 arg_register = false;
0c582db0
LB
1051 if (arg_start_mode != START_PID1) {
1052 log_error("--boot cannot be used without namespacing.");
1053 return -EINVAL;
1054 }
1055 }
eb91eb18 1056
0de7acce 1057 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1058 arg_userns_chown = true;
1059
89f7c846
LP
1060 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1061 log_error("--keep-unit may not be used when invoked from a user session.");
1062 return -EINVAL;
1063 }
1064
1b9e5b12
LP
1065 if (arg_directory && arg_image) {
1066 log_error("--directory= and --image= may not be combined.");
1067 return -EINVAL;
1068 }
1069
ec16945e
LP
1070 if (arg_template && arg_image) {
1071 log_error("--template= and --image= may not be combined.");
1072 return -EINVAL;
1073 }
1074
1075 if (arg_template && !(arg_directory || arg_machine)) {
1076 log_error("--template= needs --directory= or --machine=.");
1077 return -EINVAL;
1078 }
1079
1080 if (arg_ephemeral && arg_template) {
1081 log_error("--ephemeral and --template= may not be combined.");
1082 return -EINVAL;
1083 }
1084
1085 if (arg_ephemeral && arg_image) {
1086 log_error("--ephemeral and --image= may not be combined.");
1087 return -EINVAL;
1088 }
1089
df9a75e4
LP
1090 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1091 log_error("--ephemeral and --link-journal= may not be combined.");
1092 return -EINVAL;
1093 }
1094
ccabee0d 1095 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1096 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1097 return -EOPNOTSUPP;
1098 }
1099
1100 if (arg_userns_chown && arg_read_only) {
1101 log_error("--read-only and --private-users-chown may not be combined.");
1102 return -EINVAL;
1103 }
f757855e 1104
22b28dfd
LP
1105 if (arg_network_bridge && arg_network_zone) {
1106 log_error("--network-bridge= and --network-zone= may not be combined.");
1107 return -EINVAL;
1108 }
1109
f757855e
LP
1110 if (argc > optind) {
1111 arg_parameters = strv_copy(argv + optind);
1112 if (!arg_parameters)
1113 return log_oom();
1114
7732f92b 1115 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1116 }
1117
1118 /* Load all settings from .nspawn files */
1119 if (mask_no_settings)
1120 arg_settings_mask = 0;
1121
1122 /* Don't load any settings from .nspawn files */
1123 if (mask_all_settings)
1124 arg_settings_mask = _SETTINGS_MASK_ALL;
1125
520e0d54 1126 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e
LP
1127
1128 r = detect_unified_cgroup_hierarchy();
1129 if (r < 0)
1130 return r;
1131
6aadfa4c
ILG
1132 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1133 if (e)
1134 arg_container_service_name = e;
1135
5a8ff0e6
CB
1136 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1137 if (r < 0)
1138 arg_use_cgns = cg_ns_supported();
1139 else
1140 arg_use_cgns = r;
1141
f757855e
LP
1142 return 1;
1143}
1144
1145static int verify_arguments(void) {
1146
1147 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1148 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1149 return -EINVAL;
1150 }
1151
6d0b55c2
LP
1152 if (arg_expose_ports && !arg_private_network) {
1153 log_error("Cannot use --port= without private networking.");
1154 return -EINVAL;
1155 }
1156
1c1ea217
EV
1157#ifndef HAVE_LIBIPTC
1158 if (arg_expose_ports) {
1159 log_error("--port= is not supported, compiled without libiptc support.");
1160 return -EOPNOTSUPP;
1161 }
1162#endif
1163
7732f92b 1164 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1165 arg_kill_signal = SIGRTMIN+3;
1166
f757855e 1167 return 0;
88213476
LP
1168}
1169
03cfe0d5
LP
1170static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1171 assert(p);
1172
0de7acce 1173 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1174 return 0;
1175
1176 if (uid == UID_INVALID && gid == GID_INVALID)
1177 return 0;
1178
1179 if (uid != UID_INVALID) {
1180 uid += arg_uid_shift;
1181
1182 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1183 return -EOVERFLOW;
1184 }
1185
1186 if (gid != GID_INVALID) {
1187 gid += (gid_t) arg_uid_shift;
1188
1189 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1190 return -EOVERFLOW;
1191 }
1192
1193 if (lchown(p, uid, gid) < 0)
1194 return -errno;
b12afc8c
LP
1195
1196 return 0;
1197}
1198
03cfe0d5
LP
/* Creates directory `path` below `root` with the given mode and, if it was
 * newly created, hands it to the (shifted) uid/gid via userns_lchown().
 *
 * An already existing directory is left untouched and counts as success.
 * Returns 0 on success, negative errno-style on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        /* Pre-existing path: do not touch its ownership. */
        return errno == EEXIST ? 0 : -errno;
}
1211
e58a1277 1212static int setup_timezone(const char *dest) {
03cfe0d5
LP
1213 _cleanup_free_ char *p = NULL, *q = NULL;
1214 const char *where, *check, *what;
d4036145
LP
1215 char *z, *y;
1216 int r;
f8440af5 1217
e58a1277
LP
1218 assert(dest);
1219
1220 /* Fix the timezone, if possible */
d4036145
LP
1221 r = readlink_malloc("/etc/localtime", &p);
1222 if (r < 0) {
0b493a02
MP
1223 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1224 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1225 * with a symbolic link to a time zone data file.
0b493a02
MP
1226 *
1227 * Example:
21dc0227 1228 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1229 */
d4036145
LP
1230 return 0;
1231 }
1232
1233 z = path_startswith(p, "../usr/share/zoneinfo/");
1234 if (!z)
1235 z = path_startswith(p, "/usr/share/zoneinfo/");
1236 if (!z) {
1237 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1238 return 0;
1239 }
1240
03cfe0d5 1241 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1242 r = readlink_malloc(where, &q);
1243 if (r >= 0) {
1244 y = path_startswith(q, "../usr/share/zoneinfo/");
1245 if (!y)
1246 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1247
d4036145
LP
1248 /* Already pointing to the right place? Then do nothing .. */
1249 if (y && streq(y, z))
1250 return 0;
1251 }
1252
03cfe0d5 1253 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1254 check = prefix_roota(dest, check);
03cfe0d5 1255 if (laccess(check, F_OK) < 0) {
d4036145
LP
1256 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1257 return 0;
1258 }
68fb0892 1259
79d80fc1
TG
1260 r = unlink(where);
1261 if (r < 0 && errno != ENOENT) {
56f64d95 1262 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1263 return 0;
1264 }
4d9f07b4 1265
03cfe0d5 1266 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1267 if (symlink(what, where) < 0) {
56f64d95 1268 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1269 return 0;
1270 }
e58a1277 1271
03cfe0d5
LP
1272 r = userns_lchown(where, 0, 0);
1273 if (r < 0)
1274 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1275
e58a1277 1276 return 0;
88213476
LP
1277}
1278
2547bb41 1279static int setup_resolv_conf(const char *dest) {
03cfe0d5 1280 const char *where = NULL;
79d80fc1 1281 int r;
2547bb41
LP
1282
1283 assert(dest);
1284
1285 if (arg_private_network)
1286 return 0;
1287
1288 /* Fix resolv.conf, if possible */
03cfe0d5 1289 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1290
3539724c
LP
1291 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
1292 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1293 * container, so that the container can use the host's resolver. Given that network namespacing is
1294 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1295 * advantage that the container will be able to follow the host's DNS server configuration changes
1296 * transparently. */
1297
1298 if (mount("/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL) < 0)
1299 log_warning_errno(errno, "Failed to mount /etc/resolv.conf in the container, ignoring: %m");
1300 else {
1301 if (mount(NULL, where, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1302 return log_error_errno(errno, "Failed to remount /etc/resolv.conf read-only: %m");
1303
1304 return 0;
1305 }
1306 }
1307
1308 /* If that didn't work, let's copy the file */
f2068bcc 1309 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1310 if (r < 0) {
3539724c
LP
1311 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1312 * resolved or something similar runs inside and the symlink points there.
68a313c5 1313 *
3539724c 1314 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1315 */
1316 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1317 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1318 return 0;
1319 }
2547bb41 1320
03cfe0d5
LP
1321 r = userns_lchown(where, 0, 0);
1322 if (r < 0)
3539724c 1323 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1324
2547bb41
LP
1325 return 0;
1326}
1327
04bc4a3f 1328static int setup_boot_id(const char *dest) {
3bbaff3e 1329 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1330 const char *from, *to;
04bc4a3f
LP
1331 int r;
1332
04bc4a3f
LP
1333 /* Generate a new randomized boot ID, so that each boot-up of
1334 * the container gets a new one */
1335
03cfe0d5
LP
1336 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1337 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1338
1339 r = sd_id128_randomize(&rnd);
f647962d
MS
1340 if (r < 0)
1341 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1342
15b1248a 1343 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1344 if (r < 0)
1345 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1346
03cfe0d5
LP
1347 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1348 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1349 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
f7b7b3df 1350 r = log_error_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f 1351
3bbaff3e 1352 (void) unlink(from);
04bc4a3f
LP
1353 return r;
1354}
1355
e58a1277 1356static int copy_devnodes(const char *dest) {
88213476
LP
1357
1358 static const char devnodes[] =
1359 "null\0"
1360 "zero\0"
1361 "full\0"
1362 "random\0"
1363 "urandom\0"
85614d66
TG
1364 "tty\0"
1365 "net/tun\0";
88213476
LP
1366
1367 const char *d;
e58a1277 1368 int r = 0;
7fd1b19b 1369 _cleanup_umask_ mode_t u;
a258bf26
LP
1370
1371 assert(dest);
124640f1
LP
1372
1373 u = umask(0000);
88213476 1374
03cfe0d5
LP
1375 /* Create /dev/net, so that we can create /dev/net/tun in it */
1376 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1377 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1378
88213476 1379 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1380 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1381 struct stat st;
88213476 1382
7f112f50 1383 from = strappend("/dev/", d);
03cfe0d5 1384 to = prefix_root(dest, from);
88213476
LP
1385
1386 if (stat(from, &st) < 0) {
1387
4a62c710
MS
1388 if (errno != ENOENT)
1389 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1390
a258bf26 1391 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1392
03cfe0d5 1393 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1394 return -EIO;
a258bf26 1395
85614d66 1396 } else {
81f5049b 1397 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1398 /*
1399 * This is some sort of protection too against
1400 * recursive userns chown on shared /dev/
1401 */
1402 if (errno == EEXIST)
1403 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1404 if (errno != EPERM)
1405 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1406
1407 /* Some systems abusively restrict mknod but
1408 * allow bind mounts. */
1409 r = touch(to);
1410 if (r < 0)
1411 return log_error_errno(r, "touch (%s) failed: %m", to);
1412 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1413 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1414 }
6278cf60 1415
03cfe0d5
LP
1416 r = userns_lchown(to, 0, 0);
1417 if (r < 0)
1418 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1419 }
88213476
LP
1420 }
1421
e58a1277
LP
1422 return r;
1423}
88213476 1424
03cfe0d5
LP
1425static int setup_pts(const char *dest) {
1426 _cleanup_free_ char *options = NULL;
1427 const char *p;
709f6e46 1428 int r;
03cfe0d5
LP
1429
1430#ifdef HAVE_SELINUX
1431 if (arg_selinux_apifs_context)
1432 (void) asprintf(&options,
3dce8915 1433 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1434 arg_uid_shift + TTY_GID,
1435 arg_selinux_apifs_context);
1436 else
1437#endif
1438 (void) asprintf(&options,
3dce8915 1439 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1440 arg_uid_shift + TTY_GID);
f2d88580 1441
03cfe0d5 1442 if (!options)
f2d88580
LP
1443 return log_oom();
1444
03cfe0d5 1445 /* Mount /dev/pts itself */
cc9fce65 1446 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1447 if (mkdir(p, 0755) < 0)
1448 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1449 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1450 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1451 r = userns_lchown(p, 0, 0);
1452 if (r < 0)
1453 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1454
1455 /* Create /dev/ptmx symlink */
1456 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1457 if (symlink("pts/ptmx", p) < 0)
1458 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1459 r = userns_lchown(p, 0, 0);
1460 if (r < 0)
1461 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1462
03cfe0d5
LP
1463 /* And fix /dev/pts/ptmx ownership */
1464 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1465 r = userns_lchown(p, 0, 0);
1466 if (r < 0)
1467 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1468
f2d88580
LP
1469 return 0;
1470}
1471
e58a1277 1472static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1473 _cleanup_umask_ mode_t u;
1474 const char *to;
e58a1277 1475 int r;
e58a1277
LP
1476
1477 assert(dest);
1478 assert(console);
1479
1480 u = umask(0000);
1481
03cfe0d5 1482 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1483 if (r < 0)
1484 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1485
a258bf26
LP
1486 /* We need to bind mount the right tty to /dev/console since
1487 * ptys can only exist on pts file systems. To have something
81f5049b 1488 * to bind mount things on we create a empty regular file. */
a258bf26 1489
03cfe0d5 1490 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1491 r = touch(to);
1492 if (r < 0)
1493 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1494
4543768d 1495 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1496 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1497
25ea79fe 1498 return 0;
e58a1277
LP
1499}
1500
1501static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1502 const char *from, *to;
7fd1b19b 1503 _cleanup_umask_ mode_t u;
d9603714 1504 int fd, r;
e58a1277 1505
e58a1277 1506 assert(kmsg_socket >= 0);
a258bf26 1507
e58a1277 1508 u = umask(0000);
a258bf26 1509
03cfe0d5 1510 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1511 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1512 * on the reading side behave very similar to /proc/kmsg,
1513 * their writing side behaves differently from /dev/kmsg in
1514 * that writing blocks when nothing is reading. In order to
1515 * avoid any problems with containers deadlocking due to this
1516 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1517 from = prefix_roota(dest, "/run/kmsg");
1518 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1519
4a62c710 1520 if (mkfifo(from, 0600) < 0)
03cfe0d5 1521 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1522 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1523 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1524
1525 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1526 if (fd < 0)
1527 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1528
e58a1277
LP
1529 /* Store away the fd in the socket, so that it stays open as
1530 * long as we run the child */
3ee897d6 1531 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1532 safe_close(fd);
e58a1277 1533
d9603714
DH
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1536
03cfe0d5
LP
1537 /* And now make the FIFO unavailable as /run/kmsg... */
1538 (void) unlink(from);
1539
25ea79fe 1540 return 0;
88213476
LP
1541}
1542
1c4baffc 1543static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1544 union in_addr_union *exposed = userdata;
1545
1546 assert(rtnl);
1547 assert(m);
1548 assert(exposed);
1549
7a8f6325 1550 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1551 return 0;
1552}
1553
3a74cea5 1554static int setup_hostname(void) {
3a74cea5 1555
0c582db0 1556 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1557 return 0;
1558
605f81a8 1559 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1560 return -errno;
3a74cea5 1561
7027ff61 1562 return 0;
3a74cea5
LP
1563}
1564
57fb9fb5 1565static int setup_journal(const char *directory) {
e01ff70a 1566 sd_id128_t this_id;
0f5e1382 1567 _cleanup_free_ char *d = NULL;
e01ff70a 1568 const char *p, *q;
8054d749 1569 bool try;
e01ff70a 1570 char id[33];
57fb9fb5
LP
1571 int r;
1572
df9a75e4
LP
1573 /* Don't link journals in ephemeral mode */
1574 if (arg_ephemeral)
1575 return 0;
1576
8054d749
LP
1577 if (arg_link_journal == LINK_NO)
1578 return 0;
1579
1580 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1581
4d680aee 1582 r = sd_id128_get_machine(&this_id);
f647962d
MS
1583 if (r < 0)
1584 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1585
e01ff70a 1586 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1587 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1588 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1589 if (try)
4d680aee 1590 return 0;
df9a75e4 1591 return -EEXIST;
4d680aee
ZJS
1592 }
1593
03cfe0d5
LP
1594 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1595 if (r < 0)
1596 return log_error_errno(r, "Failed to create /var: %m");
1597
1598 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1599 if (r < 0)
1600 return log_error_errno(r, "Failed to create /var/log: %m");
1601
1602 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1603 if (r < 0)
1604 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1605
e01ff70a
MS
1606 (void) sd_id128_to_string(arg_uuid, id);
1607
03cfe0d5
LP
1608 p = strjoina("/var/log/journal/", id);
1609 q = prefix_roota(directory, p);
27407a01 1610
e26d6ce5 1611 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1612 if (try)
1613 return 0;
27407a01 1614
8054d749
LP
1615 log_error("%s: already a mount point, refusing to use for journal", p);
1616 return -EEXIST;
57fb9fb5
LP
1617 }
1618
e26d6ce5 1619 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1620 if (try)
1621 return 0;
57fb9fb5 1622
8054d749
LP
1623 log_error("%s: already a mount point, refusing to use for journal", q);
1624 return -EEXIST;
57fb9fb5
LP
1625 }
1626
1627 r = readlink_and_make_absolute(p, &d);
1628 if (r >= 0) {
1629 if ((arg_link_journal == LINK_GUEST ||
1630 arg_link_journal == LINK_AUTO) &&
1631 path_equal(d, q)) {
1632
03cfe0d5 1633 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1634 if (r < 0)
709f6e46 1635 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1636 return 0;
57fb9fb5
LP
1637 }
1638
4a62c710
MS
1639 if (unlink(p) < 0)
1640 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1641 } else if (r == -EINVAL) {
1642
1643 if (arg_link_journal == LINK_GUEST &&
1644 rmdir(p) < 0) {
1645
27407a01
ZJS
1646 if (errno == ENOTDIR) {
1647 log_error("%s already exists and is neither a symlink nor a directory", p);
1648 return r;
4314d33f
MS
1649 } else
1650 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1651 }
4314d33f
MS
1652 } else if (r != -ENOENT)
1653 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1654
1655 if (arg_link_journal == LINK_GUEST) {
1656
1657 if (symlink(q, p) < 0) {
8054d749 1658 if (try) {
56f64d95 1659 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1660 return 0;
4314d33f
MS
1661 } else
1662 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1663 }
1664
03cfe0d5 1665 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1666 if (r < 0)
709f6e46 1667 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1668 return 0;
57fb9fb5
LP
1669 }
1670
1671 if (arg_link_journal == LINK_HOST) {
ccddd104 1672 /* don't create parents here — if the host doesn't have
574edc90 1673 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1674
1675 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1676 if (try) {
56f64d95 1677 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1678 return 0;
4314d33f
MS
1679 } else
1680 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1681 }
1682
27407a01
ZJS
1683 } else if (access(p, F_OK) < 0)
1684 return 0;
57fb9fb5 1685
cdb2b9d0
LP
1686 if (dir_is_empty(q) == 0)
1687 log_warning("%s is not empty, proceeding anyway.", q);
1688
03cfe0d5 1689 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1690 if (r < 0)
1691 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1692
4543768d 1693 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1694 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1695
27407a01 1696 return 0;
57fb9fb5
LP
1697}
1698
88213476 1699static int drop_capabilities(void) {
520e0d54 1700 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1701}
1702
db999e0f
LP
1703static int reset_audit_loginuid(void) {
1704 _cleanup_free_ char *p = NULL;
1705 int r;
1706
0c582db0 1707 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1708 return 0;
1709
1710 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1711 if (r == -ENOENT)
db999e0f 1712 return 0;
f647962d
MS
1713 if (r < 0)
1714 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1715
1716 /* Already reset? */
1717 if (streq(p, "4294967295"))
1718 return 0;
1719
ad118bda 1720 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1721 if (r < 0) {
10a87006
LP
1722 log_error_errno(r,
1723 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1724 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1725 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1726 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1727 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1728
db999e0f 1729 sleep(5);
77b6e194 1730 }
db999e0f
LP
1731
1732 return 0;
77b6e194
LP
1733}
1734
24fb1112 1735
785890ac
LP
1736static int setup_propagate(const char *root) {
1737 const char *p, *q;
709f6e46 1738 int r;
785890ac
LP
1739
1740 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1741 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1742 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1743 (void) mkdir_p(p, 0600);
1744
709f6e46
MS
1745 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1746 if (r < 0)
1747 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1748
709f6e46
MS
1749 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1750 if (r < 0)
1751 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1752
709f6e46
MS
1753 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1754 if (r < 0)
1755 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1756
03cfe0d5 1757 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1758 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1759 return log_error_errno(errno, "Failed to install propagation bind mount.");
1760
1761 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1762 return log_error_errno(errno, "Failed to make propagation mount read-only");
1763
19caffac
AC
1764 /* machined will MS_MOVE into that directory, and that's only
1765 * supported for non-shared mounts. */
1766 if (mount(NULL, q, NULL, MS_SLAVE, NULL) < 0)
1767 return log_error_errno(errno, "Failed to make propagation mount slave");
1768
785890ac
LP
1769 return 0;
1770}
1771
1b9e5b12
LP
1772static int setup_image(char **device_path, int *loop_nr) {
1773 struct loop_info64 info = {
1774 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1775 };
1776 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1777 _cleanup_free_ char* loopdev = NULL;
1778 struct stat st;
1779 int r, nr;
1780
1781 assert(device_path);
1782 assert(loop_nr);
ec16945e 1783 assert(arg_image);
1b9e5b12
LP
1784
1785 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1786 if (fd < 0)
1787 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1788
4a62c710
MS
1789 if (fstat(fd, &st) < 0)
1790 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1791
1792 if (S_ISBLK(st.st_mode)) {
1793 char *p;
1794
1795 p = strdup(arg_image);
1796 if (!p)
1797 return log_oom();
1798
1799 *device_path = p;
1800
1801 *loop_nr = -1;
1802
1803 r = fd;
1804 fd = -1;
1805
1806 return r;
1807 }
1808
1809 if (!S_ISREG(st.st_mode)) {
070edd97 1810 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1811 return -EINVAL;
1812 }
1813
1814 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1815 if (control < 0)
1816 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1817
1818 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1819 if (nr < 0)
1820 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1821
1822 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1823 return log_oom();
1824
1825 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1826 if (loop < 0)
1827 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1828
4a62c710
MS
1829 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1830 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1831
1832 if (arg_read_only)
1833 info.lo_flags |= LO_FLAGS_READ_ONLY;
1834
4a62c710
MS
1835 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1836 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1837
1838 *device_path = loopdev;
1839 loopdev = NULL;
1840
1841 *loop_nr = nr;
1842
1843 r = loop;
1844 loop = -1;
1845
1846 return r;
1847}
1848
ada4799a
LP
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Inspects the partition table of the block device open on fd and picks the partitions to mount.
 *
 * The table is probed with libblkid (GPT and MBR are accepted). The function then waits until the kernel
 * exposes the same number of partitions that blkid found, asking for a re-read of the partition table
 * when the kernel lags behind, and finally classifies every partition:
 *
 *   - GPT: home, srv, ESP, native root, secondary root and generic Linux partitions are recognized by
 *     type UUID; partitions flagged "no auto" are skipped. For each typed kind the partition with the
 *     lowest partition number wins.
 *   - MBR: bootable partitions of type 0x83 are treated as generic Linux partitions.
 *
 * Root selection order: native root, then secondary root (*secondary is set to true), then a generic
 * partition. A generic partition is accepted only if it is the only one of its kind.
 *
 * On success returns 0 and stores newly allocated device node paths (and read-write flags) in the output
 * parameters for every partition kind that was found; outputs for kinds that were not found are left
 * untouched. On failure returns a negative errno-style value. Without blkid support -EOPNOTSUPP is
 * returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                char **esp_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(esp_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel count is compared against
                 * m + 1 below. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        } else if (sd_id128_equal(type_id, GPT_ESP)) {

                                if (esp && nr >= esp_nr)
                                        continue;

                                esp_nr = nr;

                                r = free_and_strdup(&esp, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record this as the generic candidate (not as "root"), so that a
                                 * second bootable Linux partition is detected by the check above and
                                 * rejected below instead of silently replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        if (esp) {
                *esp_device = esp;
                esp = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2242
/* Mounts the block device "what" on "where" (or on "where" + "directory" if a sub-directory is given).
 *
 * The file system type is detected with libblkid. The mount is read-only if rw is false or if
 * arg_read_only is set. LUKS volumes are refused. Returns 0 on success and a negative errno-style value
 * on failure; without blkid support -EOPNOTSUPP is returned. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags = MS_NODEV;
        int r;

        assert(what);
        assert(where);

        /* The global read-only switch overrides whatever the partition itself asked for */
        if (!rw || arg_read_only)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno != 0)
                        return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);

                return log_oom();
        }

        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == 1 || r == -1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2304
317feb4d 2305static int setup_machine_id(const char *directory) {
691675ba
LP
2306 const char *etc_machine_id;
2307 sd_id128_t id;
3bbaff3e 2308 int r;
e01ff70a 2309
317feb4d
LP
2310 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2311 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2312 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2313 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2314 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2315 * container behaves nicely). */
2316
e01ff70a
MS
2317 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2318
691675ba 2319 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2320 if (r < 0) {
2321 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2322 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2323
317feb4d
LP
2324 if (sd_id128_is_null(arg_uuid)) {
2325 r = sd_id128_randomize(&arg_uuid);
2326 if (r < 0)
2327 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2328 }
2329 } else {
2330 if (sd_id128_is_null(id)) {
2331 log_error("Machine ID in container image is zero, refusing.");
2332 return -EINVAL;
2333 }
e01ff70a 2334
317feb4d
LP
2335 arg_uuid = id;
2336 }
691675ba 2337
e01ff70a
MS
2338 return 0;
2339}
2340
7336138e
LP
2341static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2342 int r;
2343
2344 assert(directory);
2345
0de7acce 2346 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2347 return 0;
2348
2349 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2350 if (r == -EOPNOTSUPP)
2351 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2352 if (r == -EBADE)
2353 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2356 if (r == 0)
2357 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2358 else
2359 log_debug("Patched directory tree to match UID/GID range.");
2360
2361 return r;
2362}
2363
727fd4fd
LP
2364static int mount_devices(
2365 const char *where,
2366 const char *root_device, bool root_device_rw,
2367 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2368 const char *srv_device, bool srv_device_rw,
2369 const char *esp_device) {
1b9e5b12
LP
2370 int r;
2371
2372 assert(where);
2373
2374 if (root_device) {
727fd4fd 2375 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2376 if (r < 0)
2377 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2378 }
2379
2380 if (home_device) {
727fd4fd 2381 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2384 }
2385
2386 if (srv_device) {
727fd4fd 2387 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2390 }
2391
a6bc7db9
LP
2392 if (esp_device) {
2393 const char *mp, *x;
2394
2395 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2396
2397 mp = "/efi";
2398 x = strjoina(arg_directory, mp);
2399 r = dir_is_empty(x);
2400 if (r == -ENOENT) {
2401 mp = "/boot";
2402 x = strjoina(arg_directory, mp);
2403 r = dir_is_empty(x);
2404 }
2405
2406 if (r > 0) {
2407 r = mount_device(esp_device, arg_directory, mp, true);
2408 if (r < 0)
2409 return log_error_errno(r, "Failed to mount ESP: %m");
2410 }
2411 }
2412
1b9e5b12
LP
2413 return 0;
2414}
2415
2416static void loop_remove(int nr, int *image_fd) {
2417 _cleanup_close_ int control = -1;
e8c8ddcc 2418 int r;
1b9e5b12
LP
2419
2420 if (nr < 0)
2421 return;
2422
2423 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2424 r = ioctl(*image_fd, LOOP_CLR_FD);
2425 if (r < 0)
5e4074aa 2426 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2427 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2428 }
2429
2430 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2431 if (control < 0) {
56f64d95 2432 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2433 return;
e8c8ddcc 2434 }
1b9e5b12 2435
e8c8ddcc
TG
2436 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2437 if (r < 0)
5e4074aa 2438 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2439}
2440
113cea80 2441/*
6d416b9c
LS
2442 * Return values:
2443 * < 0 : wait_for_terminate() failed to get the state of the
2444 * container, the container was terminated by a signal, or
2445 * failed for an unknown reason. No change is made to the
2446 * container argument.
2447 * > 0 : The program executed in the container terminated with an
2448 * error. The exit code of the program executed in the
919699ec
LP
2449 * container is returned. The container argument has been set
2450 * to CONTAINER_TERMINATED.
6d416b9c
LS
2451 * 0 : The container is being rebooted, has been shut down or exited
2452 * successfully. The container argument has been set to either
2453 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2454 *
6d416b9c
LS
2455 * That is, success is indicated by a return value of zero, and an
2456 * error is indicated by a non-zero value.
113cea80
DH
2457 */
2458static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2459 siginfo_t status;
919699ec 2460 int r;
113cea80
DH
2461
2462 r = wait_for_terminate(pid, &status);
f647962d
MS
2463 if (r < 0)
2464 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2465
2466 switch (status.si_code) {
fddbb89c 2467
113cea80 2468 case CLD_EXITED:
b5a2179b 2469 if (status.si_status == 0)
919699ec 2470 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2471 else
919699ec 2472 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2473
919699ec
LP
2474 *container = CONTAINER_TERMINATED;
2475 return status.si_status;
113cea80
DH
2476
2477 case CLD_KILLED:
2478 if (status.si_status == SIGINT) {
919699ec 2479 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2480 *container = CONTAINER_TERMINATED;
919699ec
LP
2481 return 0;
2482
113cea80 2483 } else if (status.si_status == SIGHUP) {
919699ec 2484 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2485 *container = CONTAINER_REBOOTED;
919699ec 2486 return 0;
113cea80 2487 }
919699ec 2488
113cea80
DH
2489 /* CLD_KILLED fallthrough */
2490
2491 case CLD_DUMPED:
fddbb89c 2492 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2493 return -EIO;
113cea80
DH
2494
2495 default:
fddbb89c 2496 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2497 return -EIO;
113cea80 2498 }
113cea80
DH
2499}
2500
023fb90b
LP
2501static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2502 pid_t pid;
2503
4a0b58c4 2504 pid = PTR_TO_PID(userdata);
023fb90b 2505 if (pid > 0) {
c6c8f6e2 2506 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2507 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2508 sd_event_source_set_userdata(s, NULL);
2509 return 0;
2510 }
2511 }
2512
2513 sd_event_exit(sd_event_source_get_event(s), 0);
2514 return 0;
2515}
2516
ec16945e 2517static int determine_names(void) {
1b9cebf6 2518 int r;
ec16945e 2519
c1521918
LP
2520 if (arg_template && !arg_directory && arg_machine) {
2521
2522 /* If --template= was specified then we should not
2523 * search for a machine, but instead create a new one
2524 * in /var/lib/machine. */
2525
2526 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2527 if (!arg_directory)
2528 return log_oom();
2529 }
2530
ec16945e 2531 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2532 if (arg_machine) {
2533 _cleanup_(image_unrefp) Image *i = NULL;
2534
2535 r = image_find(arg_machine, &i);
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2538 else if (r == 0) {
2539 log_error("No image for machine '%s': %m", arg_machine);
2540 return -ENOENT;
2541 }
2542
aceac2f0 2543 if (i->type == IMAGE_RAW)
0f03c2a4 2544 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2545 else
0f03c2a4 2546 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2547 if (r < 0)
2548 return log_error_errno(r, "Invalid image directory: %m");
2549
aee327b8
LP
2550 if (!arg_ephemeral)
2551 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2552 } else
ec16945e
LP
2553 arg_directory = get_current_dir_name();
2554
1b9cebf6
LP
2555 if (!arg_directory && !arg_machine) {
2556 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2557 return -EINVAL;
2558 }
2559 }
2560
2561 if (!arg_machine) {
b9ba4dab
LP
2562 if (arg_directory && path_equal(arg_directory, "/"))
2563 arg_machine = gethostname_malloc();
2564 else
2565 arg_machine = strdup(basename(arg_image ?: arg_directory));
2566
ec16945e
LP
2567 if (!arg_machine)
2568 return log_oom();
2569
ae691c1d 2570 hostname_cleanup(arg_machine);
ec16945e
LP
2571 if (!machine_name_is_valid(arg_machine)) {
2572 log_error("Failed to determine machine name automatically, please use -M.");
2573 return -EINVAL;
2574 }
b9ba4dab
LP
2575
2576 if (arg_ephemeral) {
2577 char *b;
2578
2579 /* Add a random suffix when this is an
2580 * ephemeral machine, so that we can run many
2581 * instances at once without manually having
2582 * to specify -M each time. */
2583
2584 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2585 return log_oom();
2586
2587 free(arg_machine);
2588 arg_machine = b;
2589 }
ec16945e
LP
2590 }
2591
2592 return 0;
2593}
2594
03cfe0d5 2595static int determine_uid_shift(const char *directory) {
6dac160c
LP
2596 int r;
2597
0de7acce 2598 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2599 arg_uid_shift = 0;
6dac160c 2600 return 0;
03cfe0d5 2601 }
6dac160c
LP
2602
2603 if (arg_uid_shift == UID_INVALID) {
2604 struct stat st;
2605
03cfe0d5 2606 r = stat(directory, &st);
6dac160c 2607 if (r < 0)
03cfe0d5 2608 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2609
2610 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2611
2612 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2613 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2614 return -EINVAL;
2615 }
2616
2617 arg_uid_range = UINT32_C(0x10000);
2618 }
2619
2620 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2621 log_error("UID base too high for UID range.");
2622 return -EINVAL;
2623 }
2624
6dac160c
LP
2625 return 0;
2626}
2627
03cfe0d5
LP
2628static int inner_child(
2629 Barrier *barrier,
2630 const char *directory,
2631 bool secondary,
2632 int kmsg_socket,
2633 int rtnl_socket,
f757855e 2634 FDSet *fds) {
69c79d3c 2635
03cfe0d5 2636 _cleanup_free_ char *home = NULL;
e01ff70a 2637 char as_uuid[37];
6aadfa4c 2638 unsigned n_env = 1;
03cfe0d5
LP
2639 const char *envp[] = {
2640 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2641 NULL, /* container */
03cfe0d5
LP
2642 NULL, /* TERM */
2643 NULL, /* HOME */
2644 NULL, /* USER */
2645 NULL, /* LOGNAME */
2646 NULL, /* container_uuid */
2647 NULL, /* LISTEN_FDS */
2648 NULL, /* LISTEN_PID */
9c1e04d0 2649 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2650 NULL
2651 };
88213476 2652
2371271c 2653 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2654 int r;
88213476 2655
03cfe0d5
LP
2656 assert(barrier);
2657 assert(directory);
2658 assert(kmsg_socket >= 0);
88213476 2659
efdb0237
LP
2660 cg_unified_flush();
2661
0de7acce 2662 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2663 /* Tell the parent, that it now can write the UID map. */
2664 (void) barrier_place(barrier); /* #1 */
7027ff61 2665
03cfe0d5
LP
2666 /* Wait until the parent wrote the UID map */
2667 if (!barrier_place_and_sync(barrier)) { /* #2 */
2668 log_error("Parent died too early");
2669 return -ESRCH;
2670 }
88213476
LP
2671 }
2672
0de7acce
LP
2673 r = mount_all(NULL,
2674 arg_userns_mode != USER_NAMESPACE_NO,
2675 true,
2676 arg_private_network,
2677 arg_uid_shift,
2678 arg_uid_range,
2679 arg_selinux_apifs_context);
2680
03cfe0d5
LP
2681 if (r < 0)
2682 return r;
2683
d8fc6a00
LP
2684 r = mount_sysfs(NULL);
2685 if (r < 0)
2686 return r;
2687
03cfe0d5
LP
2688 /* Wait until we are cgroup-ified, so that we
2689 * can mount the right cgroup path writable */
2690 if (!barrier_place_and_sync(barrier)) { /* #3 */
2691 log_error("Parent died too early");
2692 return -ESRCH;
88213476
LP
2693 }
2694
5a8ff0e6 2695 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2696 r = unshare(CLONE_NEWCGROUP);
2697 if (r < 0)
2698 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2699 r = mount_cgroups(
2700 "",
2701 arg_unified_cgroup_hierarchy,
2702 arg_userns_mode != USER_NAMESPACE_NO,
2703 arg_uid_shift,
2704 arg_uid_range,
5a8ff0e6
CB
2705 arg_selinux_apifs_context,
2706 arg_use_cgns);
0996ef00
CB
2707 if (r < 0)
2708 return r;
2709 } else {
2710 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2711 if (r < 0)
2712 return r;
2713 }
ec16945e 2714
03cfe0d5
LP
2715 r = reset_uid_gid();
2716 if (r < 0)
2717 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2718
03cfe0d5
LP
2719 r = setup_boot_id(NULL);
2720 if (r < 0)
2721 return r;
ec16945e 2722
03cfe0d5
LP
2723 r = setup_kmsg(NULL, kmsg_socket);
2724 if (r < 0)
2725 return r;
2726 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2727
03cfe0d5 2728 umask(0022);
30535c16 2729
03cfe0d5
LP
2730 if (setsid() < 0)
2731 return log_error_errno(errno, "setsid() failed: %m");
2732
2733 if (arg_private_network)
2734 loopback_setup();
2735
7a8f6325
LP
2736 if (arg_expose_ports) {
2737 r = expose_port_send_rtnl(rtnl_socket);
2738 if (r < 0)
2739 return r;
2740 rtnl_socket = safe_close(rtnl_socket);
2741 }
03cfe0d5 2742
709f6e46
MS
2743 r = drop_capabilities();
2744 if (r < 0)
2745 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2746
2747 setup_hostname();
2748
050f7277 2749 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2750 if (personality(arg_personality) < 0)
2751 return log_error_errno(errno, "personality() failed: %m");
2752 } else if (secondary) {
2753 if (personality(PER_LINUX32) < 0)
2754 return log_error_errno(errno, "personality() failed: %m");
2755 }
2756
2757#ifdef HAVE_SELINUX
2758 if (arg_selinux_context)
2ed96880 2759 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2760 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2761#endif
2762
ee645080 2763 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2764 if (r < 0)
2765 return r;
2766
6aadfa4c
ILG
2767 /* LXC sets container=lxc, so follow the scheme here */
2768 envp[n_env++] = strjoina("container=", arg_container_service_name);
2769
03cfe0d5
LP
2770 envp[n_env] = strv_find_prefix(environ, "TERM=");
2771 if (envp[n_env])
313cefa1 2772 n_env++;
03cfe0d5
LP
2773
2774 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2775 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2776 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2777 return log_oom();
2778
3bbaff3e 2779 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2780
691675ba 2781 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2782 return log_oom();
03cfe0d5
LP
2783
2784 if (fdset_size(fds) > 0) {
2785 r = fdset_cloexec(fds, false);
2786 if (r < 0)
2787 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2788
2789 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2790 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2791 return log_oom();
2792 }
9c1e04d0
AP
2793 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2794 return log_oom();
03cfe0d5 2795
2371271c
TG
2796 env_use = strv_env_merge(2, envp, arg_setenv);
2797 if (!env_use)
2798 return log_oom();
03cfe0d5
LP
2799
2800 /* Let the parent know that we are ready and
2801 * wait until the parent is ready with the
2802 * setup, too... */
2803 if (!barrier_place_and_sync(barrier)) { /* #4 */
2804 log_error("Parent died too early");
2805 return -ESRCH;
2806 }
2807
5f932eb9
LP
2808 if (arg_chdir)
2809 if (chdir(arg_chdir) < 0)
2810 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2811
7732f92b
LP
2812 if (arg_start_mode == START_PID2) {
2813 r = stub_pid1();
2814 if (r < 0)
2815 return r;
2816 }
2817
03cfe0d5
LP
2818 /* Now, explicitly close the log, so that we
2819 * then can close all remaining fds. Closing
2820 * the log explicitly first has the benefit
2821 * that the logging subsystem knows about it,
2822 * and is thus ready to be reopened should we
2823 * need it again. Note that the other fds
2824 * closed here are at least the locking and
2825 * barrier fds. */
2826 log_close();
2827 (void) fdset_close_others(fds);
2828
7732f92b 2829 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2830 char **a;
2831 size_t m;
2832
2833 /* Automatically search for the init system */
2834
75f32f04
ZJS
2835 m = strv_length(arg_parameters);
2836 a = newa(char*, m + 2);
2837 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2838 a[1 + m] = NULL;
03cfe0d5
LP
2839
2840 a[0] = (char*) "/usr/lib/systemd/systemd";
2841 execve(a[0], a, env_use);
2842
2843 a[0] = (char*) "/lib/systemd/systemd";
2844 execve(a[0], a, env_use);
2845
2846 a[0] = (char*) "/sbin/init";
2847 execve(a[0], a, env_use);
f757855e
LP
2848 } else if (!strv_isempty(arg_parameters))
2849 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2850 else {
5f932eb9 2851 if (!arg_chdir)
d929b0f9
ZJS
2852 /* If we cannot change the directory, we'll end up in /, that is expected. */
2853 (void) chdir(home ?: "/root");
5f932eb9 2854
03cfe0d5
LP
2855 execle("/bin/bash", "-bash", NULL, env_use);
2856 execle("/bin/sh", "-sh", NULL, env_use);
2857 }
2858
35607a8d 2859 r = -errno;
03cfe0d5 2860 (void) log_open();
35607a8d 2861 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2862}
2863
9c1e04d0
AP
2864static int setup_sd_notify_child(void) {
2865 static const int one = 1;
2866 int fd = -1;
2867 union sockaddr_union sa = {
2868 .sa.sa_family = AF_UNIX,
2869 };
2870 int r;
2871
2872 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2873 if (fd < 0)
2874 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2875
2876 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2877 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2878
2879 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2880 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2881 if (r < 0) {
2882 safe_close(fd);
2883 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2884 }
2885
2886 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2887 if (r < 0) {
2888 safe_close(fd);
2889 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2890 }
2891
2892 return fd;
2893}
2894
03cfe0d5
LP
2895static int outer_child(
2896 Barrier *barrier,
2897 const char *directory,
2898 const char *console,
2899 const char *root_device, bool root_device_rw,
2900 const char *home_device, bool home_device_rw,
2901 const char *srv_device, bool srv_device_rw,
a6bc7db9 2902 const char *esp_device,
03cfe0d5
LP
2903 bool interactive,
2904 bool secondary,
2905 int pid_socket,
e01ff70a 2906 int uuid_socket,
9c1e04d0 2907 int notify_socket,
03cfe0d5
LP
2908 int kmsg_socket,
2909 int rtnl_socket,
825d5287 2910 int uid_shift_socket,
f757855e 2911 FDSet *fds) {
03cfe0d5
LP
2912
2913 pid_t pid;
2914 ssize_t l;
2915 int r;
9c1e04d0 2916 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2917
2918 assert(barrier);
2919 assert(directory);
2920 assert(console);
2921 assert(pid_socket >= 0);
e01ff70a 2922 assert(uuid_socket >= 0);
9c1e04d0 2923 assert(notify_socket >= 0);
03cfe0d5
LP
2924 assert(kmsg_socket >= 0);
2925
efdb0237
LP
2926 cg_unified_flush();
2927
03cfe0d5
LP
2928 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2929 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2930
2931 if (interactive) {
2932 close_nointr(STDIN_FILENO);
2933 close_nointr(STDOUT_FILENO);
2934 close_nointr(STDERR_FILENO);
2935
2936 r = open_terminal(console, O_RDWR);
2937 if (r != STDIN_FILENO) {
2938 if (r >= 0) {
2939 safe_close(r);
2940 r = -EINVAL;
2941 }
2942
2943 return log_error_errno(r, "Failed to open console: %m");
2944 }
2945
2946 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2947 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2948 return log_error_errno(errno, "Failed to duplicate console: %m");
2949 }
2950
2951 r = reset_audit_loginuid();
2952 if (r < 0)
2953 return r;
2954
2955 /* Mark everything as slave, so that we still
2956 * receive mounts from the real root, but don't
2957 * propagate mounts to the real root. */
2958 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2959 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2960
2961 r = mount_devices(directory,
2962 root_device, root_device_rw,
2963 home_device, home_device_rw,
a6bc7db9
LP
2964 srv_device, srv_device_rw,
2965 esp_device);
03cfe0d5
LP
2966 if (r < 0)
2967 return r;
2968
391567f4
LP
2969 r = determine_uid_shift(directory);
2970 if (r < 0)
2971 return r;
2972
0de7acce 2973 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2974 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2975 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2976 if (l < 0)
2977 return log_error_errno(errno, "Failed to send UID shift: %m");
2978 if (l != sizeof(arg_uid_shift)) {
2979 log_error("Short write while sending UID shift.");
2980 return -EIO;
2981 }
0e7ac751 2982
0de7acce 2983 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2984 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2985 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2986 * not it will pick a different one, and send it back to us. */
2987
2988 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2989 if (l < 0)
2990 return log_error_errno(errno, "Failed to recv UID shift: %m");
2991 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2992 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2993 return -EIO;
2994 }
2995 }
2996
2997 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2998 }
2999
03cfe0d5
LP
3000 /* Turn directory into bind mount */
3001 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
3002 return log_error_errno(errno, "Failed to make bind mount: %m");
3003
19caffac
AC
3004 /* Mark everything as shared so our mounts get propagated down. This is
3005 * required to make new bind mounts available in systemd services
3006 * inside the containter that create a new mount namespace.
3007 * See https://github.com/systemd/systemd/issues/3860
3008 * Further submounts (such as /dev) done after this will inherit the
3009 * shared propagation mode.*/
3010 if (mount(NULL, directory, NULL, MS_SHARED|MS_REC, NULL) < 0)
3011 return log_error_errno(errno, "MS_SHARED|MS_REC failed: %m");
3012
7336138e 3013 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
3014 if (r < 0)
3015 return r;
3016
0de7acce
LP
3017 r = setup_volatile(
3018 directory,
3019 arg_volatile_mode,
3020 arg_userns_mode != USER_NAMESPACE_NO,
3021 arg_uid_shift,
3022 arg_uid_range,
3023 arg_selinux_context);
03cfe0d5
LP
3024 if (r < 0)
3025 return r;
3026
0de7acce
LP
3027 r = setup_volatile_state(
3028 directory,
3029 arg_volatile_mode,
3030 arg_userns_mode != USER_NAMESPACE_NO,
3031 arg_uid_shift,
3032 arg_uid_range,
3033 arg_selinux_context);
03cfe0d5
LP
3034 if (r < 0)
3035 return r;
3036
03cfe0d5
LP
3037 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3038 if (r < 0)
3039 return r;
3040
03cfe0d5 3041 if (arg_read_only) {
6b7c9f8b 3042 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3043 if (r < 0)
3044 return log_error_errno(r, "Failed to make tree read-only: %m");
3045 }
3046
0de7acce
LP
3047 r = mount_all(directory,
3048 arg_userns_mode != USER_NAMESPACE_NO,
3049 false,
3050 arg_private_network,
3051 arg_uid_shift,
3052 arg_uid_range,
3053 arg_selinux_apifs_context);
03cfe0d5
LP
3054 if (r < 0)
3055 return r;
3056
07fa00f9
LP
3057 r = copy_devnodes(directory);
3058 if (r < 0)
03cfe0d5
LP
3059 return r;
3060
3061 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3062
07fa00f9
LP
3063 r = setup_pts(directory);
3064 if (r < 0)
03cfe0d5
LP
3065 return r;
3066
3067 r = setup_propagate(directory);
3068 if (r < 0)
3069 return r;
3070
3071 r = setup_dev_console(directory, console);
3072 if (r < 0)
3073 return r;
3074
520e0d54 3075 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3076 if (r < 0)
3077 return r;
3078
3079 r = setup_timezone(directory);
3080 if (r < 0)
3081 return r;
3082
3083 r = setup_resolv_conf(directory);
3084 if (r < 0)
3085 return r;
3086
e01ff70a
MS
3087 r = setup_machine_id(directory);
3088 if (r < 0)
3089 return r;
3090
03cfe0d5
LP
3091 r = setup_journal(directory);
3092 if (r < 0)
3093 return r;
3094
0de7acce
LP
3095 r = mount_custom(
3096 directory,
3097 arg_custom_mounts,
3098 arg_n_custom_mounts,
3099 arg_userns_mode != USER_NAMESPACE_NO,
3100 arg_uid_shift,
3101 arg_uid_range,
3102 arg_selinux_apifs_context);
03cfe0d5
LP
3103 if (r < 0)
3104 return r;
3105
5a8ff0e6 3106 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3107 r = mount_cgroups(
3108 directory,
3109 arg_unified_cgroup_hierarchy,
3110 arg_userns_mode != USER_NAMESPACE_NO,
3111 arg_uid_shift,
3112 arg_uid_range,
5a8ff0e6
CB
3113 arg_selinux_apifs_context,
3114 arg_use_cgns);
0996ef00
CB
3115 if (r < 0)
3116 return r;
3117 }
03cfe0d5
LP
3118
3119 r = mount_move_root(directory);
3120 if (r < 0)
3121 return log_error_errno(r, "Failed to move root directory: %m");
3122
9c1e04d0
AP
3123 fd = setup_sd_notify_child();
3124 if (fd < 0)
3125 return fd;
3126
03cfe0d5 3127 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3128 arg_clone_ns_flags |
03cfe0d5 3129 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3130 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3131 if (pid < 0)
3132 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3133 if (pid == 0) {
3134 pid_socket = safe_close(pid_socket);
e01ff70a 3135 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3136 notify_socket = safe_close(notify_socket);
825d5287 3137 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3138
3139 /* The inner child has all namespaces that are
3140 * requested, so that we all are owned by the user if
3141 * user namespaces are turned on. */
3142
f757855e 3143 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3144 if (r < 0)
3145 _exit(EXIT_FAILURE);
3146
3147 _exit(EXIT_SUCCESS);
3148 }
3149
3150 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3151 if (l < 0)
3152 return log_error_errno(errno, "Failed to send PID: %m");
3153 if (l != sizeof(pid)) {
3154 log_error("Short write while sending PID.");
3155 return -EIO;
3156 }
3157
e01ff70a
MS
3158 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3159 if (l < 0)
3160 return log_error_errno(errno, "Failed to send machine ID: %m");
3161 if (l != sizeof(arg_uuid)) {
3162 log_error("Short write while sending machine ID.");
3163 return -EIO;
3164 }
3165
9c1e04d0
AP
3166 l = send_one_fd(notify_socket, fd, 0);
3167 if (l < 0)
3168 return log_error_errno(errno, "Failed to send notify fd: %m");
3169
03cfe0d5 3170 pid_socket = safe_close(pid_socket);
e01ff70a 3171 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3172 notify_socket = safe_close(notify_socket);
327e26d6
KN
3173 kmsg_socket = safe_close(kmsg_socket);
3174 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3175
3176 return 0;
3177}
3178
0e7ac751
LP
3179static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3180 unsigned n_tries = 100;
3181 uid_t candidate;
3182 int r;
3183
3184 assert(shift);
3185 assert(ret_lock_file);
0de7acce 3186 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3187 assert(arg_uid_range == 0x10000U);
3188
3189 candidate = *shift;
3190
3191 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3192
3193 for (;;) {
3194 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3195 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3196
3197 if (--n_tries <= 0)
3198 return -EBUSY;
3199
3200 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3201 goto next;
3202 if ((candidate & UINT32_C(0xFFFF)) != 0)
3203 goto next;
3204
3205 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3206 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3207 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3208 goto next;
3209 if (r < 0)
3210 return r;
3211
3212 /* Make some superficial checks whether the range is currently known in the user database */
3213 if (getpwuid(candidate))
3214 goto next;
3215 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3216 goto next;
3217 if (getgrgid(candidate))
3218 goto next;
3219 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3220 goto next;
3221
3222 *ret_lock_file = lf;
3223 lf = (struct LockFile) LOCK_FILE_INIT;
3224 *shift = candidate;
3225 return 0;
3226
3227 next:
3228 random_bytes(&candidate, sizeof(candidate));
3229 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3230 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3231 }
3232}
3233
03cfe0d5
LP
3234static int setup_uid_map(pid_t pid) {
3235 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3236 int r;
3237
3238 assert(pid > 1);
3239
3240 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3241 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3242 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3243 if (r < 0)
3244 return log_error_errno(r, "Failed to write UID map: %m");
3245
3246 /* We always assign the same UID and GID ranges */
3247 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3248 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3249 if (r < 0)
3250 return log_error_errno(r, "Failed to write GID map: %m");
3251
3252 return 0;
3253}
3254
9c1e04d0 3255static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3256 char buf[NOTIFY_BUFFER_MAX+1];
3257 char *p = NULL;
3258 struct iovec iovec = {
3259 .iov_base = buf,
3260 .iov_len = sizeof(buf)-1,
3261 };
3262 union {
3263 struct cmsghdr cmsghdr;
3264 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3265 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3266 } control = {};
3267 struct msghdr msghdr = {
3268 .msg_iov = &iovec,
3269 .msg_iovlen = 1,
3270 .msg_control = &control,
3271 .msg_controllen = sizeof(control),
3272 };
3273 struct cmsghdr *cmsg;
3274 struct ucred *ucred = NULL;
3275 ssize_t n;
3276 pid_t inner_child_pid;
3277 _cleanup_strv_free_ char **tags = NULL;
3278
3279 assert(userdata);
3280
3281 inner_child_pid = PTR_TO_PID(userdata);
3282
3283 if (revents != EPOLLIN) {
3284 log_warning("Got unexpected poll event for notify fd.");
3285 return 0;
3286 }
3287
3288 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3289 if (n < 0) {
3290 if (errno == EAGAIN || errno == EINTR)
3291 return 0;
3292
3293 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3294 }
3295 cmsg_close_all(&msghdr);
3296
3297 CMSG_FOREACH(cmsg, &msghdr) {
3298 if (cmsg->cmsg_level == SOL_SOCKET &&
3299 cmsg->cmsg_type == SCM_CREDENTIALS &&
3300 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3301
3302 ucred = (struct ucred*) CMSG_DATA(cmsg);
3303 }
3304 }
3305
3306 if (!ucred || ucred->pid != inner_child_pid) {
3307 log_warning("Received notify message without valid credentials. Ignoring.");
3308 return 0;
3309 }
3310
3311 if ((size_t) n >= sizeof(buf)) {
3312 log_warning("Received notify message exceeded maximum size. Ignoring.");
3313 return 0;
3314 }
3315
3316 buf[n] = 0;
3317 tags = strv_split(buf, "\n\r");
3318 if (!tags)
3319 return log_oom();
3320
3321 if (strv_find(tags, "READY=1"))
3322 sd_notifyf(false, "READY=1\n");
3323
3324 p = strv_find_startswith(tags, "STATUS=");
3325 if (p)
3326 sd_notifyf(false, "STATUS=Container running: %s", p);
3327
3328 return 0;
3329}
3330
3331static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3332 int r;
3333 sd_event_source *notify_event_source;
3334
3335 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3336 if (r < 0)
3337 return log_error_errno(r, "Failed to allocate notify event source: %m");
3338
3339 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3340
3341 return 0;
3342}
3343
f757855e
LP
3344static int load_settings(void) {
3345 _cleanup_(settings_freep) Settings *settings = NULL;
3346 _cleanup_fclose_ FILE *f = NULL;
3347 _cleanup_free_ char *p = NULL;
3348 const char *fn, *i;
3349 int r;
3350
3351 /* If all settings are masked, there's no point in looking for
3352 * the settings file */
3353 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3354 return 0;
3355
3356 fn = strjoina(arg_machine, ".nspawn");
3357
3358 /* We first look in the admin's directories in /etc and /run */
3359 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3360 _cleanup_free_ char *j = NULL;
3361
3362 j = strjoin(i, "/", fn, NULL);
3363 if (!j)
3364 return log_oom();
3365
3366 f = fopen(j, "re");
3367 if (f) {
3368 p = j;
3369 j = NULL;
3370
b938cb90 3371 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3372 if (arg_settings_trusted < 0)
3373 arg_settings_trusted = true;
3374
3375 break;
3376 }
3377
3378 if (errno != ENOENT)
3379 return log_error_errno(errno, "Failed to open %s: %m", j);
3380 }
3381
3382 if (!f) {
3383 /* After that, let's look for a file next to the
3384 * actual image we shall boot. */
3385
3386 if (arg_image) {
3387 p = file_in_same_dir(arg_image, fn);
3388 if (!p)
3389 return log_oom();
3390 } else if (arg_directory) {
3391 p = file_in_same_dir(arg_directory, fn);
3392 if (!p)
3393 return log_oom();
3394 }
3395
3396 if (p) {
3397 f = fopen(p, "re");
3398 if (!f && errno != ENOENT)
3399 return log_error_errno(errno, "Failed to open %s: %m", p);
3400
b938cb90 3401 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3402 if (arg_settings_trusted < 0)
3403 arg_settings_trusted = false;
3404 }
3405 }
3406
3407 if (!f)
3408 return 0;
3409
3410 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3411
3412 r = settings_load(f, p, &settings);
3413 if (r < 0)
3414 return r;
3415
3416 /* Copy over bits from the settings, unless they have been
3417 * explicitly masked by command line switches. */
3418
7732f92b
LP
3419 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3420 settings->start_mode >= 0) {
3421 arg_start_mode = settings->start_mode;
f757855e
LP
3422
3423 strv_free(arg_parameters);
3424 arg_parameters = settings->parameters;
3425 settings->parameters = NULL;
3426 }
3427
5f932eb9
LP
3428 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3429 settings->working_directory) {
3430 free(arg_chdir);
3431 arg_chdir = settings->working_directory;
3432 settings->working_directory = NULL;
3433 }
3434
f757855e
LP
3435 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3436 settings->environment) {
3437 strv_free(arg_setenv);
3438 arg_setenv = settings->environment;
3439 settings->environment = NULL;
3440 }
3441
3442 if ((arg_settings_mask & SETTING_USER) == 0 &&
3443 settings->user) {
3444 free(arg_user);
3445 arg_user = settings->user;
3446 settings->user = NULL;
3447 }
3448
3449 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3450 uint64_t plus;
f757855e 3451
0e265674
LP
3452 plus = settings->capability;
3453 if (settings_private_network(settings))
3454 plus |= (1ULL << CAP_NET_ADMIN);
3455
3456 if (!arg_settings_trusted && plus != 0) {
3457 if (settings->capability != 0)
3458 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3459 } else
520e0d54 3460 arg_caps_retain |= plus;
f757855e 3461
520e0d54 3462 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3463 }
3464
3465 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3466 settings->kill_signal > 0)
3467 arg_kill_signal = settings->kill_signal;
3468
3469 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3470 settings->personality != PERSONALITY_INVALID)
3471 arg_personality = settings->personality;
3472
3473 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3474 !sd_id128_is_null(settings->machine_id)) {
3475
3476 if (!arg_settings_trusted)
3477 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3478 else
3479 arg_uuid = settings->machine_id;
3480 }
3481
3482 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3483 settings->read_only >= 0)
3484 arg_read_only = settings->read_only;
3485
3486 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3487 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3488 arg_volatile_mode = settings->volatile_mode;
3489
3490 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3491 settings->n_custom_mounts > 0) {
3492
3493 if (!arg_settings_trusted)
3494 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3495 else {
3496 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3497 arg_custom_mounts = settings->custom_mounts;
3498 arg_n_custom_mounts = settings->n_custom_mounts;
3499
3500 settings->custom_mounts = NULL;
3501 settings->n_custom_mounts = 0;
3502 }
3503 }
3504
3505 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3506 (settings->private_network >= 0 ||
3507 settings->network_veth >= 0 ||
3508 settings->network_bridge ||
22b28dfd 3509 settings->network_zone ||
f757855e
LP
3510 settings->network_interfaces ||
3511 settings->network_macvlan ||
f6d6bad1
LP
3512 settings->network_ipvlan ||
3513 settings->network_veth_extra)) {
f757855e
LP
3514
3515 if (!arg_settings_trusted)
3516 log_warning("Ignoring network settings, file %s is not trusted.", p);
3517 else {
f6d6bad1 3518 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3519 arg_private_network = settings_private_network(settings);
3520
f757855e
LP
3521 strv_free(arg_network_interfaces);
3522 arg_network_interfaces = settings->network_interfaces;
3523 settings->network_interfaces = NULL;
3524
3525 strv_free(arg_network_macvlan);
3526 arg_network_macvlan = settings->network_macvlan;
3527 settings->network_macvlan = NULL;
3528
3529 strv_free(arg_network_ipvlan);
3530 arg_network_ipvlan = settings->network_ipvlan;
3531 settings->network_ipvlan = NULL;
3532
f6d6bad1
LP
3533 strv_free(arg_network_veth_extra);
3534 arg_network_veth_extra = settings->network_veth_extra;
3535 settings->network_veth_extra = NULL;
3536
f757855e
LP
3537 free(arg_network_bridge);
3538 arg_network_bridge = settings->network_bridge;
3539 settings->network_bridge = NULL;
22b28dfd
LP
3540
3541 free(arg_network_zone);
3542 arg_network_zone = settings->network_zone;
3543 settings->network_zone = NULL;
f757855e
LP
3544 }
3545 }
3546
3547 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3548 settings->expose_ports) {
3549
3550 if (!arg_settings_trusted)
3551 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3552 else {
3553 expose_port_free_all(arg_expose_ports);
3554 arg_expose_ports = settings->expose_ports;
3555 settings->expose_ports = NULL;
3556 }
3557 }
3558
0de7acce
LP
3559 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3560 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3561
3562 if (!arg_settings_trusted)
3563 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3564 else {
3565 arg_userns_mode = settings->userns_mode;
3566 arg_uid_shift = settings->uid_shift;
3567 arg_uid_range = settings->uid_range;
3568 arg_userns_chown = settings->userns_chown;
3569 }
3570 }
3571
9c1e04d0
AP
3572 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3573 arg_notify_ready = settings->notify_ready;
3574
f757855e
LP
3575 return 0;
3576}
3577
03cfe0d5
LP
3578int main(int argc, char *argv[]) {
3579
a6bc7db9 3580 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
3581 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3582 _cleanup_close_ int master = -1, image_fd = -1;
3583 _cleanup_fdset_free_ FDSet *fds = NULL;
3584 int r, n_fd_passed, loop_nr = -1;
5aa3eba5 3585 char veth_name[IFNAMSIZ] = "";
03cfe0d5 3586 bool secondary = false, remove_subvol = false;
72c0a2c2 3587 sigset_t mask_chld;
03cfe0d5
LP
3588 pid_t pid = 0;
3589 int ret = EXIT_SUCCESS;
3590 union in_addr_union exposed = {};
3591 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
7513c5b8 3592 bool interactive, veth_created = false;
03cfe0d5
LP
3593
3594 log_parse_environment();
3595 log_open();
3596
7732f92b
LP
3597 /* Make sure rename_process() in the stub init process can work */
3598 saved_argv = argv;
3599 saved_argc = argc;
3600
03cfe0d5
LP
3601 r = parse_argv(argc, argv);
3602 if (r <= 0)
3603 goto finish;
3604
03cfe0d5
LP
3605 if (geteuid() != 0) {
3606 log_error("Need to be root.");
3607 r = -EPERM;
3608 goto finish;
3609 }
f757855e
LP
3610 r = determine_names();
3611 if (r < 0)
3612 goto finish;
3613
3614 r = load_settings();
3615 if (r < 0)
3616 goto finish;
3617
3618 r = verify_arguments();
3619 if (r < 0)
3620 goto finish;
03cfe0d5
LP
3621
3622 n_fd_passed = sd_listen_fds(false);
3623 if (n_fd_passed > 0) {
3624 r = fdset_new_listen_fds(&fds, false);
3625 if (r < 0) {
3626 log_error_errno(r, "Failed to collect file descriptors: %m");
3627 goto finish;
3628 }
3629 }
3630
3631 if (arg_directory) {
3632 assert(!arg_image);
3633
3634 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3635 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3636 r = -EINVAL;
3637 goto finish;
3638 }
3639
3640 if (arg_ephemeral) {
3641 _cleanup_free_ char *np = NULL;
3642
3643 /* If the specified path is a mount point we
3644 * generate the new snapshot immediately
3645 * inside it under a random name. However if
3646 * the specified is not a mount point we
3647 * create the new snapshot in the parent
3648 * directory, just next to it. */
e26d6ce5 3649 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3650 if (r < 0) {
3651 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3652 goto finish;
3653 }
3654 if (r > 0)
770b5ce4 3655 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3656 else
770b5ce4 3657 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3658 if (r < 0) {
3659 log_error_errno(r, "Failed to generate name for snapshot: %m");
3660 goto finish;
3661 }
3662
3663 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3664 if (r < 0) {
3665 log_error_errno(r, "Failed to lock %s: %m", np);
3666 goto finish;
3667 }
3668
5bcd08db 3669 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3670 if (r < 0) {
3671 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3672 goto finish;
ec16945e
LP
3673 }
3674
3675 free(arg_directory);
3676 arg_directory = np;
8a16a7b4 3677 np = NULL;
ec16945e
LP
3678
3679 remove_subvol = true;
30535c16
LP
3680
3681 } else {
3682 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3683 if (r == -EBUSY) {
3684 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3685 goto finish;
3686 }
3687 if (r < 0) {
3688 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3689 goto finish;
30535c16
LP
3690 }
3691
3692 if (arg_template) {
5bcd08db 3693 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3694 if (r == -EEXIST) {
3695 if (!arg_quiet)
3696 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3697 } else if (r < 0) {
83521414 3698 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3699 goto finish;
3700 } else {
3701 if (!arg_quiet)
3702 log_info("Populated %s from template %s.", arg_directory, arg_template);
3703 }
3704 }
ec16945e
LP
3705 }
3706
7732f92b 3707 if (arg_start_mode == START_BOOT) {
1b9e5b12 3708 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3709 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3710 r = -EINVAL;
1b9e5b12
LP
3711 goto finish;
3712 }
3713 } else {
3714 const char *p;
3715
16fb773e
LP
3716 p = strjoina(arg_directory, "/usr/");
3717 if (laccess(p, F_OK) < 0) {
3718 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3719 r = -EINVAL;
1b9e5b12 3720 goto finish;
1b9e5b12
LP
3721 }
3722 }
ec16945e 3723
6b9132a9 3724 } else {
1b9e5b12 3725 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3726
ec16945e
LP
3727 assert(arg_image);
3728 assert(!arg_template);
3729
30535c16
LP
3730 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3731 if (r == -EBUSY) {
3732 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3733 goto finish;
3734 }
3735 if (r < 0) {
3736 r = log_error_errno(r, "Failed to create image lock: %m");
3737 goto finish;
3738 }
3739
1b9e5b12 3740 if (!mkdtemp(template)) {
56f64d95 3741 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3742 r = -errno;
6b9132a9 3743 goto finish;
1b9e5b12 3744 }
6b9132a9 3745
1b9e5b12
LP
3746 arg_directory = strdup(template);
3747 if (!arg_directory) {
3748 r = log_oom();
3749 goto finish;
6b9132a9 3750 }
88213476 3751
1b9e5b12
LP
3752 image_fd = setup_image(&device_path, &loop_nr);
3753 if (image_fd < 0) {
3754 r = image_fd;
842f3b0f
LP
3755 goto finish;
3756 }
1b9e5b12 3757
4d9f07b4
LP
3758 r = dissect_image(image_fd,
3759 &root_device, &root_device_rw,
3760 &home_device, &home_device_rw,
3761 &srv_device, &srv_device_rw,
a6bc7db9 3762 &esp_device,
4d9f07b4 3763 &secondary);
1b9e5b12
LP
3764 if (r < 0)
3765 goto finish;
842f3b0f 3766 }
842f3b0f 3767
5a8af538
LP
3768 r = custom_mounts_prepare();
3769 if (r < 0)
3770 goto finish;
3771
03cfe0d5
LP
3772 interactive =
3773 isatty(STDIN_FILENO) > 0 &&
3774 isatty(STDOUT_FILENO) > 0;
9c857b9d 3775
db7feb7e
LP
3776 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3777 if (master < 0) {
ec16945e 3778 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3779 goto finish;
3780 }
3781
611b312b
LP
3782 r = ptsname_malloc(master, &console);
3783 if (r < 0) {
3784 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3785 goto finish;
68b02049
DW
3786 }
3787
3788 if (arg_selinux_apifs_context) {
3789 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3790 if (r < 0)
3791 goto finish;
a258bf26
LP
3792 }
3793
a258bf26 3794 if (unlockpt(master) < 0) {
ec16945e 3795 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3796 goto finish;
3797 }
3798
9c857b9d
LP
3799 if (!arg_quiet)
3800 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3801 arg_machine, arg_image ?: arg_directory);
3802
72c0a2c2 3803 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3804
023fb90b
LP
3805 assert_se(sigemptyset(&mask_chld) == 0);
3806 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3807
03cfe0d5
LP
3808 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3809 r = log_error_errno(errno, "Failed to become subreaper: %m");
3810 goto finish;
3811 }
3812
d87be9b0 3813 for (;;) {
03cfe0d5 3814 static const struct sigaction sa = {
189d5bac 3815 .sa_handler = nop_signal_handler,
e866af3a
DH
3816 .sa_flags = SA_NOCLDSTOP,
3817 };
0e7ac751
LP
3818
3819 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3820 _cleanup_close_ int etc_passwd_lock = -1;
3821 _cleanup_close_pair_ int
3822 kmsg_socket_pair[2] = { -1, -1 },
3823 rtnl_socket_pair[2] = { -1, -1 },
3824 pid_socket_pair[2] = { -1, -1 },
3825 uuid_socket_pair[2] = { -1, -1 },
9c1e04d0 3826 notify_socket_pair[2] = { -1, -1 },
0e7ac751 3827 uid_shift_socket_pair[2] = { -1, -1 };
9c1e04d0 3828 _cleanup_close_ int notify_socket= -1;
0e7ac751 3829 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4afd3348 3830 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3831 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3832 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
7429b2eb 3833 ContainerStatus container_status = 0;
dbb60d69 3834 char last_char = 0;
0e7ac751
LP
3835 int ifi = 0;
3836 ssize_t l;
3837
0de7acce 3838 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3839 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3840 * check with getpwuid() if the specific user already exists. Note that /etc might be
3841 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3842 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3843 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3844 * really ours. */
3845
3846 etc_passwd_lock = take_etc_passwd_lock(NULL);
3847 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3848 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3849 goto finish;
3850 }
3851 }
e866af3a 3852
7566e267 3853 r = barrier_create(&barrier);
a2da110b 3854 if (r < 0) {
da927ba9 3855 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3856 goto finish;
3857 }
3858
4610de50 3859 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3860 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3861 goto finish;
3862 }
3863
4610de50 3864 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3865 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3866 goto finish;
3867 }
3868
4610de50 3869 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3870 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3871 goto finish;
3872 }
3873
e01ff70a
MS
3874 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3875 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3876 goto finish;
3877 }
3878
9c1e04d0
AP
3879 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
3880 r = log_error_errno(errno, "Failed to create notify socket pair: %m");
3881 goto finish;
3882 }
3883
0de7acce 3884 if (arg_userns_mode != USER_NAMESPACE_NO)
4610de50 3885 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3886 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3887 goto finish;
3888 }
3889
e866af3a
DH
3890 /* Child can be killed before execv(), so handle SIGCHLD
3891 * in order to interrupt parent's blocking calls and
3892 * give it a chance to call wait() and terminate. */
3893 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3894 if (r < 0) {
ec16945e 3895 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3896 goto finish;
3897 }
3898
e866af3a
DH
3899 r = sigaction(SIGCHLD, &sa, NULL);
3900 if (r < 0) {
ec16945e 3901 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3902 goto finish;
3903 }
3904
8869a0b4 3905 pid = raw_clone(SIGCHLD|CLONE_NEWNS);
d87be9b0
LP
3906 if (pid < 0) {
3907 if (errno == EINVAL)
ec16945e 3908 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3909 else
ec16945e 3910 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3911
d87be9b0
LP
3912 goto finish;
3913 }
a258bf26 3914
d87be9b0 3915 if (pid == 0) {
03cfe0d5 3916 /* The outer child only has a file system namespace. */
a2da110b
DH
3917 barrier_set_role(&barrier, BARRIER_CHILD);
3918
03e334a1 3919 master = safe_close(master);
a258bf26 3920
03e334a1 3921 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3922 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3923 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
e01ff70a 3924 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
9c1e04d0 3925 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
825d5287 3926 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3927
ce30c8dc
LP
3928 (void) reset_all_signal_handlers();
3929 (void) reset_signal_mask();
f5c1b9ee 3930
03cfe0d5
LP
3931 r = outer_child(&barrier,
3932 arg_directory,
3933 console,
3934 root_device, root_device_rw,
3935 home_device, home_device_rw,
3936 srv_device, srv_device_rw,
a6bc7db9 3937 esp_device,
03cfe0d5
LP
3938 interactive,
3939 secondary,
3940 pid_socket_pair[1],
e01ff70a 3941 uuid_socket_pair[1],
9c1e04d0 3942 notify_socket_pair[1],
03cfe0d5
LP
3943 kmsg_socket_pair[1],
3944 rtnl_socket_pair[1],
825d5287 3945 uid_shift_socket_pair[1],
f757855e 3946 fds);
0cb9fbcd 3947 if (r < 0)
a2da110b 3948 _exit(EXIT_FAILURE);
d87be9b0 3949
03cfe0d5 3950 _exit(EXIT_SUCCESS);
da5b3bad 3951 }
88213476 3952
a2da110b 3953 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3954
2feceb5e 3955 fds = fdset_free(fds);
842f3b0f 3956
6d0b55c2
LP
3957 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3958 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3959 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
e01ff70a 3960 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
9c1e04d0 3961 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
82116c43 3962 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3963
0de7acce 3964 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
3965 /* The child just let us know the UID shift it might have read from the image. */
3966 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3967 if (l < 0) {
3968 r = log_error_errno(errno, "Failed to read UID shift: %m");
3969 goto finish;
3970 }
3971 if (l != sizeof(arg_uid_shift)) {
3972 log_error("Short read while reading UID shift.");
3973 r = EIO;
3974 goto finish;
3975 }
3976
0de7acce 3977 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3978 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3979 * image, but if that's already in use, pick a new one, and report back to the child,
3980 * which one we now picked. */
3981
3982 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3983 if (r < 0) {
3984 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3985 goto finish;
3986 }
3987
3988 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3989 if (l < 0) {
3990 r = log_error_errno(errno, "Failed to send UID shift: %m");
3991 goto finish;
3992 }
3993 if (l != sizeof(arg_uid_shift)) {
3994 log_error("Short write while writing UID shift.");
3995 r = -EIO;
3996 goto finish;
3997 }
3998 }
3999 }
4000
03cfe0d5
LP
4001 /* Wait for the outer child. */
4002 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4003 if (r < 0)
4004 goto finish;
4005 if (r != 0) {
4006 r = -EIO;
4007 goto finish;
4008 }
4009 pid = 0;
6dac160c 4010
03cfe0d5
LP
4011 /* And now retrieve the PID of the inner child. */
4012 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4013 if (l < 0) {
4014 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4015 goto finish;
4016 }
4017 if (l != sizeof(pid)) {
76d44882 4018 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
4019 r = EIO;
4020 goto finish;
4021 }
354bfd2b 4022
e01ff70a
MS
4023 /* We also retrieve container UUID in case it was generated by outer child */
4024 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
4025 if (l < 0) {
4026 r = log_error_errno(errno, "Failed to read container machine ID: %m");
4027 goto finish;
4028 }
4029 if (l != sizeof(arg_uuid)) {
4030 log_error("Short read while reading container machined ID.");
4031 r = EIO;
4032 goto finish;
4033 }
4034
9c1e04d0
AP
4035 /* We also retrieve the socket used for notifications generated by outer child */
4036 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4037 if (notify_socket < 0) {
4038 r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
4039 goto finish;
4040 }
4041
03cfe0d5 4042 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 4043
0de7acce 4044 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
4045 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4046 log_error("Child died too early.");
4047 r = -ESRCH;
840295fc 4048 goto finish;
03cfe0d5 4049 }
ab046dde 4050
03cfe0d5 4051 r = setup_uid_map(pid);
840295fc
LP
4052 if (r < 0)
4053 goto finish;
ab046dde 4054
03cfe0d5
LP
4055 (void) barrier_place(&barrier); /* #2 */
4056 }
c74e630d 4057
9a2a5625 4058 if (arg_private_network) {
4bbfe7ad 4059
9a2a5625
LP
4060 r = move_network_interfaces(pid, arg_network_interfaces);
4061 if (r < 0)
4062 goto finish;
5aa4bb6b 4063
9a2a5625 4064 if (arg_network_veth) {
22b28dfd
LP
4065 r = setup_veth(arg_machine, pid, veth_name,
4066 arg_network_bridge || arg_network_zone);
9a2a5625
LP
4067 if (r < 0)
4068 goto finish;
4069 else if (r > 0)
4070 ifi = r;
6dac160c 4071
9a2a5625 4072 if (arg_network_bridge) {
22b28dfd
LP
4073 /* Add the interface to a bridge */
4074 r = setup_bridge(veth_name, arg_network_bridge, false);
4075 if (r < 0)
4076 goto finish;
4077 if (r > 0)
4078 ifi = r;
4079 } else if (arg_network_zone) {
4080 /* Add the interface to a bridge, possibly creating it */
4081 r = setup_bridge(veth_name, arg_network_zone, true);
9a2a5625
LP
4082 if (r < 0)
4083 goto finish;
4084 if (r > 0)
4085 ifi = r;
4086 }
4087 }
6dac160c 4088
f6d6bad1
LP
4089 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
4090 if (r < 0)
4091 goto finish;
4092
7513c5b8
LP
4093 /* We created the primary and extra veth links now; let's remember this, so that we know to
4094 remove them later on. Note that we don't bother with removing veth links that were created
4095 here when their setup failed half-way, because in that case the kernel should be able to
4096 remove them on its own, since they cannot be referenced by anything yet. */
4097 veth_created = true;
4098
9a2a5625
LP
4099 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
4100 if (r < 0)
4101 goto finish;
4102
4103 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
4104 if (r < 0)
4105 goto finish;
4106 }
6dac160c 4107
b7103bc5
LP
4108 if (arg_register) {
4109 r = register_machine(
4110 arg_machine,
4111 pid,
4112 arg_directory,
4113 arg_uuid,
4114 ifi,
4115 arg_slice,
4116 arg_custom_mounts, arg_n_custom_mounts,
4117 arg_kill_signal,
4118 arg_property,
6aadfa4c
ILG
4119 arg_keep_unit,
4120 arg_container_service_name);
b7103bc5
LP
4121 if (r < 0)
4122 goto finish;
4123 }
6dac160c 4124
34829a32 4125 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
4126 if (r < 0)
4127 goto finish;
4128
34829a32
LP
4129 if (arg_keep_unit) {
4130 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
4131 if (r < 0)
4132 goto finish;
4133 }
efdb0237 4134
34829a32 4135 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
4136 if (r < 0)
4137 goto finish;
6dac160c 4138
03cfe0d5
LP
4139 /* Notify the child that the parent is ready with all
4140 * its setup (including cgroup-ification), and that
4141 * the child can now hand over control to the code to
4142 * run inside the container. */
4143 (void) barrier_place(&barrier); /* #3 */
6dac160c 4144
03cfe0d5
LP
4145 /* Block SIGCHLD here, before notifying child.
4146 * process_pty() will handle it with the other signals. */
4147 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4148
03cfe0d5
LP
4149 /* Reset signal to default */
4150 r = default_signals(SIGCHLD, -1);
4151 if (r < 0) {
4152 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4153 goto finish;
4154 }
e866af3a 4155
9c1e04d0
AP
4156 r = sd_event_new(&event);
4157 if (r < 0) {
4158 log_error_errno(r, "Failed to get default event source: %m");
4159 goto finish;
4160 }
4161
4162 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
4163 if (r < 0)
4164 goto finish;
4165
03cfe0d5 4166 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
4167 if (!barrier_place_and_sync(&barrier)) { /* #4 */
4168 log_error("Child died too early.");
03cfe0d5
LP
4169 r = -ESRCH;
4170 goto finish;
4171 }
b12afc8c 4172
0e7ac751
LP
4173 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
4174 * in getpwuid(), thus we can release the /etc/passwd lock. */
4175 etc_passwd_lock = safe_close(etc_passwd_lock);
4176
03cfe0d5 4177 sd_notifyf(false,
03cfe0d5
LP
4178 "STATUS=Container running.\n"
4179 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
9c1e04d0
AP
4180 if (!arg_notify_ready)
4181 sd_notify(false, "READY=1\n");
88213476 4182
03cfe0d5
LP
4183 if (arg_kill_signal > 0) {
4184 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
4185 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
4186 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
4187 } else {
4188 /* Immediately exit */
4189 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4190 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4191 }
023fb90b 4192
03cfe0d5
LP
4193 /* simply exit on sigchld */
4194 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4195
03cfe0d5 4196 if (arg_expose_ports) {
7a8f6325 4197 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
4198 if (r < 0)
4199 goto finish;
023fb90b 4200
7a8f6325 4201 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 4202 }
023fb90b 4203
03cfe0d5 4204 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4205
ae3dde80 4206 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
4207 if (r < 0) {
4208 log_error_errno(r, "Failed to create PTY forwarder: %m");
4209 goto finish;
4210 }
023fb90b 4211
03cfe0d5
LP
4212 r = sd_event_loop(event);
4213 if (r < 0) {
4214 log_error_errno(r, "Failed to run event loop: %m");
4215 goto finish;
4216 }
6d0b55c2 4217
03cfe0d5 4218 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 4219
03cfe0d5 4220 forward = pty_forward_free(forward);
6d0b55c2 4221
03cfe0d5
LP
4222 if (!arg_quiet && last_char != '\n')
4223 putc('\n', stdout);
04d39279 4224
03cfe0d5 4225 /* Kill if it is not dead yet anyway */
b7103bc5
LP
4226 if (arg_register && !arg_keep_unit)
4227 terminate_machine(pid);
1f0cd86b 4228
840295fc 4229 /* Normally redundant, but better safe than sorry */
04d39279 4230 kill(pid, SIGKILL);
a258bf26 4231
113cea80 4232 r = wait_for_container(pid, &container_status);
04d39279
LP
4233 pid = 0;
4234
ec16945e 4235 if (r < 0)
ce9f1527
LP
4236 /* We failed to wait for the container, or the
4237 * container exited abnormally */
ec16945e 4238 goto finish;
9ed794a3 4239 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
ce9f1527
LP
4240 /* The container exited with a non-zero
4241 * status, or with zero status and no reboot
4242 * was requested. */
ec16945e 4243 ret = r;
d87be9b0 4244 break;
ec16945e 4245 }
88213476 4246
113cea80 4247 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4248
4249 if (arg_keep_unit) {
4250 /* Special handling if we are running as a
4251 * service: instead of simply restarting the
4252 * machine we want to restart the entire
4253 * service, so let's inform systemd about this
4254 * with the special exit code 133. The service
4255 * file uses RestartForceExitStatus=133 so
4256 * that this results in a full nspawn
4257 * restart. This is necessary since we might
4258 * have cgroup parameters set we want to have
4259 * flushed out. */
ec16945e
LP
4260 ret = 133;
4261 r = 0;
ce38dbc8
LP
4262 break;
4263 }
6d0b55c2 4264
7a8f6325 4265 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8 4266
ef3b2aa7 4267 (void) remove_veth_links(veth_name, arg_network_veth_extra);
7513c5b8 4268 veth_created = false;
d87be9b0 4269 }
88213476
LP
4270
4271finish:
af4ec430
LP
4272 sd_notify(false,
4273 "STOPPING=1\n"
4274 "STATUS=Terminating...");
4275
9444b1f2
LP
4276 if (pid > 0)
4277 kill(pid, SIGKILL);
88213476 4278
503546da
LP
4279 /* Try to flush whatever is still queued in the pty */
4280 if (master >= 0)
59f448cf 4281 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 4282
03cfe0d5
LP
4283 loop_remove(loop_nr, &image_fd);
4284
ec16945e
LP
4285 if (remove_subvol && arg_directory) {
4286 int k;
4287
5bcd08db 4288 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
4289 if (k < 0)
4290 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4291 }
4292
785890ac
LP
4293 if (arg_machine) {
4294 const char *p;
4295
63c372cb 4296 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4297 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4298 }
4299
7a8f6325 4300 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4301
4302 if (veth_created)
4303 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4304 (void) remove_bridge(arg_network_zone);
f757855e 4305
04d391da 4306 free(arg_directory);
ec16945e
LP
4307 free(arg_template);
4308 free(arg_image);
7027ff61 4309 free(arg_machine);
c74e630d 4310 free(arg_user);
5f932eb9 4311 free(arg_chdir);
c74e630d 4312 strv_free(arg_setenv);
f757855e 4313 free(arg_network_bridge);
c74e630d
LP
4314 strv_free(arg_network_interfaces);
4315 strv_free(arg_network_macvlan);
4bbfe7ad 4316 strv_free(arg_network_ipvlan);
f6d6bad1 4317 strv_free(arg_network_veth_extra);
f757855e
LP
4318 strv_free(arg_parameters);
4319 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4320 expose_port_free_all(arg_expose_ports);
6d0b55c2 4321
ec16945e 4322 return r < 0 ? EXIT_FAILURE : ret;
88213476 4323}