]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
core: make unit_free() accept NULL pointers
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
f97b34a6 60#include "format-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
/* devpts parses the GID handed to it via its gid= mount option as a signed value, so the range below keeps clear
 * of the upper half of the 32bit UID space. A little room is left free at the bottom of the range and a lot at
 * the top, so that other subsystems can maintain allocation ranges of their own. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 108
9c1e04d0
AP
/* Path of the notification socket nspawn listens on. The path is relative to the container, i.e. it is where
 * the socket appears from the inside. The init process of the container can send messages to nspawn through
 * it, following the sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
2a49b612
ZJS
/* NOTE(review): the code using this value is not part of the visible section of the file; presumably it is a
 * process exit status with a special "restart" meaning -- confirm at the use sites. */
#define EXIT_FORCE_RESTART 133
115
113cea80
DH
/* How a container run ended. NOTE(review): the consumers of this type are outside the visible part of the
 * file; the per-value descriptions below are derived from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,       /* the container terminated */
        CONTAINER_REBOOTED   = 1,       /* the container rebooted */
} ContainerStatus;
120
57fb9fb5
LP
/* Journal linking mode, as selected with --link-journal= (or -j). The "try-guest" and "try-host" spellings of
 * the option map to LINK_GUEST and LINK_HOST too, with a separate "try" flag set in addition. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto, the default */
        LINK_HOST  = 2,         /* --link-journal=host */
        LINK_GUEST = 3,         /* --link-journal=guest */
} LinkJournal;
88213476
LP
127
128static char *arg_directory = NULL;
ec16945e 129static char *arg_template = NULL;
5f932eb9 130static char *arg_chdir = NULL;
687d0825 131static char *arg_user = NULL;
9444b1f2 132static sd_id128_t arg_uuid = {};
7027ff61 133static char *arg_machine = NULL;
c74e630d
LP
134static const char *arg_selinux_context = NULL;
135static const char *arg_selinux_apifs_context = NULL;
9444b1f2 136static const char *arg_slice = NULL;
ff01d048 137static bool arg_private_network = false;
bc2f673e 138static bool arg_read_only = false;
7732f92b 139static StartMode arg_start_mode = START_PID1;
ec16945e 140static bool arg_ephemeral = false;
57fb9fb5 141static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 142static bool arg_link_journal_try = false;
520e0d54 143static uint64_t arg_caps_retain =
50b52222
LP
144 (1ULL << CAP_AUDIT_CONTROL) |
145 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
146 (1ULL << CAP_CHOWN) |
147 (1ULL << CAP_DAC_OVERRIDE) |
148 (1ULL << CAP_DAC_READ_SEARCH) |
149 (1ULL << CAP_FOWNER) |
150 (1ULL << CAP_FSETID) |
151 (1ULL << CAP_IPC_OWNER) |
152 (1ULL << CAP_KILL) |
153 (1ULL << CAP_LEASE) |
154 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 155 (1ULL << CAP_MKNOD) |
5076f0cc
LP
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
5076f0cc 159 (1ULL << CAP_SETFCAP) |
50b52222 160 (1ULL << CAP_SETGID) |
5076f0cc
LP
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
50b52222 164 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
165 (1ULL << CAP_SYS_CHROOT) |
166 (1ULL << CAP_SYS_NICE) |
167 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
50b52222 169 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
170static CustomMount *arg_custom_mounts = NULL;
171static unsigned arg_n_custom_mounts = 0;
f4889f65 172static char **arg_setenv = NULL;
284c0b91 173static bool arg_quiet = false;
eb91eb18 174static bool arg_register = true;
89f7c846 175static bool arg_keep_unit = false;
aa28aefe 176static char **arg_network_interfaces = NULL;
c74e630d 177static char **arg_network_macvlan = NULL;
4bbfe7ad 178static char **arg_network_ipvlan = NULL;
69c79d3c 179static bool arg_network_veth = false;
f6d6bad1 180static char **arg_network_veth_extra = NULL;
f757855e 181static char *arg_network_bridge = NULL;
22b28dfd 182static char *arg_network_zone = NULL;
050f7277 183static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 184static char *arg_image = NULL;
f757855e 185static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 186static ExposePort *arg_expose_ports = NULL;
f36933fe 187static char **arg_property = NULL;
0de7acce 188static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 189static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 190static bool arg_userns_chown = false;
c6c8f6e2 191static int arg_kill_signal = 0;
5da38d07 192static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
193static SettingsMask arg_settings_mask = 0;
194static int arg_settings_trusted = -1;
195static char **arg_parameters = NULL;
6aadfa4c 196static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 197static bool arg_notify_ready = false;
5a8ff0e6 198static bool arg_use_cgns = true;
0c582db0 199static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 200static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
88213476 201
601185b4 202static void help(void) {
88213476
LP
203 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
204 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
205 " -h --help Show this help\n"
206 " --version Print version string\n"
69c79d3c 207 " -q --quiet Do not show status information\n"
1b9e5b12 208 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
209 " --template=PATH Initialize root directory from template directory,\n"
210 " if missing\n"
211 " -x --ephemeral Run container with snapshot of root directory, and\n"
212 " remove it after exit\n"
213 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 214 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 215 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 216 " --chdir=PATH Set working directory in the container\n"
a8828ed9 217 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 218 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 219 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 220 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 221 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 222 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 223 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 224 " Similar, but with user configured UID/GID range\n"
24597ee0 225 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
226 " --private-network Disable network in container\n"
227 " --network-interface=INTERFACE\n"
228 " Assign an existing network interface to the\n"
229 " container\n"
c74e630d
LP
230 " --network-macvlan=INTERFACE\n"
231 " Create a macvlan network interface based on an\n"
232 " existing network interface to the container\n"
4bbfe7ad
TG
233 " --network-ipvlan=INTERFACE\n"
234 " Create a ipvlan network interface based on an\n"
235 " existing network interface to the container\n"
a8eaaee7 236 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 237 " and container\n"
f6d6bad1
LP
238 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
239 " Add an additional virtual Ethernet link between\n"
240 " host and container\n"
ab046dde 241 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
242 " Add a virtual Ethernet connection to the container\n"
243 " and attach it to an existing bridge on the host\n"
244 " --network-zone=NAME Similar, but attach the new interface to an\n"
245 " an automatically managed bridge interface\n"
6d0b55c2 246 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 247 " Expose a container IP port on the host\n"
82adf6af
LP
248 " -Z --selinux-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " processes in the container\n"
251 " -L --selinux-apifs-context=SECLABEL\n"
252 " Set the SELinux security context to be used by\n"
253 " API/tmpfs file systems in the container\n"
a8828ed9
DW
254 " --capability=CAP In addition to the default, retain specified\n"
255 " capability\n"
256 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 257 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
258 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
259 " host, try-guest, try-host\n"
574edc90 260 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 261 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
262 " --bind=PATH[:PATH[:OPTIONS]]\n"
263 " Bind mount a file or directory from the host into\n"
a8828ed9 264 " the container\n"
5e5bfa6e
EY
265 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
266 " Similar, but creates a read-only bind mount\n"
06c17c39 267 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
268 " --overlay=PATH[:PATH...]:PATH\n"
269 " Create an overlay mount from the host to \n"
270 " the container\n"
271 " --overlay-ro=PATH[:PATH...]:PATH\n"
272 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 273 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 274 " --register=BOOLEAN Register container as machine\n"
89f7c846 275 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 276 " the service unit nspawn is running in\n"
6d0b55c2 277 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 278 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 279 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 280 , program_invocation_short_name);
88213476
LP
281}
282
5a8af538
LP
283static int custom_mounts_prepare(void) {
284 unsigned i;
285 int r;
286
287 /* Ensure the mounts are applied prefix first. */
288 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
289
290 /* Allocate working directories for the overlay file systems that need it */
291 for (i = 0; i < arg_n_custom_mounts; i++) {
292 CustomMount *m = &arg_custom_mounts[i];
293
0de7acce 294 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
295
296 if (arg_userns_chown) {
297 log_error("--private-users-chown may not be combined with custom root mounts.");
298 return -EINVAL;
299 } else if (arg_uid_shift == UID_INVALID) {
300 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
301 return -EINVAL;
302 }
825d5287
RM
303 }
304
5a8af538
LP
305 if (m->type != CUSTOM_MOUNT_OVERLAY)
306 continue;
307
308 if (m->work_dir)
309 continue;
310
311 if (m->read_only)
312 continue;
313
14bcf25c 314 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
315 if (r < 0)
316 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
317 }
318
319 return 0;
320}
321
0fd9563f 322static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 323 const char *e;
5da38d07
TH
324 int r, all_unified, systemd_unified;
325
efdb0237
LP
326 /* Allow the user to control whether the unified hierarchy is used */
327 e = getenv("UNIFIED_CGROUP_HIERARCHY");
328 if (e) {
329 r = parse_boolean(e);
330 if (r < 0)
331 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
332 if (r > 0)
333 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
334 else
335 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 336
efdb0237
LP
337 return 0;
338 }
339
98afd6af
ZJS
340 all_unified = cg_all_unified();
341 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
342
343 if (all_unified < 0 || systemd_unified < 0)
344 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
345 "Failed to determine whether the unified cgroups hierarchy is used: %m");
346
efdb0237 347 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
348 if (all_unified > 0) {
349 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
350 * routine only detects 231, so we'll have a false negative here for 230. */
351 r = systemd_installation_has_version(directory, 230);
352 if (r < 0)
353 return log_error_errno(r, "Failed to determine systemd version in container: %m");
354 if (r > 0)
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
356 else
357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
358 } else if (systemd_unified > 0) {
359 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
360 r = systemd_installation_has_version(directory, 232);
361 if (r < 0)
362 return log_error_errno(r, "Failed to determine systemd version in container: %m");
363 if (r > 0)
364 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
365 else
366 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
367 } else
5da38d07 368 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 369
efdb0237
LP
370 return 0;
371}
372
0c582db0
LB
373static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
374 int r;
375
376 r = getenv_bool(name);
377 if (r == -ENXIO)
378 return;
379 if (r < 0)
380 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
381 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
382}
383
4f086aab
SU
384static void parse_mount_settings_env(void) {
385 int r;
386 const char *e;
387
388 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
389 if (!e)
390 return;
391
392 if (streq(e, "network")) {
393 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
394 return;
395 }
396
397 r = parse_boolean(e);
398 if (r < 0) {
399 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
400 return;
401 } else if (r > 0)
402 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
403 else
404 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
405
406 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
407}
408
88213476
LP
409static int parse_argv(int argc, char *argv[]) {
410
a41fe3a2 411 enum {
acbeb427
ZJS
412 ARG_VERSION = 0x100,
413 ARG_PRIVATE_NETWORK,
bc2f673e 414 ARG_UUID,
5076f0cc 415 ARG_READ_ONLY,
57fb9fb5 416 ARG_CAPABILITY,
420c7379 417 ARG_DROP_CAPABILITY,
17fe0523
LP
418 ARG_LINK_JOURNAL,
419 ARG_BIND,
f4889f65 420 ARG_BIND_RO,
06c17c39 421 ARG_TMPFS,
5a8af538
LP
422 ARG_OVERLAY,
423 ARG_OVERLAY_RO,
eb91eb18 424 ARG_SHARE_SYSTEM,
89f7c846 425 ARG_REGISTER,
aa28aefe 426 ARG_KEEP_UNIT,
69c79d3c 427 ARG_NETWORK_INTERFACE,
c74e630d 428 ARG_NETWORK_MACVLAN,
4bbfe7ad 429 ARG_NETWORK_IPVLAN,
ab046dde 430 ARG_NETWORK_BRIDGE,
22b28dfd 431 ARG_NETWORK_ZONE,
f6d6bad1 432 ARG_NETWORK_VETH_EXTRA,
6afc95b7 433 ARG_PERSONALITY,
4d9f07b4 434 ARG_VOLATILE,
ec16945e 435 ARG_TEMPLATE,
f36933fe 436 ARG_PROPERTY,
6dac160c 437 ARG_PRIVATE_USERS,
c6c8f6e2 438 ARG_KILL_SIGNAL,
f757855e 439 ARG_SETTINGS,
5f932eb9 440 ARG_CHDIR,
7336138e 441 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 442 ARG_NOTIFY_READY,
a41fe3a2
LP
443 };
444
88213476 445 static const struct option options[] = {
27eb8e90
ZJS
446 { "help", no_argument, NULL, 'h' },
447 { "version", no_argument, NULL, ARG_VERSION },
448 { "directory", required_argument, NULL, 'D' },
449 { "template", required_argument, NULL, ARG_TEMPLATE },
450 { "ephemeral", no_argument, NULL, 'x' },
451 { "user", required_argument, NULL, 'u' },
452 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
453 { "as-pid2", no_argument, NULL, 'a' },
454 { "boot", no_argument, NULL, 'b' },
455 { "uuid", required_argument, NULL, ARG_UUID },
456 { "read-only", no_argument, NULL, ARG_READ_ONLY },
457 { "capability", required_argument, NULL, ARG_CAPABILITY },
458 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
459 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
460 { "bind", required_argument, NULL, ARG_BIND },
461 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
462 { "tmpfs", required_argument, NULL, ARG_TMPFS },
463 { "overlay", required_argument, NULL, ARG_OVERLAY },
464 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
465 { "machine", required_argument, NULL, 'M' },
466 { "slice", required_argument, NULL, 'S' },
467 { "setenv", required_argument, NULL, 'E' },
468 { "selinux-context", required_argument, NULL, 'Z' },
469 { "selinux-apifs-context", required_argument, NULL, 'L' },
470 { "quiet", no_argument, NULL, 'q' },
471 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
472 { "register", required_argument, NULL, ARG_REGISTER },
473 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
474 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
475 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
476 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
477 { "network-veth", no_argument, NULL, 'n' },
478 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
479 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
480 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
481 { "personality", required_argument, NULL, ARG_PERSONALITY },
482 { "image", required_argument, NULL, 'i' },
483 { "volatile", optional_argument, NULL, ARG_VOLATILE },
484 { "port", required_argument, NULL, 'p' },
485 { "property", required_argument, NULL, ARG_PROPERTY },
486 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
487 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
488 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
489 { "settings", required_argument, NULL, ARG_SETTINGS },
490 { "chdir", required_argument, NULL, ARG_CHDIR },
491 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 492 {}
88213476
LP
493 };
494
9444b1f2 495 int c, r;
6aadfa4c 496 const char *p, *e;
a42c8b54 497 uint64_t plus = 0, minus = 0;
f757855e 498 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
499
500 assert(argc >= 0);
501 assert(argv);
502
19aac838 503 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
504
505 switch (c) {
506
507 case 'h':
601185b4
ZJS
508 help();
509 return 0;
88213476 510
acbeb427 511 case ARG_VERSION:
3f6fd1ba 512 return version();
acbeb427 513
88213476 514 case 'D':
0f03c2a4 515 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 516 if (r < 0)
0f03c2a4 517 return r;
ec16945e
LP
518 break;
519
520 case ARG_TEMPLATE:
0f03c2a4 521 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 522 if (r < 0)
0f03c2a4 523 return r;
88213476
LP
524 break;
525
1b9e5b12 526 case 'i':
0f03c2a4 527 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 528 if (r < 0)
0f03c2a4 529 return r;
ec16945e
LP
530 break;
531
532 case 'x':
533 arg_ephemeral = true;
1b9e5b12
LP
534 break;
535
687d0825 536 case 'u':
2fc09a9c
DM
537 r = free_and_strdup(&arg_user, optarg);
538 if (r < 0)
7027ff61 539 return log_oom();
687d0825 540
f757855e 541 arg_settings_mask |= SETTING_USER;
687d0825
MV
542 break;
543
22b28dfd
LP
544 case ARG_NETWORK_ZONE: {
545 char *j;
546
547 j = strappend("vz-", optarg);
548 if (!j)
549 return log_oom();
550
551 if (!ifname_valid(j)) {
552 log_error("Network zone name not valid: %s", j);
553 free(j);
554 return -EINVAL;
555 }
556
557 free(arg_network_zone);
558 arg_network_zone = j;
559
560 arg_network_veth = true;
561 arg_private_network = true;
562 arg_settings_mask |= SETTING_NETWORK;
563 break;
564 }
565
ab046dde 566 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
567
568 if (!ifname_valid(optarg)) {
569 log_error("Bridge interface name not valid: %s", optarg);
570 return -EINVAL;
571 }
572
f757855e
LP
573 r = free_and_strdup(&arg_network_bridge, optarg);
574 if (r < 0)
575 return log_oom();
ab046dde
TG
576
577 /* fall through */
578
0dfaa006 579 case 'n':
69c79d3c
LP
580 arg_network_veth = true;
581 arg_private_network = true;
f757855e 582 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
583 break;
584
f6d6bad1
LP
585 case ARG_NETWORK_VETH_EXTRA:
586 r = veth_extra_parse(&arg_network_veth_extra, optarg);
587 if (r < 0)
588 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
589
590 arg_private_network = true;
591 arg_settings_mask |= SETTING_NETWORK;
592 break;
593
aa28aefe 594 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
595
596 if (!ifname_valid(optarg)) {
597 log_error("Network interface name not valid: %s", optarg);
598 return -EINVAL;
599 }
600
c74e630d
LP
601 if (strv_extend(&arg_network_interfaces, optarg) < 0)
602 return log_oom();
603
604 arg_private_network = true;
f757855e 605 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
606 break;
607
608 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
609
610 if (!ifname_valid(optarg)) {
611 log_error("MACVLAN network interface name not valid: %s", optarg);
612 return -EINVAL;
613 }
614
c74e630d 615 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
616 return log_oom();
617
4bbfe7ad 618 arg_private_network = true;
f757855e 619 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
620 break;
621
622 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
623
624 if (!ifname_valid(optarg)) {
625 log_error("IPVLAN network interface name not valid: %s", optarg);
626 return -EINVAL;
627 }
628
4bbfe7ad
TG
629 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
630 return log_oom();
631
aa28aefe
LP
632 /* fall through */
633
ff01d048
LP
634 case ARG_PRIVATE_NETWORK:
635 arg_private_network = true;
f757855e 636 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
637 break;
638
0f0dbc46 639 case 'b':
7732f92b
LP
640 if (arg_start_mode == START_PID2) {
641 log_error("--boot and --as-pid2 may not be combined.");
642 return -EINVAL;
643 }
644
645 arg_start_mode = START_BOOT;
646 arg_settings_mask |= SETTING_START_MODE;
647 break;
648
649 case 'a':
650 if (arg_start_mode == START_BOOT) {
651 log_error("--boot and --as-pid2 may not be combined.");
652 return -EINVAL;
653 }
654
655 arg_start_mode = START_PID2;
656 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
657 break;
658
144f0fc0 659 case ARG_UUID:
9444b1f2 660 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
661 if (r < 0)
662 return log_error_errno(r, "Invalid UUID: %s", optarg);
663
664 if (sd_id128_is_null(arg_uuid)) {
665 log_error("Machine UUID may not be all zeroes.");
666 return -EINVAL;
aa96c6cb 667 }
f757855e
LP
668
669 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 670 break;
aa96c6cb 671
9444b1f2 672 case 'S':
c74e630d 673 arg_slice = optarg;
144f0fc0
LP
674 break;
675
7027ff61 676 case 'M':
c1521918 677 if (isempty(optarg))
97b11eed 678 arg_machine = mfree(arg_machine);
c1521918 679 else {
0c3c4284 680 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
681 log_error("Invalid machine name: %s", optarg);
682 return -EINVAL;
683 }
7027ff61 684
0c3c4284
LP
685 r = free_and_strdup(&arg_machine, optarg);
686 if (r < 0)
eb91eb18
LP
687 return log_oom();
688
689 break;
690 }
7027ff61 691
82adf6af
LP
692 case 'Z':
693 arg_selinux_context = optarg;
a8828ed9
DW
694 break;
695
82adf6af
LP
696 case 'L':
697 arg_selinux_apifs_context = optarg;
a8828ed9
DW
698 break;
699
bc2f673e
LP
700 case ARG_READ_ONLY:
701 arg_read_only = true;
f757855e 702 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
703 break;
704
420c7379
LP
705 case ARG_CAPABILITY:
706 case ARG_DROP_CAPABILITY: {
6cbe4ed1 707 p = optarg;
9ed794a3 708 for (;;) {
6cbe4ed1 709 _cleanup_free_ char *t = NULL;
5076f0cc 710
6cbe4ed1
SS
711 r = extract_first_word(&p, &t, ",", 0);
712 if (r < 0)
713 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 714
6cbe4ed1
SS
715 if (r == 0)
716 break;
5076f0cc 717
39ed67d1
LP
718 if (streq(t, "all")) {
719 if (c == ARG_CAPABILITY)
a42c8b54 720 plus = (uint64_t) -1;
39ed67d1 721 else
a42c8b54 722 minus = (uint64_t) -1;
39ed67d1 723 } else {
2822da4f
LP
724 int cap;
725
726 cap = capability_from_name(t);
727 if (cap < 0) {
39ed67d1
LP
728 log_error("Failed to parse capability %s.", t);
729 return -EINVAL;
730 }
731
732 if (c == ARG_CAPABILITY)
a42c8b54 733 plus |= 1ULL << (uint64_t) cap;
39ed67d1 734 else
a42c8b54 735 minus |= 1ULL << (uint64_t) cap;
5076f0cc 736 }
5076f0cc
LP
737 }
738
f757855e 739 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
740 break;
741 }
742
57fb9fb5
LP
743 case 'j':
744 arg_link_journal = LINK_GUEST;
574edc90 745 arg_link_journal_try = true;
57fb9fb5
LP
746 break;
747
748 case ARG_LINK_JOURNAL:
53e438e3 749 if (streq(optarg, "auto")) {
57fb9fb5 750 arg_link_journal = LINK_AUTO;
53e438e3
LP
751 arg_link_journal_try = false;
752 } else if (streq(optarg, "no")) {
57fb9fb5 753 arg_link_journal = LINK_NO;
53e438e3
LP
754 arg_link_journal_try = false;
755 } else if (streq(optarg, "guest")) {
57fb9fb5 756 arg_link_journal = LINK_GUEST;
53e438e3
LP
757 arg_link_journal_try = false;
758 } else if (streq(optarg, "host")) {
57fb9fb5 759 arg_link_journal = LINK_HOST;
53e438e3
LP
760 arg_link_journal_try = false;
761 } else if (streq(optarg, "try-guest")) {
574edc90
MP
762 arg_link_journal = LINK_GUEST;
763 arg_link_journal_try = true;
764 } else if (streq(optarg, "try-host")) {
765 arg_link_journal = LINK_HOST;
766 arg_link_journal_try = true;
767 } else {
57fb9fb5
LP
768 log_error("Failed to parse link journal mode %s", optarg);
769 return -EINVAL;
770 }
771
772 break;
773
17fe0523 774 case ARG_BIND:
f757855e
LP
775 case ARG_BIND_RO:
776 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
777 if (r < 0)
778 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 779
f757855e 780 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 781 break;
06c17c39 782
f757855e
LP
783 case ARG_TMPFS:
784 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
785 if (r < 0)
786 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 787
f757855e 788 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 789 break;
5a8af538
LP
790
791 case ARG_OVERLAY:
792 case ARG_OVERLAY_RO: {
793 _cleanup_free_ char *upper = NULL, *destination = NULL;
794 _cleanup_strv_free_ char **lower = NULL;
795 CustomMount *m;
796 unsigned n = 0;
797 char **i;
798
62f9f39a
RM
799 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
800 if (r == -ENOMEM)
06c17c39 801 return log_oom();
62f9f39a
RM
802 else if (r < 0) {
803 log_error("Invalid overlay specification: %s", optarg);
804 return r;
805 }
06c17c39 806
5a8af538
LP
807 STRV_FOREACH(i, lower) {
808 if (!path_is_absolute(*i)) {
809 log_error("Overlay path %s is not absolute.", *i);
810 return -EINVAL;
811 }
812
813 n++;
814 }
815
816 if (n < 2) {
817 log_error("--overlay= needs at least two colon-separated directories specified.");
818 return -EINVAL;
819 }
820
821 if (n == 2) {
822 /* If two parameters are specified,
823 * the first one is the lower, the
824 * second one the upper directory. And
af86c440
ZJS
825 * we'll also define the destination
826 * mount point the same as the upper. */
5a8af538
LP
827 upper = lower[1];
828 lower[1] = NULL;
829
830 destination = strdup(upper);
831 if (!destination)
832 return log_oom();
833
834 } else {
835 upper = lower[n - 2];
836 destination = lower[n - 1];
837 lower[n - 2] = NULL;
838 }
839
f757855e 840 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
841 if (!m)
842 return log_oom();
843
844 m->destination = destination;
845 m->source = upper;
846 m->lower = lower;
847 m->read_only = c == ARG_OVERLAY_RO;
848
849 upper = destination = NULL;
850 lower = NULL;
06c17c39 851
f757855e 852 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
853 break;
854 }
855
a5f1cb3b 856 case 'E': {
f4889f65
LP
857 char **n;
858
859 if (!env_assignment_is_valid(optarg)) {
860 log_error("Environment variable assignment '%s' is not valid.", optarg);
861 return -EINVAL;
862 }
863
864 n = strv_env_set(arg_setenv, optarg);
865 if (!n)
866 return log_oom();
867
868 strv_free(arg_setenv);
869 arg_setenv = n;
f757855e
LP
870
871 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
872 break;
873 }
874
284c0b91
LP
875 case 'q':
876 arg_quiet = true;
877 break;
878
8a96d94e 879 case ARG_SHARE_SYSTEM:
a6b5216c 880 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
881 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
882 arg_clone_ns_flags = 0;
8a96d94e
LP
883 break;
884
eb91eb18
LP
885 case ARG_REGISTER:
886 r = parse_boolean(optarg);
887 if (r < 0) {
888 log_error("Failed to parse --register= argument: %s", optarg);
889 return r;
890 }
891
892 arg_register = r;
893 break;
894
89f7c846
LP
895 case ARG_KEEP_UNIT:
896 arg_keep_unit = true;
897 break;
898
6afc95b7
LP
899 case ARG_PERSONALITY:
900
ac45f971 901 arg_personality = personality_from_string(optarg);
050f7277 902 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
903 log_error("Unknown or unsupported personality '%s'.", optarg);
904 return -EINVAL;
905 }
906
f757855e 907 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
908 break;
909
4d9f07b4
LP
910 case ARG_VOLATILE:
911
912 if (!optarg)
f757855e 913 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 914 else {
f757855e 915 VolatileMode m;
4d9f07b4 916
f757855e
LP
917 m = volatile_mode_from_string(optarg);
918 if (m < 0) {
919 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 920 return -EINVAL;
f757855e
LP
921 } else
922 arg_volatile_mode = m;
6d0b55c2
LP
923 }
924
f757855e
LP
925 arg_settings_mask |= SETTING_VOLATILE_MODE;
926 break;
6d0b55c2 927
f757855e
LP
928 case 'p':
929 r = expose_port_parse(&arg_expose_ports, optarg);
930 if (r == -EEXIST)
931 return log_error_errno(r, "Duplicate port specification: %s", optarg);
932 if (r < 0)
933 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 934
f757855e 935 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 936 break;
6d0b55c2 937
f36933fe
LP
938 case ARG_PROPERTY:
939 if (strv_extend(&arg_property, optarg) < 0)
940 return log_oom();
941
942 break;
943
ae209204
ZJS
944 case ARG_PRIVATE_USERS: {
945 int boolean = -1;
0de7acce 946
ae209204
ZJS
947 if (!optarg)
948 boolean = true;
949 else if (!in_charset(optarg, DIGITS))
950 /* do *not* parse numbers as booleans */
951 boolean = parse_boolean(optarg);
952
953 if (boolean == false) {
0de7acce
LP
954 /* no: User namespacing off */
955 arg_userns_mode = USER_NAMESPACE_NO;
956 arg_uid_shift = UID_INVALID;
957 arg_uid_range = UINT32_C(0x10000);
ae209204 958 } else if (boolean == true) {
0de7acce
LP
959 /* yes: User namespacing on, UID range is read from root dir */
960 arg_userns_mode = USER_NAMESPACE_FIXED;
961 arg_uid_shift = UID_INVALID;
962 arg_uid_range = UINT32_C(0x10000);
963 } else if (streq(optarg, "pick")) {
964 /* pick: User namespacing on, UID range is picked randomly */
965 arg_userns_mode = USER_NAMESPACE_PICK;
966 arg_uid_shift = UID_INVALID;
967 arg_uid_range = UINT32_C(0x10000);
968 } else {
6c2058b3 969 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
970 const char *range, *shift;
971
0de7acce
LP
972 /* anything else: User namespacing on, UID range is explicitly configured */
973
6dac160c
LP
974 range = strchr(optarg, ':');
975 if (range) {
6c2058b3
ZJS
976 buffer = strndup(optarg, range - optarg);
977 if (!buffer)
978 return log_oom();
979 shift = buffer;
6dac160c
LP
980
981 range++;
bfd292ec
ZJS
982 r = safe_atou32(range, &arg_uid_range);
983 if (r < 0)
be715731 984 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
985 } else
986 shift = optarg;
987
be715731
ZJS
988 r = parse_uid(shift, &arg_uid_shift);
989 if (r < 0)
990 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
991
992 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
993 }
994
be715731
ZJS
995 if (arg_uid_range <= 0) {
996 log_error("UID range cannot be 0.");
997 return -EINVAL;
998 }
999
0de7acce 1000 arg_settings_mask |= SETTING_USERNS;
6dac160c 1001 break;
ae209204 1002 }
6dac160c 1003
0de7acce 1004 case 'U':
ccabee0d
LP
1005 if (userns_supported()) {
1006 arg_userns_mode = USER_NAMESPACE_PICK;
1007 arg_uid_shift = UID_INVALID;
1008 arg_uid_range = UINT32_C(0x10000);
1009
1010 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1011 }
1012
7336138e
LP
1013 break;
1014
0de7acce 1015 case ARG_PRIVATE_USERS_CHOWN:
19aac838 1016 arg_userns_chown = true;
0de7acce
LP
1017
1018 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
1019 break;
1020
c6c8f6e2
LP
1021 case ARG_KILL_SIGNAL:
1022 arg_kill_signal = signal_from_string_try_harder(optarg);
1023 if (arg_kill_signal < 0) {
1024 log_error("Cannot parse signal: %s", optarg);
1025 return -EINVAL;
1026 }
1027
f757855e
LP
1028 arg_settings_mask |= SETTING_KILL_SIGNAL;
1029 break;
1030
1031 case ARG_SETTINGS:
1032
1033 /* no → do not read files
1034 * yes → read files, do not override cmdline, trust only subset
1035 * override → read files, override cmdline, trust only subset
1036 * trusted → read files, do not override cmdline, trust all
1037 */
1038
1039 r = parse_boolean(optarg);
1040 if (r < 0) {
1041 if (streq(optarg, "trusted")) {
1042 mask_all_settings = false;
1043 mask_no_settings = false;
1044 arg_settings_trusted = true;
1045
1046 } else if (streq(optarg, "override")) {
1047 mask_all_settings = false;
1048 mask_no_settings = true;
1049 arg_settings_trusted = -1;
1050 } else
1051 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1052 } else if (r > 0) {
1053 /* yes */
1054 mask_all_settings = false;
1055 mask_no_settings = false;
1056 arg_settings_trusted = -1;
1057 } else {
1058 /* no */
1059 mask_all_settings = true;
1060 mask_no_settings = false;
1061 arg_settings_trusted = false;
1062 }
1063
c6c8f6e2
LP
1064 break;
1065
5f932eb9
LP
1066 case ARG_CHDIR:
1067 if (!path_is_absolute(optarg)) {
1068 log_error("Working directory %s is not an absolute path.", optarg);
1069 return -EINVAL;
1070 }
1071
1072 r = free_and_strdup(&arg_chdir, optarg);
1073 if (r < 0)
1074 return log_oom();
1075
1076 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1077 break;
1078
9c1e04d0
AP
1079 case ARG_NOTIFY_READY:
1080 r = parse_boolean(optarg);
1081 if (r < 0) {
1082 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1083 return -EINVAL;
1084 }
1085 arg_notify_ready = r;
1086 arg_settings_mask |= SETTING_NOTIFY_READY;
1087 break;
1088
88213476
LP
1089 case '?':
1090 return -EINVAL;
1091
1092 default:
eb9da376 1093 assert_not_reached("Unhandled option");
88213476 1094 }
88213476 1095
0c582db0
LB
1096 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1097 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1098 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1099 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1100
4f086aab
SU
1101 if (arg_userns_mode != USER_NAMESPACE_NO)
1102 arg_mount_settings |= MOUNT_USE_USERNS;
1103
1104 if (arg_private_network)
1105 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1106
1107 parse_mount_settings_env();
1108
48a8d337
LB
1109 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1110 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1111 arg_register = false;
0c582db0
LB
1112 if (arg_start_mode != START_PID1) {
1113 log_error("--boot cannot be used without namespacing.");
1114 return -EINVAL;
1115 }
1116 }
eb91eb18 1117
0de7acce 1118 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1119 arg_userns_chown = true;
1120
89f7c846
LP
1121 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1122 log_error("--keep-unit may not be used when invoked from a user session.");
1123 return -EINVAL;
1124 }
1125
1b9e5b12
LP
1126 if (arg_directory && arg_image) {
1127 log_error("--directory= and --image= may not be combined.");
1128 return -EINVAL;
1129 }
1130
ec16945e
LP
1131 if (arg_template && arg_image) {
1132 log_error("--template= and --image= may not be combined.");
1133 return -EINVAL;
1134 }
1135
1136 if (arg_template && !(arg_directory || arg_machine)) {
1137 log_error("--template= needs --directory= or --machine=.");
1138 return -EINVAL;
1139 }
1140
1141 if (arg_ephemeral && arg_template) {
1142 log_error("--ephemeral and --template= may not be combined.");
1143 return -EINVAL;
1144 }
1145
df9a75e4
LP
1146 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1147 log_error("--ephemeral and --link-journal= may not be combined.");
1148 return -EINVAL;
1149 }
1150
ccabee0d 1151 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1152 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1153 return -EOPNOTSUPP;
1154 }
1155
1156 if (arg_userns_chown && arg_read_only) {
1157 log_error("--read-only and --private-users-chown may not be combined.");
1158 return -EINVAL;
1159 }
f757855e 1160
22b28dfd
LP
1161 if (arg_network_bridge && arg_network_zone) {
1162 log_error("--network-bridge= and --network-zone= may not be combined.");
1163 return -EINVAL;
1164 }
1165
f757855e
LP
1166 if (argc > optind) {
1167 arg_parameters = strv_copy(argv + optind);
1168 if (!arg_parameters)
1169 return log_oom();
1170
7732f92b 1171 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1172 }
1173
1174 /* Load all settings from .nspawn files */
1175 if (mask_no_settings)
1176 arg_settings_mask = 0;
1177
1178 /* Don't load any settings from .nspawn files */
1179 if (mask_all_settings)
1180 arg_settings_mask = _SETTINGS_MASK_ALL;
1181
520e0d54 1182 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1183
6aadfa4c
ILG
1184 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1185 if (e)
1186 arg_container_service_name = e;
1187
5a8ff0e6
CB
1188 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1189 if (r < 0)
1190 arg_use_cgns = cg_ns_supported();
1191 else
1192 arg_use_cgns = r;
1193
f757855e
LP
1194 return 1;
1195}
1196
1197static int verify_arguments(void) {
4f086aab
SU
1198 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1199 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1200 return -EINVAL;
1201 }
1202
1203 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1204 log_error("Cannot combine --private-users with read-write mounts.");
1205 return -EINVAL;
1206 }
f757855e
LP
1207
1208 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1209 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1210 return -EINVAL;
1211 }
1212
6d0b55c2
LP
1213 if (arg_expose_ports && !arg_private_network) {
1214 log_error("Cannot use --port= without private networking.");
1215 return -EINVAL;
1216 }
1217
1c1ea217
EV
1218#ifndef HAVE_LIBIPTC
1219 if (arg_expose_ports) {
1220 log_error("--port= is not supported, compiled without libiptc support.");
1221 return -EOPNOTSUPP;
1222 }
1223#endif
1224
7732f92b 1225 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1226 arg_kill_signal = SIGRTMIN+3;
1227
f757855e 1228 return 0;
88213476
LP
1229}
1230
03cfe0d5
LP
1231static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1232 assert(p);
1233
0de7acce 1234 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1235 return 0;
1236
1237 if (uid == UID_INVALID && gid == GID_INVALID)
1238 return 0;
1239
1240 if (uid != UID_INVALID) {
1241 uid += arg_uid_shift;
1242
1243 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1244 return -EOVERFLOW;
1245 }
1246
1247 if (gid != GID_INVALID) {
1248 gid += (gid_t) arg_uid_shift;
1249
1250 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1251 return -EOVERFLOW;
1252 }
1253
1254 if (lchown(p, uid, gid) < 0)
1255 return -errno;
b12afc8c
LP
1256
1257 return 0;
1258}
1259
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands it
 * to userns_lchown() with the given uid/gid. An already existing directory is left untouched and
 * counts as success. Returns -errno if mkdir() fails for any other reason. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1272
e58a1277 1273static int setup_timezone(const char *dest) {
03cfe0d5
LP
1274 _cleanup_free_ char *p = NULL, *q = NULL;
1275 const char *where, *check, *what;
d4036145
LP
1276 char *z, *y;
1277 int r;
f8440af5 1278
e58a1277
LP
1279 assert(dest);
1280
1281 /* Fix the timezone, if possible */
d4036145
LP
1282 r = readlink_malloc("/etc/localtime", &p);
1283 if (r < 0) {
0b493a02
MP
1284 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1285 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1286 * with a symbolic link to a time zone data file.
0b493a02
MP
1287 *
1288 * Example:
21dc0227 1289 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1290 */
d4036145
LP
1291 return 0;
1292 }
1293
1294 z = path_startswith(p, "../usr/share/zoneinfo/");
1295 if (!z)
1296 z = path_startswith(p, "/usr/share/zoneinfo/");
1297 if (!z) {
1298 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1299 return 0;
1300 }
1301
03cfe0d5 1302 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1303 r = readlink_malloc(where, &q);
1304 if (r >= 0) {
1305 y = path_startswith(q, "../usr/share/zoneinfo/");
1306 if (!y)
1307 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1308
d4036145
LP
1309 /* Already pointing to the right place? Then do nothing .. */
1310 if (y && streq(y, z))
1311 return 0;
1312 }
1313
03cfe0d5 1314 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1315 check = prefix_roota(dest, check);
03cfe0d5 1316 if (laccess(check, F_OK) < 0) {
d4036145
LP
1317 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1318 return 0;
1319 }
68fb0892 1320
79d80fc1
TG
1321 r = unlink(where);
1322 if (r < 0 && errno != ENOENT) {
56f64d95 1323 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1324 return 0;
1325 }
4d9f07b4 1326
03cfe0d5 1327 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1328 if (symlink(what, where) < 0) {
56f64d95 1329 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1330 return 0;
1331 }
e58a1277 1332
03cfe0d5
LP
1333 r = userns_lchown(where, 0, 0);
1334 if (r < 0)
1335 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1336
e58a1277 1337 return 0;
88213476
LP
1338}
1339
2547bb41 1340static int setup_resolv_conf(const char *dest) {
03cfe0d5 1341 const char *where = NULL;
79d80fc1 1342 int r;
2547bb41
LP
1343
1344 assert(dest);
1345
1346 if (arg_private_network)
1347 return 0;
1348
1349 /* Fix resolv.conf, if possible */
03cfe0d5 1350 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1351
7debb05d
CH
1352 if (access("/run/systemd/resolve/resolv.conf", F_OK) >= 0 &&
1353 access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
3539724c
LP
1354 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1355 * container, so that the container can use the host's resolver. Given that network namespacing is
1356 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1357 * advantage that the container will be able to follow the host's DNS server configuration changes
1358 * transparently. */
1359
60e76d48
ZJS
1360 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1361 if (r >= 0)
1362 return mount_verbose(LOG_ERR, NULL, where, NULL,
1363 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1364 }
1365
1366 /* If that didn't work, let's copy the file */
f2068bcc 1367 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1368 if (r < 0) {
3539724c
LP
1369 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1370 * resolved or something similar runs inside and the symlink points there.
68a313c5 1371 *
3539724c 1372 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1373 */
1374 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1375 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1376 return 0;
1377 }
2547bb41 1378
03cfe0d5
LP
1379 r = userns_lchown(where, 0, 0);
1380 if (r < 0)
3539724c 1381 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1382
2547bb41
LP
1383 return 0;
1384}
1385
04bc4a3f 1386static int setup_boot_id(const char *dest) {
3bbaff3e 1387 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1388 const char *from, *to;
04bc4a3f
LP
1389 int r;
1390
04bc4a3f
LP
1391 /* Generate a new randomized boot ID, so that each boot-up of
1392 * the container gets a new one */
1393
03cfe0d5
LP
1394 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1395 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1396
1397 r = sd_id128_randomize(&rnd);
f647962d
MS
1398 if (r < 0)
1399 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1400
15b1248a 1401 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1402 if (r < 0)
1403 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1404
60e76d48
ZJS
1405 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1406 if (r >= 0)
1407 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1408 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1409
3bbaff3e 1410 (void) unlink(from);
04bc4a3f
LP
1411 return r;
1412}
1413
e58a1277 1414static int copy_devnodes(const char *dest) {
88213476
LP
1415
1416 static const char devnodes[] =
1417 "null\0"
1418 "zero\0"
1419 "full\0"
1420 "random\0"
1421 "urandom\0"
85614d66
TG
1422 "tty\0"
1423 "net/tun\0";
88213476
LP
1424
1425 const char *d;
e58a1277 1426 int r = 0;
7fd1b19b 1427 _cleanup_umask_ mode_t u;
a258bf26
LP
1428
1429 assert(dest);
124640f1
LP
1430
1431 u = umask(0000);
88213476 1432
03cfe0d5
LP
1433 /* Create /dev/net, so that we can create /dev/net/tun in it */
1434 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1435 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1436
88213476 1437 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1438 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1439 struct stat st;
88213476 1440
7f112f50 1441 from = strappend("/dev/", d);
03cfe0d5 1442 to = prefix_root(dest, from);
88213476
LP
1443
1444 if (stat(from, &st) < 0) {
1445
4a62c710
MS
1446 if (errno != ENOENT)
1447 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1448
a258bf26 1449 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1450
03cfe0d5 1451 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1452 return -EIO;
a258bf26 1453
85614d66 1454 } else {
81f5049b 1455 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1456 /*
1457 * This is some sort of protection too against
1458 * recursive userns chown on shared /dev/
1459 */
1460 if (errno == EEXIST)
1461 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1462 if (errno != EPERM)
1463 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1464
1465 /* Some systems abusively restrict mknod but
1466 * allow bind mounts. */
1467 r = touch(to);
1468 if (r < 0)
1469 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1470 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1471 if (r < 0)
1472 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1473 }
6278cf60 1474
03cfe0d5
LP
1475 r = userns_lchown(to, 0, 0);
1476 if (r < 0)
1477 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1478 }
88213476
LP
1479 }
1480
e58a1277
LP
1481 return r;
1482}
88213476 1483
03cfe0d5
LP
1484static int setup_pts(const char *dest) {
1485 _cleanup_free_ char *options = NULL;
1486 const char *p;
709f6e46 1487 int r;
03cfe0d5
LP
1488
1489#ifdef HAVE_SELINUX
1490 if (arg_selinux_apifs_context)
1491 (void) asprintf(&options,
3dce8915 1492 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1493 arg_uid_shift + TTY_GID,
1494 arg_selinux_apifs_context);
1495 else
1496#endif
1497 (void) asprintf(&options,
3dce8915 1498 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1499 arg_uid_shift + TTY_GID);
f2d88580 1500
03cfe0d5 1501 if (!options)
f2d88580
LP
1502 return log_oom();
1503
03cfe0d5 1504 /* Mount /dev/pts itself */
cc9fce65 1505 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1506 if (mkdir(p, 0755) < 0)
1507 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1508 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1509 if (r < 0)
1510 return r;
709f6e46
MS
1511 r = userns_lchown(p, 0, 0);
1512 if (r < 0)
1513 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1514
1515 /* Create /dev/ptmx symlink */
1516 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1517 if (symlink("pts/ptmx", p) < 0)
1518 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1519 r = userns_lchown(p, 0, 0);
1520 if (r < 0)
1521 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1522
03cfe0d5
LP
1523 /* And fix /dev/pts/ptmx ownership */
1524 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1525 r = userns_lchown(p, 0, 0);
1526 if (r < 0)
1527 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1528
f2d88580
LP
1529 return 0;
1530}
1531
e58a1277 1532static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1533 _cleanup_umask_ mode_t u;
1534 const char *to;
e58a1277 1535 int r;
e58a1277
LP
1536
1537 assert(dest);
1538 assert(console);
1539
1540 u = umask(0000);
1541
03cfe0d5 1542 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1543 if (r < 0)
1544 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1545
a258bf26
LP
1546 /* We need to bind mount the right tty to /dev/console since
1547 * ptys can only exist on pts file systems. To have something
81f5049b 1548 * to bind mount things on we create a empty regular file. */
a258bf26 1549
03cfe0d5 1550 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1551 r = touch(to);
1552 if (r < 0)
1553 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1554
60e76d48 1555 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1556}
1557
1558static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1559 const char *from, *to;
7fd1b19b 1560 _cleanup_umask_ mode_t u;
d9603714 1561 int fd, r;
e58a1277 1562
e58a1277 1563 assert(kmsg_socket >= 0);
a258bf26 1564
e58a1277 1565 u = umask(0000);
a258bf26 1566
03cfe0d5 1567 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1568 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1569 * on the reading side behave very similar to /proc/kmsg,
1570 * their writing side behaves differently from /dev/kmsg in
1571 * that writing blocks when nothing is reading. In order to
1572 * avoid any problems with containers deadlocking due to this
1573 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1574 from = prefix_roota(dest, "/run/kmsg");
1575 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1576
4a62c710 1577 if (mkfifo(from, 0600) < 0)
03cfe0d5 1578 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1579 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1580 if (r < 0)
1581 return r;
e58a1277
LP
1582
1583 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1584 if (fd < 0)
1585 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1586
e58a1277
LP
1587 /* Store away the fd in the socket, so that it stays open as
1588 * long as we run the child */
3ee897d6 1589 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1590 safe_close(fd);
e58a1277 1591
d9603714
DH
1592 if (r < 0)
1593 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1594
03cfe0d5
LP
1595 /* And now make the FIFO unavailable as /run/kmsg... */
1596 (void) unlink(from);
1597
25ea79fe 1598 return 0;
88213476
LP
1599}
1600
1c4baffc 1601static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1602 union in_addr_union *exposed = userdata;
1603
1604 assert(rtnl);
1605 assert(m);
1606 assert(exposed);
1607
7a8f6325 1608 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1609 return 0;
1610}
1611
3a74cea5 1612static int setup_hostname(void) {
3a74cea5 1613
0c582db0 1614 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1615 return 0;
1616
605f81a8 1617 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1618 return -errno;
3a74cea5 1619
7027ff61 1620 return 0;
3a74cea5
LP
1621}
1622
57fb9fb5 1623static int setup_journal(const char *directory) {
e01ff70a 1624 sd_id128_t this_id;
0f5e1382 1625 _cleanup_free_ char *d = NULL;
e01ff70a 1626 const char *p, *q;
8054d749 1627 bool try;
e01ff70a 1628 char id[33];
57fb9fb5
LP
1629 int r;
1630
df9a75e4
LP
1631 /* Don't link journals in ephemeral mode */
1632 if (arg_ephemeral)
1633 return 0;
1634
8054d749
LP
1635 if (arg_link_journal == LINK_NO)
1636 return 0;
1637
1638 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1639
4d680aee 1640 r = sd_id128_get_machine(&this_id);
f647962d
MS
1641 if (r < 0)
1642 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1643
e01ff70a 1644 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1645 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1646 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1647 if (try)
4d680aee 1648 return 0;
df9a75e4 1649 return -EEXIST;
4d680aee
ZJS
1650 }
1651
03cfe0d5
LP
1652 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1653 if (r < 0)
1654 return log_error_errno(r, "Failed to create /var: %m");
1655
1656 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1657 if (r < 0)
1658 return log_error_errno(r, "Failed to create /var/log: %m");
1659
1660 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1661 if (r < 0)
1662 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1663
e01ff70a
MS
1664 (void) sd_id128_to_string(arg_uuid, id);
1665
03cfe0d5
LP
1666 p = strjoina("/var/log/journal/", id);
1667 q = prefix_roota(directory, p);
27407a01 1668
e26d6ce5 1669 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1670 if (try)
1671 return 0;
27407a01 1672
8054d749
LP
1673 log_error("%s: already a mount point, refusing to use for journal", p);
1674 return -EEXIST;
57fb9fb5
LP
1675 }
1676
e26d6ce5 1677 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1678 if (try)
1679 return 0;
57fb9fb5 1680
8054d749
LP
1681 log_error("%s: already a mount point, refusing to use for journal", q);
1682 return -EEXIST;
57fb9fb5
LP
1683 }
1684
1685 r = readlink_and_make_absolute(p, &d);
1686 if (r >= 0) {
1687 if ((arg_link_journal == LINK_GUEST ||
1688 arg_link_journal == LINK_AUTO) &&
1689 path_equal(d, q)) {
1690
03cfe0d5 1691 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1692 if (r < 0)
709f6e46 1693 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1694 return 0;
57fb9fb5
LP
1695 }
1696
4a62c710
MS
1697 if (unlink(p) < 0)
1698 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1699 } else if (r == -EINVAL) {
1700
1701 if (arg_link_journal == LINK_GUEST &&
1702 rmdir(p) < 0) {
1703
27407a01
ZJS
1704 if (errno == ENOTDIR) {
1705 log_error("%s already exists and is neither a symlink nor a directory", p);
1706 return r;
4314d33f
MS
1707 } else
1708 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1709 }
4314d33f
MS
1710 } else if (r != -ENOENT)
1711 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1712
1713 if (arg_link_journal == LINK_GUEST) {
1714
1715 if (symlink(q, p) < 0) {
8054d749 1716 if (try) {
56f64d95 1717 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1718 return 0;
4314d33f
MS
1719 } else
1720 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1721 }
1722
03cfe0d5 1723 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1724 if (r < 0)
709f6e46 1725 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1726 return 0;
57fb9fb5
LP
1727 }
1728
1729 if (arg_link_journal == LINK_HOST) {
ccddd104 1730 /* don't create parents here — if the host doesn't have
574edc90 1731 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1732
1733 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1734 if (try) {
56f64d95 1735 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1736 return 0;
4314d33f
MS
1737 } else
1738 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1739 }
1740
27407a01
ZJS
1741 } else if (access(p, F_OK) < 0)
1742 return 0;
57fb9fb5 1743
cdb2b9d0
LP
1744 if (dir_is_empty(q) == 0)
1745 log_warning("%s is not empty, proceeding anyway.", q);
1746
03cfe0d5 1747 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1748 if (r < 0)
1749 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1750
60e76d48
ZJS
1751 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1752 if (r < 0)
4a62c710 1753 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1754
27407a01 1755 return 0;
57fb9fb5
LP
1756}
1757
88213476 1758static int drop_capabilities(void) {
520e0d54 1759 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1760}
1761
db999e0f
LP
1762static int reset_audit_loginuid(void) {
1763 _cleanup_free_ char *p = NULL;
1764 int r;
1765
0c582db0 1766 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1767 return 0;
1768
1769 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1770 if (r == -ENOENT)
db999e0f 1771 return 0;
f647962d
MS
1772 if (r < 0)
1773 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1774
1775 /* Already reset? */
1776 if (streq(p, "4294967295"))
1777 return 0;
1778
ad118bda 1779 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1780 if (r < 0) {
10a87006
LP
1781 log_error_errno(r,
1782 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1783 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1784 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1785 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1786 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1787
db999e0f 1788 sleep(5);
77b6e194 1789 }
db999e0f
LP
1790
1791 return 0;
77b6e194
LP
1792}
1793
24fb1112 1794
785890ac
LP
1795static int setup_propagate(const char *root) {
1796 const char *p, *q;
709f6e46 1797 int r;
785890ac
LP
1798
1799 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1800 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1801 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1802 (void) mkdir_p(p, 0600);
1803
709f6e46
MS
1804 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1805 if (r < 0)
1806 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1807
709f6e46
MS
1808 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1809 if (r < 0)
1810 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1811
709f6e46
MS
1812 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1813 if (r < 0)
1814 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1815
03cfe0d5 1816 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1817 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1818 if (r < 0)
1819 return r;
785890ac 1820
60e76d48
ZJS
1821 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1822 if (r < 0)
1823 return r;
785890ac 1824
19caffac
AC
1825 /* machined will MS_MOVE into that directory, and that's only
1826 * supported for non-shared mounts. */
60e76d48 1827 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1828}
1829
1b9e5b12
LP
1830static int setup_image(char **device_path, int *loop_nr) {
1831 struct loop_info64 info = {
1832 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1833 };
1834 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1835 _cleanup_free_ char* loopdev = NULL;
1836 struct stat st;
1837 int r, nr;
1838
1839 assert(device_path);
1840 assert(loop_nr);
ec16945e 1841 assert(arg_image);
1b9e5b12
LP
1842
1843 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1844 if (fd < 0)
1845 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1846
4a62c710
MS
1847 if (fstat(fd, &st) < 0)
1848 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1849
1850 if (S_ISBLK(st.st_mode)) {
1851 char *p;
1852
1853 p = strdup(arg_image);
1854 if (!p)
1855 return log_oom();
1856
1857 *device_path = p;
1858
1859 *loop_nr = -1;
1860
1861 r = fd;
1862 fd = -1;
1863
1864 return r;
1865 }
1866
1867 if (!S_ISREG(st.st_mode)) {
070edd97 1868 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1869 return -EINVAL;
1870 }
1871
1872 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1873 if (control < 0)
1874 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1875
1876 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1877 if (nr < 0)
1878 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1879
1880 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1881 return log_oom();
1882
1883 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1884 if (loop < 0)
1885 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1886
4a62c710
MS
1887 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1888 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1889
1890 if (arg_read_only)
1891 info.lo_flags |= LO_FLAGS_READ_ONLY;
1892
4a62c710
MS
1893 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1894 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1895
1896 *device_path = loopdev;
1897 loopdev = NULL;
1898
1899 *loop_nr = nr;
1900
1901 r = loop;
1902 loop = -1;
1903
1904 return r;
1905}
1906
ada4799a
LP
1907#define PARTITION_TABLE_BLURB \
1908 "Note that the disk image needs to either contain only a single MBR partition of\n" \
4aab5d0c 1909 "type 0x83 that is marked bootable, or a single GPT partition of type " \
f6c51a81 1910 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
ada4799a
LP
1911 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1912 "to be bootable with systemd-nspawn."
1913
1b9e5b12
LP
/* Probes the partition table of the block device referred to by fd and picks
 * the partitions to mount.
 *
 * On GPT, partitions are classified by type UUID (home, srv, ESP, native root,
 * secondary root, generic Linux); partitions flagged GPT_FLAG_NO_AUTO are
 * ignored, and among several partitions of the same type the one with the
 * lowest partition number wins. On MBR, only partitions of type 0x83 whose
 * flags equal 0x80 (bootable) are considered, and they are treated as generic
 * Linux partitions.
 *
 * The root device is chosen in this order: native root, secondary root, the
 * single generic partition. More than one generic partition is an error if no
 * dedicated root partition exists.
 *
 * On success returns 0, *root_device, *root_device_rw and *secondary are always
 * set, and the home/srv/ESP outputs are only written if such a partition was
 * found. Returned strings are owned by the caller. On failure a negative
 * errno-style value is returned. Without blkid support -EOPNOTSUPP is
 * returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                char **esp_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(esp_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Use the errno-aware logger so that the value we return cannot
                 * be clobbered by the logging call itself. */
                return log_error_errno(errno, "Failed to list partitions of %s", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches what blkid
         * found, retrying the enumeration a bounded number of times. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * enumeration is expected to contain one more entry than
                 * blkid's list; the second loop below skips the entry whose
                 * devnum equals st.st_rdev, i.e. the whole device. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees garbage. */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number. */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        } else if (sd_id128_equal(type_id, GPT_ESP)) {

                                if (esp && nr >= esp_nr)
                                        continue;

                                esp_nr = nr;

                                r = free_and_strdup(&esp, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record this as the generic candidate (not as
                                 * "root"), so that a second bootable Linux
                                 * partition is detected as ambiguous below
                                 * instead of silently replacing the first. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        if (esp) {
                *esp_device = esp;
                esp = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2300
/* Mounts the block device "what" below "where" (at where + directory if
 * directory is non-NULL), after probing its file system type with blkid.
 *
 * The mount is read-only if rw is false or if arg_read_only is set. LUKS
 * volumes are refused. For btrfs/ext4/vfat/xfs on a /dev/loop* device the
 * "discard" mount option is passed.
 *
 * Returns the result of mount_verbose() on success, or a negative errno-style
 * value on failure. Without blkid support -EOPNOTSUPP is returned. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        /* "options" must start out as NULL: it is only assigned for loop
         * devices carrying one of the file systems listed below, but is
         * passed to mount_verbose() unconditionally. */
        const char *fstype = NULL, *p, *options = NULL;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* 1 means nothing was detected, -2 means the result was ambivalent;
         * both mean we cannot tell the file system type. Any other non-zero
         * value is a genuine probing error. This matches the check done when
         * probing the partition table in dissect_image(). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                /* Use the errno-aware logger so that the value we return cannot
                 * be clobbered by the logging call itself. */
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
         * sparse when possible. */
        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
                const char *l;

                l = path_startswith(what, "/dev");
                if (l && startswith(l, "loop"))
                        options = "discard";
        }

        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2369
317feb4d 2370static int setup_machine_id(const char *directory) {
691675ba
LP
2371 const char *etc_machine_id;
2372 sd_id128_t id;
3bbaff3e 2373 int r;
e01ff70a 2374
317feb4d
LP
2375 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2376 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2377 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2378 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2379 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2380 * container behaves nicely). */
2381
e01ff70a
MS
2382 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2383
691675ba 2384 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2385 if (r < 0) {
2386 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2387 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2388
317feb4d
LP
2389 if (sd_id128_is_null(arg_uuid)) {
2390 r = sd_id128_randomize(&arg_uuid);
2391 if (r < 0)
2392 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2393 }
2394 } else {
2395 if (sd_id128_is_null(id)) {
2396 log_error("Machine ID in container image is zero, refusing.");
2397 return -EINVAL;
2398 }
e01ff70a 2399
317feb4d
LP
2400 arg_uuid = id;
2401 }
691675ba 2402
e01ff70a
MS
2403 return 0;
2404}
2405
7336138e
LP
2406static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2407 int r;
2408
2409 assert(directory);
2410
0de7acce 2411 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2412 return 0;
2413
2414 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2415 if (r == -EOPNOTSUPP)
2416 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2417 if (r == -EBADE)
2418 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2419 if (r < 0)
2420 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2421 if (r == 0)
2422 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2423 else
2424 log_debug("Patched directory tree to match UID/GID range.");
2425
2426 return r;
2427}
2428
727fd4fd
LP
2429static int mount_devices(
2430 const char *where,
2431 const char *root_device, bool root_device_rw,
2432 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2433 const char *srv_device, bool srv_device_rw,
2434 const char *esp_device) {
1b9e5b12
LP
2435 int r;
2436
2437 assert(where);
2438
2439 if (root_device) {
727fd4fd 2440 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2441 if (r < 0)
2442 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2443 }
2444
2445 if (home_device) {
727fd4fd 2446 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2449 }
2450
2451 if (srv_device) {
727fd4fd 2452 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2453 if (r < 0)
2454 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2455 }
2456
a6bc7db9
LP
2457 if (esp_device) {
2458 const char *mp, *x;
2459
2460 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2461
2462 mp = "/efi";
2463 x = strjoina(arg_directory, mp);
2464 r = dir_is_empty(x);
2465 if (r == -ENOENT) {
2466 mp = "/boot";
2467 x = strjoina(arg_directory, mp);
2468 r = dir_is_empty(x);
2469 }
2470
2471 if (r > 0) {
2472 r = mount_device(esp_device, arg_directory, mp, true);
2473 if (r < 0)
2474 return log_error_errno(r, "Failed to mount ESP: %m");
2475 }
2476 }
2477
1b9e5b12
LP
2478 return 0;
2479}
2480
2481static void loop_remove(int nr, int *image_fd) {
2482 _cleanup_close_ int control = -1;
e8c8ddcc 2483 int r;
1b9e5b12
LP
2484
2485 if (nr < 0)
2486 return;
2487
2488 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2489 r = ioctl(*image_fd, LOOP_CLR_FD);
2490 if (r < 0)
5e4074aa 2491 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2492 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2493 }
2494
2495 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2496 if (control < 0) {
56f64d95 2497 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2498 return;
e8c8ddcc 2499 }
1b9e5b12 2500
e8c8ddcc
TG
2501 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2502 if (r < 0)
5e4074aa 2503 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2504}
2505
113cea80 2506/*
6d416b9c
LS
2507 * Return values:
2508 * < 0 : wait_for_terminate() failed to get the state of the
2509 * container, the container was terminated by a signal, or
2510 * failed for an unknown reason. No change is made to the
2511 * container argument.
2512 * > 0 : The program executed in the container terminated with an
2513 * error. The exit code of the program executed in the
919699ec
LP
2514 * container is returned. The container argument has been set
2515 * to CONTAINER_TERMINATED.
6d416b9c
LS
2516 * 0 : The container is being rebooted, has been shut down or exited
2517 * successfully. The container argument has been set to either
2518 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2519 *
6d416b9c
LS
2520 * That is, success is indicated by a return value of zero, and an
2521 * error is indicated by a non-zero value.
113cea80
DH
2522 */
2523static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2524 siginfo_t status;
919699ec 2525 int r;
113cea80
DH
2526
2527 r = wait_for_terminate(pid, &status);
f647962d
MS
2528 if (r < 0)
2529 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2530
2531 switch (status.si_code) {
fddbb89c 2532
113cea80 2533 case CLD_EXITED:
b5a2179b 2534 if (status.si_status == 0)
919699ec 2535 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2536 else
919699ec 2537 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2538
919699ec
LP
2539 *container = CONTAINER_TERMINATED;
2540 return status.si_status;
113cea80
DH
2541
2542 case CLD_KILLED:
2543 if (status.si_status == SIGINT) {
919699ec 2544 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2545 *container = CONTAINER_TERMINATED;
919699ec
LP
2546 return 0;
2547
113cea80 2548 } else if (status.si_status == SIGHUP) {
919699ec 2549 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2550 *container = CONTAINER_REBOOTED;
919699ec 2551 return 0;
113cea80 2552 }
919699ec 2553
113cea80
DH
2554 /* CLD_KILLED fallthrough */
2555
2556 case CLD_DUMPED:
fddbb89c 2557 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2558 return -EIO;
113cea80
DH
2559
2560 default:
fddbb89c 2561 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2562 return -EIO;
113cea80 2563 }
113cea80
DH
2564}
2565
023fb90b
LP
2566static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2567 pid_t pid;
2568
4a0b58c4 2569 pid = PTR_TO_PID(userdata);
023fb90b 2570 if (pid > 0) {
c6c8f6e2 2571 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2572 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2573 sd_event_source_set_userdata(s, NULL);
2574 return 0;
2575 }
2576 }
2577
2578 sd_event_exit(sd_event_source_get_event(s), 0);
2579 return 0;
2580}
2581
ec16945e 2582static int determine_names(void) {
1b9cebf6 2583 int r;
ec16945e 2584
c1521918
LP
2585 if (arg_template && !arg_directory && arg_machine) {
2586
2587 /* If --template= was specified then we should not
2588 * search for a machine, but instead create a new one
2589 * in /var/lib/machine. */
2590
605405c6 2591 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2592 if (!arg_directory)
2593 return log_oom();
2594 }
2595
ec16945e 2596 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2597 if (arg_machine) {
2598 _cleanup_(image_unrefp) Image *i = NULL;
2599
2600 r = image_find(arg_machine, &i);
2601 if (r < 0)
2602 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2603 if (r == 0) {
1b9cebf6
LP
2604 log_error("No image for machine '%s': %m", arg_machine);
2605 return -ENOENT;
2606 }
2607
aceac2f0 2608 if (i->type == IMAGE_RAW)
0f03c2a4 2609 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2610 else
0f03c2a4 2611 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2612 if (r < 0)
0f3be6ca 2613 return log_oom();
1b9cebf6 2614
aee327b8
LP
2615 if (!arg_ephemeral)
2616 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2617 } else
ec16945e
LP
2618 arg_directory = get_current_dir_name();
2619
0f3be6ca 2620 if (!arg_directory && !arg_image) {
1b9cebf6 2621 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2622 return -EINVAL;
2623 }
2624 }
2625
2626 if (!arg_machine) {
b9ba4dab
LP
2627 if (arg_directory && path_equal(arg_directory, "/"))
2628 arg_machine = gethostname_malloc();
2629 else
2630 arg_machine = strdup(basename(arg_image ?: arg_directory));
ec16945e
LP
2631 if (!arg_machine)
2632 return log_oom();
2633
ae691c1d 2634 hostname_cleanup(arg_machine);
ec16945e
LP
2635 if (!machine_name_is_valid(arg_machine)) {
2636 log_error("Failed to determine machine name automatically, please use -M.");
2637 return -EINVAL;
2638 }
b9ba4dab
LP
2639
2640 if (arg_ephemeral) {
2641 char *b;
2642
2643 /* Add a random suffix when this is an
2644 * ephemeral machine, so that we can run many
2645 * instances at once without manually having
2646 * to specify -M each time. */
2647
2648 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2649 return log_oom();
2650
2651 free(arg_machine);
2652 arg_machine = b;
2653 }
ec16945e
LP
2654 }
2655
2656 return 0;
2657}
2658
03cfe0d5 2659static int determine_uid_shift(const char *directory) {
6dac160c
LP
2660 int r;
2661
0de7acce 2662 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2663 arg_uid_shift = 0;
6dac160c 2664 return 0;
03cfe0d5 2665 }
6dac160c
LP
2666
2667 if (arg_uid_shift == UID_INVALID) {
2668 struct stat st;
2669
03cfe0d5 2670 r = stat(directory, &st);
6dac160c 2671 if (r < 0)
03cfe0d5 2672 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2673
2674 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2675
2676 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2677 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2678 return -EINVAL;
2679 }
2680
2681 arg_uid_range = UINT32_C(0x10000);
2682 }
2683
2684 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2685 log_error("UID base too high for UID range.");
2686 return -EINVAL;
2687 }
2688
6dac160c
LP
2689 return 0;
2690}
2691
03cfe0d5
LP
2692static int inner_child(
2693 Barrier *barrier,
2694 const char *directory,
2695 bool secondary,
2696 int kmsg_socket,
2697 int rtnl_socket,
f757855e 2698 FDSet *fds) {
69c79d3c 2699
03cfe0d5 2700 _cleanup_free_ char *home = NULL;
e01ff70a 2701 char as_uuid[37];
6aadfa4c 2702 unsigned n_env = 1;
03cfe0d5
LP
2703 const char *envp[] = {
2704 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2705 NULL, /* container */
03cfe0d5
LP
2706 NULL, /* TERM */
2707 NULL, /* HOME */
2708 NULL, /* USER */
2709 NULL, /* LOGNAME */
2710 NULL, /* container_uuid */
2711 NULL, /* LISTEN_FDS */
2712 NULL, /* LISTEN_PID */
9c1e04d0 2713 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2714 NULL
2715 };
88213476 2716
2371271c 2717 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2718 int r;
88213476 2719
03cfe0d5
LP
2720 assert(barrier);
2721 assert(directory);
2722 assert(kmsg_socket >= 0);
88213476 2723
efdb0237
LP
2724 cg_unified_flush();
2725
0de7acce 2726 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2727 /* Tell the parent, that it now can write the UID map. */
2728 (void) barrier_place(barrier); /* #1 */
7027ff61 2729
03cfe0d5
LP
2730 /* Wait until the parent wrote the UID map */
2731 if (!barrier_place_and_sync(barrier)) { /* #2 */
2732 log_error("Parent died too early");
2733 return -ESRCH;
2734 }
88213476
LP
2735 }
2736
6d66bd3b
EV
2737 r = reset_uid_gid();
2738 if (r < 0)
2739 return log_error_errno(r, "Couldn't become new root: %m");
2740
0de7acce 2741 r = mount_all(NULL,
4f086aab 2742 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2743 arg_uid_shift,
2744 arg_uid_range,
2745 arg_selinux_apifs_context);
2746
03cfe0d5
LP
2747 if (r < 0)
2748 return r;
2749
4f086aab 2750 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2751 if (r < 0)
2752 return r;
2753
03cfe0d5
LP
2754 /* Wait until we are cgroup-ified, so that we
2755 * can mount the right cgroup path writable */
2756 if (!barrier_place_and_sync(barrier)) { /* #3 */
2757 log_error("Parent died too early");
2758 return -ESRCH;
88213476
LP
2759 }
2760
5a8ff0e6 2761 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2762 r = unshare(CLONE_NEWCGROUP);
2763 if (r < 0)
2764 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2765 r = mount_cgroups(
2766 "",
2767 arg_unified_cgroup_hierarchy,
2768 arg_userns_mode != USER_NAMESPACE_NO,
2769 arg_uid_shift,
2770 arg_uid_range,
5a8ff0e6 2771 arg_selinux_apifs_context,
ada54120 2772 true);
0996ef00
CB
2773 if (r < 0)
2774 return r;
2775 } else {
2776 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2777 if (r < 0)
2778 return r;
2779 }
ec16945e 2780
03cfe0d5
LP
2781 r = setup_boot_id(NULL);
2782 if (r < 0)
2783 return r;
ec16945e 2784
03cfe0d5
LP
2785 r = setup_kmsg(NULL, kmsg_socket);
2786 if (r < 0)
2787 return r;
2788 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2789
03cfe0d5 2790 umask(0022);
30535c16 2791
03cfe0d5
LP
2792 if (setsid() < 0)
2793 return log_error_errno(errno, "setsid() failed: %m");
2794
2795 if (arg_private_network)
2796 loopback_setup();
2797
7a8f6325
LP
2798 if (arg_expose_ports) {
2799 r = expose_port_send_rtnl(rtnl_socket);
2800 if (r < 0)
2801 return r;
2802 rtnl_socket = safe_close(rtnl_socket);
2803 }
03cfe0d5 2804
709f6e46
MS
2805 r = drop_capabilities();
2806 if (r < 0)
2807 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2808
2809 setup_hostname();
2810
050f7277 2811 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2812 if (personality(arg_personality) < 0)
2813 return log_error_errno(errno, "personality() failed: %m");
2814 } else if (secondary) {
2815 if (personality(PER_LINUX32) < 0)
2816 return log_error_errno(errno, "personality() failed: %m");
2817 }
2818
2819#ifdef HAVE_SELINUX
2820 if (arg_selinux_context)
2ed96880 2821 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2822 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2823#endif
2824
ee645080 2825 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2826 if (r < 0)
2827 return r;
2828
6aadfa4c
ILG
2829 /* LXC sets container=lxc, so follow the scheme here */
2830 envp[n_env++] = strjoina("container=", arg_container_service_name);
2831
03cfe0d5
LP
2832 envp[n_env] = strv_find_prefix(environ, "TERM=");
2833 if (envp[n_env])
313cefa1 2834 n_env++;
03cfe0d5
LP
2835
2836 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2837 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2838 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2839 return log_oom();
2840
3bbaff3e 2841 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2842
691675ba 2843 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2844 return log_oom();
03cfe0d5
LP
2845
2846 if (fdset_size(fds) > 0) {
2847 r = fdset_cloexec(fds, false);
2848 if (r < 0)
2849 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2850
2851 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2852 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2853 return log_oom();
2854 }
9c1e04d0
AP
2855 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2856 return log_oom();
03cfe0d5 2857
2371271c
TG
2858 env_use = strv_env_merge(2, envp, arg_setenv);
2859 if (!env_use)
2860 return log_oom();
03cfe0d5
LP
2861
2862 /* Let the parent know that we are ready and
2863 * wait until the parent is ready with the
2864 * setup, too... */
2865 if (!barrier_place_and_sync(barrier)) { /* #4 */
2866 log_error("Parent died too early");
2867 return -ESRCH;
2868 }
2869
5f932eb9
LP
2870 if (arg_chdir)
2871 if (chdir(arg_chdir) < 0)
2872 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2873
7732f92b
LP
2874 if (arg_start_mode == START_PID2) {
2875 r = stub_pid1();
2876 if (r < 0)
2877 return r;
2878 }
2879
03cfe0d5
LP
2880 /* Now, explicitly close the log, so that we
2881 * then can close all remaining fds. Closing
2882 * the log explicitly first has the benefit
2883 * that the logging subsystem knows about it,
2884 * and is thus ready to be reopened should we
2885 * need it again. Note that the other fds
2886 * closed here are at least the locking and
2887 * barrier fds. */
2888 log_close();
2889 (void) fdset_close_others(fds);
2890
7732f92b 2891 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2892 char **a;
2893 size_t m;
2894
2895 /* Automatically search for the init system */
2896
75f32f04
ZJS
2897 m = strv_length(arg_parameters);
2898 a = newa(char*, m + 2);
2899 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2900 a[1 + m] = NULL;
03cfe0d5
LP
2901
2902 a[0] = (char*) "/usr/lib/systemd/systemd";
2903 execve(a[0], a, env_use);
2904
2905 a[0] = (char*) "/lib/systemd/systemd";
2906 execve(a[0], a, env_use);
2907
2908 a[0] = (char*) "/sbin/init";
2909 execve(a[0], a, env_use);
f757855e
LP
2910 } else if (!strv_isempty(arg_parameters))
2911 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2912 else {
5f932eb9 2913 if (!arg_chdir)
d929b0f9
ZJS
2914 /* If we cannot change the directory, we'll end up in /, that is expected. */
2915 (void) chdir(home ?: "/root");
5f932eb9 2916
03cfe0d5
LP
2917 execle("/bin/bash", "-bash", NULL, env_use);
2918 execle("/bin/sh", "-sh", NULL, env_use);
2919 }
2920
35607a8d 2921 r = -errno;
03cfe0d5 2922 (void) log_open();
35607a8d 2923 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2924}
2925
9c1e04d0
AP
2926static int setup_sd_notify_child(void) {
2927 static const int one = 1;
2928 int fd = -1;
2929 union sockaddr_union sa = {
2930 .sa.sa_family = AF_UNIX,
2931 };
2932 int r;
2933
2934 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2935 if (fd < 0)
2936 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2937
2938 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2939 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2940
2941 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2942 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2943 if (r < 0) {
2944 safe_close(fd);
2945 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2946 }
2947
2948 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2949 if (r < 0) {
2950 safe_close(fd);
2951 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2952 }
2953
2954 return fd;
2955}
2956
03cfe0d5
LP
2957static int outer_child(
2958 Barrier *barrier,
2959 const char *directory,
2960 const char *console,
2961 const char *root_device, bool root_device_rw,
2962 const char *home_device, bool home_device_rw,
2963 const char *srv_device, bool srv_device_rw,
a6bc7db9 2964 const char *esp_device,
03cfe0d5
LP
2965 bool interactive,
2966 bool secondary,
2967 int pid_socket,
e01ff70a 2968 int uuid_socket,
9c1e04d0 2969 int notify_socket,
03cfe0d5
LP
2970 int kmsg_socket,
2971 int rtnl_socket,
825d5287 2972 int uid_shift_socket,
f757855e 2973 FDSet *fds) {
03cfe0d5
LP
2974
2975 pid_t pid;
2976 ssize_t l;
2977 int r;
9c1e04d0 2978 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2979
2980 assert(barrier);
2981 assert(directory);
2982 assert(console);
2983 assert(pid_socket >= 0);
e01ff70a 2984 assert(uuid_socket >= 0);
9c1e04d0 2985 assert(notify_socket >= 0);
03cfe0d5
LP
2986 assert(kmsg_socket >= 0);
2987
efdb0237
LP
2988 cg_unified_flush();
2989
03cfe0d5
LP
2990 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2991 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2992
2993 if (interactive) {
2994 close_nointr(STDIN_FILENO);
2995 close_nointr(STDOUT_FILENO);
2996 close_nointr(STDERR_FILENO);
2997
2998 r = open_terminal(console, O_RDWR);
2999 if (r != STDIN_FILENO) {
3000 if (r >= 0) {
3001 safe_close(r);
3002 r = -EINVAL;
3003 }
3004
3005 return log_error_errno(r, "Failed to open console: %m");
3006 }
3007
3008 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3009 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
3010 return log_error_errno(errno, "Failed to duplicate console: %m");
3011 }
3012
3013 r = reset_audit_loginuid();
3014 if (r < 0)
3015 return r;
3016
3017 /* Mark everything as slave, so that we still
3018 * receive mounts from the real root, but don't
3019 * propagate mounts to the real root. */
60e76d48
ZJS
3020 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3021 if (r < 0)
3022 return r;
03cfe0d5
LP
3023
3024 r = mount_devices(directory,
3025 root_device, root_device_rw,
3026 home_device, home_device_rw,
a6bc7db9
LP
3027 srv_device, srv_device_rw,
3028 esp_device);
03cfe0d5
LP
3029 if (r < 0)
3030 return r;
3031
391567f4
LP
3032 r = determine_uid_shift(directory);
3033 if (r < 0)
3034 return r;
3035
0fd9563f
ZJS
3036 r = detect_unified_cgroup_hierarchy(directory);
3037 if (r < 0)
3038 return r;
3039
0de7acce 3040 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3041 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3042 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3043 if (l < 0)
3044 return log_error_errno(errno, "Failed to send UID shift: %m");
3045 if (l != sizeof(arg_uid_shift)) {
3046 log_error("Short write while sending UID shift.");
3047 return -EIO;
3048 }
0e7ac751 3049
0de7acce 3050 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3051 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3052 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3053 * not it will pick a different one, and send it back to us. */
3054
3055 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3056 if (l < 0)
3057 return log_error_errno(errno, "Failed to recv UID shift: %m");
3058 if (l != sizeof(arg_uid_shift)) {
595bfe7d 3059 log_error("Short read while receiving UID shift.");
0e7ac751
LP
3060 return -EIO;
3061 }
3062 }
3063
3064 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3065 }
3066
03cfe0d5 3067 /* Turn directory into bind mount */
60e76d48
ZJS
3068 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3069 if (r < 0)
3070 return r;
03cfe0d5 3071
19caffac
AC
3072 /* Mark everything as shared so our mounts get propagated down. This is
3073 * required to make new bind mounts available in systemd services
3074 * inside the containter that create a new mount namespace.
3075 * See https://github.com/systemd/systemd/issues/3860
3076 * Further submounts (such as /dev) done after this will inherit the
3077 * shared propagation mode.*/
60e76d48
ZJS
3078 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3079 if (r < 0)
3080 return r;
19caffac 3081
7336138e 3082 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
3083 if (r < 0)
3084 return r;
3085
0de7acce
LP
3086 r = setup_volatile(
3087 directory,
3088 arg_volatile_mode,
3089 arg_userns_mode != USER_NAMESPACE_NO,
3090 arg_uid_shift,
3091 arg_uid_range,
3092 arg_selinux_context);
03cfe0d5
LP
3093 if (r < 0)
3094 return r;
3095
0de7acce
LP
3096 r = setup_volatile_state(
3097 directory,
3098 arg_volatile_mode,
3099 arg_userns_mode != USER_NAMESPACE_NO,
3100 arg_uid_shift,
3101 arg_uid_range,
3102 arg_selinux_context);
03cfe0d5
LP
3103 if (r < 0)
3104 return r;
3105
03cfe0d5
LP
3106 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3107 if (r < 0)
3108 return r;
3109
03cfe0d5 3110 if (arg_read_only) {
6b7c9f8b 3111 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3112 if (r < 0)
3113 return log_error_errno(r, "Failed to make tree read-only: %m");
3114 }
3115
0de7acce 3116 r = mount_all(directory,
4f086aab 3117 arg_mount_settings,
0de7acce
LP
3118 arg_uid_shift,
3119 arg_uid_range,
3120 arg_selinux_apifs_context);
03cfe0d5
LP
3121 if (r < 0)
3122 return r;
3123
07fa00f9
LP
3124 r = copy_devnodes(directory);
3125 if (r < 0)
03cfe0d5
LP
3126 return r;
3127
3128 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3129
07fa00f9
LP
3130 r = setup_pts(directory);
3131 if (r < 0)
03cfe0d5
LP
3132 return r;
3133
3134 r = setup_propagate(directory);
3135 if (r < 0)
3136 return r;
3137
3138 r = setup_dev_console(directory, console);
3139 if (r < 0)
3140 return r;
3141
520e0d54 3142 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3143 if (r < 0)
3144 return r;
3145
3146 r = setup_timezone(directory);
3147 if (r < 0)
3148 return r;
3149
3150 r = setup_resolv_conf(directory);
3151 if (r < 0)
3152 return r;
3153
e01ff70a
MS
3154 r = setup_machine_id(directory);
3155 if (r < 0)
3156 return r;
3157
03cfe0d5
LP
3158 r = setup_journal(directory);
3159 if (r < 0)
3160 return r;
3161
0de7acce
LP
3162 r = mount_custom(
3163 directory,
3164 arg_custom_mounts,
3165 arg_n_custom_mounts,
3166 arg_userns_mode != USER_NAMESPACE_NO,
3167 arg_uid_shift,
3168 arg_uid_range,
3169 arg_selinux_apifs_context);
03cfe0d5
LP
3170 if (r < 0)
3171 return r;
3172
5a8ff0e6 3173 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3174 r = mount_cgroups(
3175 directory,
3176 arg_unified_cgroup_hierarchy,
3177 arg_userns_mode != USER_NAMESPACE_NO,
3178 arg_uid_shift,
3179 arg_uid_range,
5a8ff0e6 3180 arg_selinux_apifs_context,
ada54120 3181 false);
0996ef00
CB
3182 if (r < 0)
3183 return r;
3184 }
03cfe0d5
LP
3185
3186 r = mount_move_root(directory);
3187 if (r < 0)
3188 return log_error_errno(r, "Failed to move root directory: %m");
3189
9c1e04d0
AP
3190 fd = setup_sd_notify_child();
3191 if (fd < 0)
3192 return fd;
3193
03cfe0d5 3194 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3195 arg_clone_ns_flags |
03cfe0d5 3196 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3197 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3198 if (pid < 0)
3199 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3200 if (pid == 0) {
3201 pid_socket = safe_close(pid_socket);
e01ff70a 3202 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3203 notify_socket = safe_close(notify_socket);
825d5287 3204 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3205
3206 /* The inner child has all namespaces that are
3207 * requested, so that we all are owned by the user if
3208 * user namespaces are turned on. */
3209
f757855e 3210 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3211 if (r < 0)
3212 _exit(EXIT_FAILURE);
3213
3214 _exit(EXIT_SUCCESS);
3215 }
3216
3217 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3218 if (l < 0)
3219 return log_error_errno(errno, "Failed to send PID: %m");
3220 if (l != sizeof(pid)) {
3221 log_error("Short write while sending PID.");
3222 return -EIO;
3223 }
3224
e01ff70a
MS
3225 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3226 if (l < 0)
3227 return log_error_errno(errno, "Failed to send machine ID: %m");
3228 if (l != sizeof(arg_uuid)) {
3229 log_error("Short write while sending machine ID.");
3230 return -EIO;
3231 }
3232
9c1e04d0
AP
3233 l = send_one_fd(notify_socket, fd, 0);
3234 if (l < 0)
3235 return log_error_errno(errno, "Failed to send notify fd: %m");
3236
03cfe0d5 3237 pid_socket = safe_close(pid_socket);
e01ff70a 3238 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3239 notify_socket = safe_close(notify_socket);
327e26d6
KN
3240 kmsg_socket = safe_close(kmsg_socket);
3241 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3242
3243 return 0;
3244}
3245
0e7ac751
LP
3246static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3247 unsigned n_tries = 100;
3248 uid_t candidate;
3249 int r;
3250
3251 assert(shift);
3252 assert(ret_lock_file);
0de7acce 3253 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3254 assert(arg_uid_range == 0x10000U);
3255
3256 candidate = *shift;
3257
3258 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3259
3260 for (;;) {
3261 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3262 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3263
3264 if (--n_tries <= 0)
3265 return -EBUSY;
3266
3267 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3268 goto next;
3269 if ((candidate & UINT32_C(0xFFFF)) != 0)
3270 goto next;
3271
3272 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3273 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3274 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3275 goto next;
3276 if (r < 0)
3277 return r;
3278
3279 /* Make some superficial checks whether the range is currently known in the user database */
3280 if (getpwuid(candidate))
3281 goto next;
3282 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3283 goto next;
3284 if (getgrgid(candidate))
3285 goto next;
3286 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3287 goto next;
3288
3289 *ret_lock_file = lf;
3290 lf = (struct LockFile) LOCK_FILE_INIT;
3291 *shift = candidate;
3292 return 0;
3293
3294 next:
3295 random_bytes(&candidate, sizeof(candidate));
3296 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3297 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3298 }
3299}
3300
03cfe0d5
LP
3301static int setup_uid_map(pid_t pid) {
3302 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3303 int r;
3304
3305 assert(pid > 1);
3306
3307 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3308 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3309 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3310 if (r < 0)
3311 return log_error_errno(r, "Failed to write UID map: %m");
3312
3313 /* We always assign the same UID and GID ranges */
3314 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3315 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3316 if (r < 0)
3317 return log_error_errno(r, "Failed to write GID map: %m");
3318
3319 return 0;
3320}
3321
9c1e04d0 3322static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3323 char buf[NOTIFY_BUFFER_MAX+1];
3324 char *p = NULL;
3325 struct iovec iovec = {
3326 .iov_base = buf,
3327 .iov_len = sizeof(buf)-1,
3328 };
3329 union {
3330 struct cmsghdr cmsghdr;
3331 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3332 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3333 } control = {};
3334 struct msghdr msghdr = {
3335 .msg_iov = &iovec,
3336 .msg_iovlen = 1,
3337 .msg_control = &control,
3338 .msg_controllen = sizeof(control),
3339 };
3340 struct cmsghdr *cmsg;
3341 struct ucred *ucred = NULL;
3342 ssize_t n;
3343 pid_t inner_child_pid;
3344 _cleanup_strv_free_ char **tags = NULL;
3345
3346 assert(userdata);
3347
3348 inner_child_pid = PTR_TO_PID(userdata);
3349
3350 if (revents != EPOLLIN) {
3351 log_warning("Got unexpected poll event for notify fd.");
3352 return 0;
3353 }
3354
3355 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3356 if (n < 0) {
3357 if (errno == EAGAIN || errno == EINTR)
3358 return 0;
3359
3360 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3361 }
3362 cmsg_close_all(&msghdr);
3363
3364 CMSG_FOREACH(cmsg, &msghdr) {
3365 if (cmsg->cmsg_level == SOL_SOCKET &&
3366 cmsg->cmsg_type == SCM_CREDENTIALS &&
3367 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3368
3369 ucred = (struct ucred*) CMSG_DATA(cmsg);
3370 }
3371 }
3372
3373 if (!ucred || ucred->pid != inner_child_pid) {
3374 log_warning("Received notify message without valid credentials. Ignoring.");
3375 return 0;
3376 }
3377
3378 if ((size_t) n >= sizeof(buf)) {
3379 log_warning("Received notify message exceeded maximum size. Ignoring.");
3380 return 0;
3381 }
3382
3383 buf[n] = 0;
3384 tags = strv_split(buf, "\n\r");
3385 if (!tags)
3386 return log_oom();
3387
3388 if (strv_find(tags, "READY=1"))
3389 sd_notifyf(false, "READY=1\n");
3390
3391 p = strv_find_startswith(tags, "STATUS=");
3392 if (p)
3393 sd_notifyf(false, "STATUS=Container running: %s", p);
3394
3395 return 0;
3396}
3397
3398static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3399 int r;
3400 sd_event_source *notify_event_source;
3401
3402 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3403 if (r < 0)
3404 return log_error_errno(r, "Failed to allocate notify event source: %m");
3405
3406 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3407
3408 return 0;
3409}
3410
f757855e
LP
3411static int load_settings(void) {
3412 _cleanup_(settings_freep) Settings *settings = NULL;
3413 _cleanup_fclose_ FILE *f = NULL;
3414 _cleanup_free_ char *p = NULL;
3415 const char *fn, *i;
3416 int r;
3417
3418 /* If all settings are masked, there's no point in looking for
3419 * the settings file */
3420 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3421 return 0;
3422
3423 fn = strjoina(arg_machine, ".nspawn");
3424
3425 /* We first look in the admin's directories in /etc and /run */
3426 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3427 _cleanup_free_ char *j = NULL;
3428
605405c6 3429 j = strjoin(i, "/", fn);
f757855e
LP
3430 if (!j)
3431 return log_oom();
3432
3433 f = fopen(j, "re");
3434 if (f) {
3435 p = j;
3436 j = NULL;
3437
b938cb90 3438 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3439 if (arg_settings_trusted < 0)
3440 arg_settings_trusted = true;
3441
3442 break;
3443 }
3444
3445 if (errno != ENOENT)
3446 return log_error_errno(errno, "Failed to open %s: %m", j);
3447 }
3448
3449 if (!f) {
3450 /* After that, let's look for a file next to the
3451 * actual image we shall boot. */
3452
3453 if (arg_image) {
3454 p = file_in_same_dir(arg_image, fn);
3455 if (!p)
3456 return log_oom();
3457 } else if (arg_directory) {
3458 p = file_in_same_dir(arg_directory, fn);
3459 if (!p)
3460 return log_oom();
3461 }
3462
3463 if (p) {
3464 f = fopen(p, "re");
3465 if (!f && errno != ENOENT)
3466 return log_error_errno(errno, "Failed to open %s: %m", p);
3467
b938cb90 3468 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3469 if (arg_settings_trusted < 0)
3470 arg_settings_trusted = false;
3471 }
3472 }
3473
3474 if (!f)
3475 return 0;
3476
3477 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3478
3479 r = settings_load(f, p, &settings);
3480 if (r < 0)
3481 return r;
3482
3483 /* Copy over bits from the settings, unless they have been
3484 * explicitly masked by command line switches. */
3485
7732f92b
LP
3486 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3487 settings->start_mode >= 0) {
3488 arg_start_mode = settings->start_mode;
f757855e
LP
3489
3490 strv_free(arg_parameters);
3491 arg_parameters = settings->parameters;
3492 settings->parameters = NULL;
3493 }
3494
5f932eb9
LP
3495 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3496 settings->working_directory) {
3497 free(arg_chdir);
3498 arg_chdir = settings->working_directory;
3499 settings->working_directory = NULL;
3500 }
3501
f757855e
LP
3502 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3503 settings->environment) {
3504 strv_free(arg_setenv);
3505 arg_setenv = settings->environment;
3506 settings->environment = NULL;
3507 }
3508
3509 if ((arg_settings_mask & SETTING_USER) == 0 &&
3510 settings->user) {
3511 free(arg_user);
3512 arg_user = settings->user;
3513 settings->user = NULL;
3514 }
3515
3516 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3517 uint64_t plus;
f757855e 3518
0e265674
LP
3519 plus = settings->capability;
3520 if (settings_private_network(settings))
3521 plus |= (1ULL << CAP_NET_ADMIN);
3522
3523 if (!arg_settings_trusted && plus != 0) {
3524 if (settings->capability != 0)
3525 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3526 } else
520e0d54 3527 arg_caps_retain |= plus;
f757855e 3528
520e0d54 3529 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3530 }
3531
3532 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3533 settings->kill_signal > 0)
3534 arg_kill_signal = settings->kill_signal;
3535
3536 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3537 settings->personality != PERSONALITY_INVALID)
3538 arg_personality = settings->personality;
3539
3540 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3541 !sd_id128_is_null(settings->machine_id)) {
3542
3543 if (!arg_settings_trusted)
3544 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3545 else
3546 arg_uuid = settings->machine_id;
3547 }
3548
3549 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3550 settings->read_only >= 0)
3551 arg_read_only = settings->read_only;
3552
3553 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3554 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3555 arg_volatile_mode = settings->volatile_mode;
3556
3557 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3558 settings->n_custom_mounts > 0) {
3559
3560 if (!arg_settings_trusted)
3561 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3562 else {
3563 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3564 arg_custom_mounts = settings->custom_mounts;
3565 arg_n_custom_mounts = settings->n_custom_mounts;
3566
3567 settings->custom_mounts = NULL;
3568 settings->n_custom_mounts = 0;
3569 }
3570 }
3571
3572 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3573 (settings->private_network >= 0 ||
3574 settings->network_veth >= 0 ||
3575 settings->network_bridge ||
22b28dfd 3576 settings->network_zone ||
f757855e
LP
3577 settings->network_interfaces ||
3578 settings->network_macvlan ||
f6d6bad1
LP
3579 settings->network_ipvlan ||
3580 settings->network_veth_extra)) {
f757855e
LP
3581
3582 if (!arg_settings_trusted)
3583 log_warning("Ignoring network settings, file %s is not trusted.", p);
3584 else {
f6d6bad1 3585 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3586 arg_private_network = settings_private_network(settings);
3587
f757855e
LP
3588 strv_free(arg_network_interfaces);
3589 arg_network_interfaces = settings->network_interfaces;
3590 settings->network_interfaces = NULL;
3591
3592 strv_free(arg_network_macvlan);
3593 arg_network_macvlan = settings->network_macvlan;
3594 settings->network_macvlan = NULL;
3595
3596 strv_free(arg_network_ipvlan);
3597 arg_network_ipvlan = settings->network_ipvlan;
3598 settings->network_ipvlan = NULL;
3599
f6d6bad1
LP
3600 strv_free(arg_network_veth_extra);
3601 arg_network_veth_extra = settings->network_veth_extra;
3602 settings->network_veth_extra = NULL;
3603
f757855e
LP
3604 free(arg_network_bridge);
3605 arg_network_bridge = settings->network_bridge;
3606 settings->network_bridge = NULL;
22b28dfd
LP
3607
3608 free(arg_network_zone);
3609 arg_network_zone = settings->network_zone;
3610 settings->network_zone = NULL;
f757855e
LP
3611 }
3612 }
3613
3614 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3615 settings->expose_ports) {
3616
3617 if (!arg_settings_trusted)
3618 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3619 else {
3620 expose_port_free_all(arg_expose_ports);
3621 arg_expose_ports = settings->expose_ports;
3622 settings->expose_ports = NULL;
3623 }
3624 }
3625
0de7acce
LP
3626 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3627 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3628
3629 if (!arg_settings_trusted)
3630 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3631 else {
3632 arg_userns_mode = settings->userns_mode;
3633 arg_uid_shift = settings->uid_shift;
3634 arg_uid_range = settings->uid_range;
3635 arg_userns_chown = settings->userns_chown;
3636 }
3637 }
3638
9c1e04d0
AP
3639 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3640 arg_notify_ready = settings->notify_ready;
3641
f757855e
LP
3642 return 0;
3643}
3644
b0067625
ZJS
3645static int run(int master,
3646 const char* console,
3647 const char *root_device, bool root_device_rw,
3648 const char *home_device, bool home_device_rw,
3649 const char *srv_device, bool srv_device_rw,
3650 const char *esp_device,
3651 bool interactive,
3652 bool secondary,
3653 FDSet *fds,
3654 char veth_name[IFNAMSIZ], bool *veth_created,
3655 union in_addr_union *exposed,
3656 pid_t *pid, int *ret) {
3657
3658 static const struct sigaction sa = {
3659 .sa_handler = nop_signal_handler,
3660 .sa_flags = SA_NOCLDSTOP,
3661 };
3662
3663 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3664 _cleanup_close_ int etc_passwd_lock = -1;
3665 _cleanup_close_pair_ int
3666 kmsg_socket_pair[2] = { -1, -1 },
3667 rtnl_socket_pair[2] = { -1, -1 },
3668 pid_socket_pair[2] = { -1, -1 },
3669 uuid_socket_pair[2] = { -1, -1 },
3670 notify_socket_pair[2] = { -1, -1 },
3671 uid_shift_socket_pair[2] = { -1, -1 };
3672 _cleanup_close_ int notify_socket= -1;
3673 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3674 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3675 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3676 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3677 ContainerStatus container_status = 0;
3678 char last_char = 0;
3679 int ifi = 0, r;
3680 ssize_t l;
3681 sigset_t mask_chld;
3682
3683 assert_se(sigemptyset(&mask_chld) == 0);
3684 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3685
3686 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3687 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3688 * check with getpwuid() if the specific user already exists. Note that /etc might be
3689 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3690 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3691 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3692 * really ours. */
3693
3694 etc_passwd_lock = take_etc_passwd_lock(NULL);
3695 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3696 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3697 }
3698
3699 r = barrier_create(&barrier);
3700 if (r < 0)
3701 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3702
3703 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3704 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3705
3706 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3707 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3708
3709 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3710 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3711
3712 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3713 return log_error_errno(errno, "Failed to create id socket pair: %m");
3714
3715 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3716 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3717
3718 if (arg_userns_mode != USER_NAMESPACE_NO)
3719 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3720 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3721
3722 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3723 * parent's blocking calls and give it a chance to call wait() and terminate. */
3724 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3725 if (r < 0)
3726 return log_error_errno(errno, "Failed to change the signal mask: %m");
3727
3728 r = sigaction(SIGCHLD, &sa, NULL);
3729 if (r < 0)
3730 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3731
3732 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3733 if (*pid < 0)
3734 return log_error_errno(errno, "clone() failed%s: %m",
3735 errno == EINVAL ?
3736 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3737
3738 if (*pid == 0) {
3739 /* The outer child only has a file system namespace. */
3740 barrier_set_role(&barrier, BARRIER_CHILD);
3741
3742 master = safe_close(master);
3743
3744 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3745 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3746 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3747 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3748 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3749 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3750
3751 (void) reset_all_signal_handlers();
3752 (void) reset_signal_mask();
3753
3754 r = outer_child(&barrier,
3755 arg_directory,
3756 console,
3757 root_device, root_device_rw,
3758 home_device, home_device_rw,
3759 srv_device, srv_device_rw,
3760 esp_device,
3761 interactive,
3762 secondary,
3763 pid_socket_pair[1],
3764 uuid_socket_pair[1],
3765 notify_socket_pair[1],
3766 kmsg_socket_pair[1],
3767 rtnl_socket_pair[1],
3768 uid_shift_socket_pair[1],
3769 fds);
3770 if (r < 0)
3771 _exit(EXIT_FAILURE);
3772
3773 _exit(EXIT_SUCCESS);
3774 }
3775
3776 barrier_set_role(&barrier, BARRIER_PARENT);
3777
3778 fds = fdset_free(fds);
3779
3780 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3781 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3782 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3783 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3784 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3785 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3786
3787 if (arg_userns_mode != USER_NAMESPACE_NO) {
3788 /* The child just let us know the UID shift it might have read from the image. */
3789 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3790 if (l < 0)
3791 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3792 if (l != sizeof arg_uid_shift) {
3793 log_error("Short read while reading UID shift.");
3794 return -EIO;
3795 }
3796
3797 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3798 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3799 * image, but if that's already in use, pick a new one, and report back to the child,
3800 * which one we now picked. */
3801
3802 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3803 if (r < 0)
3804 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3805
3806 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3807 if (l < 0)
3808 return log_error_errno(errno, "Failed to send UID shift: %m");
3809 if (l != sizeof arg_uid_shift) {
3810 log_error("Short write while writing UID shift.");
3811 return -EIO;
3812 }
3813 }
3814 }
3815
3816 /* Wait for the outer child. */
3817 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3818 if (r != 0)
3819 return r < 0 ? r : -EIO;
3820
3821 /* And now retrieve the PID of the inner child. */
3822 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3823 if (l < 0)
3824 return log_error_errno(errno, "Failed to read inner child PID: %m");
3825 if (l != sizeof *pid) {
3826 log_error("Short read while reading inner child PID.");
3827 return -EIO;
3828 }
3829
3830 /* We also retrieve container UUID in case it was generated by outer child */
3831 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3832 if (l < 0)
3833 return log_error_errno(errno, "Failed to read container machine ID: %m");
3834 if (l != sizeof(arg_uuid)) {
3835 log_error("Short read while reading container machined ID.");
3836 return -EIO;
3837 }
3838
3839 /* We also retrieve the socket used for notifications generated by outer child */
3840 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3841 if (notify_socket < 0)
3842 return log_error_errno(notify_socket,
3843 "Failed to receive notification socket from the outer child: %m");
3844
3845 log_debug("Init process invoked as PID "PID_FMT, *pid);
3846
3847 if (arg_userns_mode != USER_NAMESPACE_NO) {
3848 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3849 log_error("Child died too early.");
3850 return -ESRCH;
3851 }
3852
3853 r = setup_uid_map(*pid);
3854 if (r < 0)
3855 return r;
3856
3857 (void) barrier_place(&barrier); /* #2 */
3858 }
3859
3860 if (arg_private_network) {
3861
3862 r = move_network_interfaces(*pid, arg_network_interfaces);
3863 if (r < 0)
3864 return r;
3865
3866 if (arg_network_veth) {
3867 r = setup_veth(arg_machine, *pid, veth_name,
3868 arg_network_bridge || arg_network_zone);
3869 if (r < 0)
3870 return r;
3871 else if (r > 0)
3872 ifi = r;
3873
3874 if (arg_network_bridge) {
3875 /* Add the interface to a bridge */
3876 r = setup_bridge(veth_name, arg_network_bridge, false);
3877 if (r < 0)
3878 return r;
3879 if (r > 0)
3880 ifi = r;
3881 } else if (arg_network_zone) {
3882 /* Add the interface to a bridge, possibly creating it */
3883 r = setup_bridge(veth_name, arg_network_zone, true);
3884 if (r < 0)
3885 return r;
3886 if (r > 0)
3887 ifi = r;
3888 }
3889 }
3890
3891 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3892 if (r < 0)
3893 return r;
3894
3895 /* We created the primary and extra veth links now; let's remember this, so that we know to
3896 remove them later on. Note that we don't bother with removing veth links that were created
3897 here when their setup failed half-way, because in that case the kernel should be able to
3898 remove them on its own, since they cannot be referenced by anything yet. */
3899 *veth_created = true;
3900
3901 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3902 if (r < 0)
3903 return r;
3904
3905 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3906 if (r < 0)
3907 return r;
3908 }
3909
3910 if (arg_register) {
3911 r = register_machine(
3912 arg_machine,
3913 *pid,
3914 arg_directory,
3915 arg_uuid,
3916 ifi,
3917 arg_slice,
3918 arg_custom_mounts, arg_n_custom_mounts,
3919 arg_kill_signal,
3920 arg_property,
3921 arg_keep_unit,
3922 arg_container_service_name);
3923 if (r < 0)
3924 return r;
3925 }
3926
f0bef277 3927 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3928 if (r < 0)
3929 return r;
3930
3931 if (arg_keep_unit) {
3932 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3933 if (r < 0)
3934 return r;
3935 }
3936
3937 r = chown_cgroup(*pid, arg_uid_shift);
3938 if (r < 0)
3939 return r;
3940
3941 /* Notify the child that the parent is ready with all
3942 * its setup (including cgroup-ification), and that
3943 * the child can now hand over control to the code to
3944 * run inside the container. */
3945 (void) barrier_place(&barrier); /* #3 */
3946
3947 /* Block SIGCHLD here, before notifying child.
3948 * process_pty() will handle it with the other signals. */
3949 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3950
3951 /* Reset signal to default */
3952 r = default_signals(SIGCHLD, -1);
3953 if (r < 0)
3954 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3955
3956 r = sd_event_new(&event);
3957 if (r < 0)
3958 return log_error_errno(r, "Failed to get default event source: %m");
3959
3960 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
3961 if (r < 0)
3962 return r;
3963
3964 /* Let the child know that we are ready and wait that the child is completely ready now. */
3965 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3966 log_error("Child died too early.");
3967 return -ESRCH;
3968 }
3969
3970 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3971 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3972 etc_passwd_lock = safe_close(etc_passwd_lock);
3973
3974 sd_notifyf(false,
3975 "STATUS=Container running.\n"
3976 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3977 if (!arg_notify_ready)
3978 sd_notify(false, "READY=1\n");
3979
3980 if (arg_kill_signal > 0) {
3981 /* Try to kill the init system on SIGINT or SIGTERM */
3982 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3983 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3984 } else {
3985 /* Immediately exit */
3986 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3987 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3988 }
3989
3990 /* simply exit on sigchld */
3991 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3992
3993 if (arg_expose_ports) {
3994 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3995 if (r < 0)
3996 return r;
3997
3998 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3999 }
4000
4001 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4002
4003 r = pty_forward_new(event, master,
4004 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
4005 &forward);
4006 if (r < 0)
4007 return log_error_errno(r, "Failed to create PTY forwarder: %m");
4008
4009 r = sd_event_loop(event);
4010 if (r < 0)
4011 return log_error_errno(r, "Failed to run event loop: %m");
4012
4013 pty_forward_get_last_char(forward, &last_char);
4014
4015 forward = pty_forward_free(forward);
4016
4017 if (!arg_quiet && last_char != '\n')
4018 putc('\n', stdout);
4019
4020 /* Kill if it is not dead yet anyway */
4021 if (arg_register && !arg_keep_unit)
4022 terminate_machine(*pid);
4023
4024 /* Normally redundant, but better safe than sorry */
c67b0082 4025 (void) kill(*pid, SIGKILL);
b0067625
ZJS
4026
4027 r = wait_for_container(*pid, &container_status);
4028 *pid = 0;
4029
4030 if (r < 0)
4031 /* We failed to wait for the container, or the container exited abnormally. */
4032 return r;
4033 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
4034 /* r > 0 → The container exited with a non-zero status.
4035 * As a special case, we need to replace 133 with a different value,
4036 * because 133 is special-cased in the service file to reboot the container.
4037 * otherwise → The container exited with zero status and a reboot was not requested.
4038 */
2a49b612 4039 if (r == EXIT_FORCE_RESTART)
27e29a1e 4040 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4041 *ret = r;
b0067625
ZJS
4042 return 0; /* finito */
4043 }
4044
4045 /* CONTAINER_REBOOTED, loop again */
4046
4047 if (arg_keep_unit) {
4048 /* Special handling if we are running as a service: instead of simply
4049 * restarting the machine we want to restart the entire service, so let's
4050 * inform systemd about this with the special exit code 133. The service
4051 * file uses RestartForceExitStatus=133 so that this results in a full
4052 * nspawn restart. This is necessary since we might have cgroup parameters
4053 * set we want to have flushed out. */
2a49b612
ZJS
4054 *ret = EXIT_FORCE_RESTART;
4055 return 0; /* finito */
b0067625
ZJS
4056 }
4057
4058 expose_port_flush(arg_expose_ports, exposed);
4059
4060 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4061 *veth_created = false;
4062 return 1; /* loop again */
4063}
4064
03cfe0d5
LP
4065int main(int argc, char *argv[]) {
4066
a6bc7db9 4067 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
4068 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4069 _cleanup_close_ int master = -1, image_fd = -1;
4070 _cleanup_fdset_free_ FDSet *fds = NULL;
cfed63f6 4071 int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS;
5aa3eba5 4072 char veth_name[IFNAMSIZ] = "";
17cbb288 4073 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 4074 pid_t pid = 0;
03cfe0d5
LP
4075 union in_addr_union exposed = {};
4076 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
4077 bool interactive, veth_created = false, remove_tmprootdir = false;
4078 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
03cfe0d5
LP
4079
4080 log_parse_environment();
4081 log_open();
4082
7732f92b
LP
4083 /* Make sure rename_process() in the stub init process can work */
4084 saved_argv = argv;
4085 saved_argc = argc;
4086
03cfe0d5
LP
4087 r = parse_argv(argc, argv);
4088 if (r <= 0)
4089 goto finish;
4090
03cfe0d5
LP
4091 if (geteuid() != 0) {
4092 log_error("Need to be root.");
4093 r = -EPERM;
4094 goto finish;
4095 }
f757855e
LP
4096 r = determine_names();
4097 if (r < 0)
4098 goto finish;
4099
4100 r = load_settings();
4101 if (r < 0)
4102 goto finish;
4103
4104 r = verify_arguments();
4105 if (r < 0)
4106 goto finish;
03cfe0d5
LP
4107
4108 n_fd_passed = sd_listen_fds(false);
4109 if (n_fd_passed > 0) {
4110 r = fdset_new_listen_fds(&fds, false);
4111 if (r < 0) {
4112 log_error_errno(r, "Failed to collect file descriptors: %m");
4113 goto finish;
4114 }
4115 }
4116
4117 if (arg_directory) {
4118 assert(!arg_image);
4119
4120 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4121 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4122 r = -EINVAL;
4123 goto finish;
4124 }
4125
4126 if (arg_ephemeral) {
4127 _cleanup_free_ char *np = NULL;
4128
4129 /* If the specified path is a mount point we
4130 * generate the new snapshot immediately
4131 * inside it under a random name. However if
4132 * the specified is not a mount point we
4133 * create the new snapshot in the parent
4134 * directory, just next to it. */
e26d6ce5 4135 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4136 if (r < 0) {
4137 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4138 goto finish;
4139 }
4140 if (r > 0)
770b5ce4 4141 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4142 else
770b5ce4 4143 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 4144 if (r < 0) {
0f3be6ca 4145 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
4146 goto finish;
4147 }
4148
4149 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4150 if (r < 0) {
4151 log_error_errno(r, "Failed to lock %s: %m", np);
4152 goto finish;
4153 }
4154
17cbb288
LP
4155 r = btrfs_subvol_snapshot(arg_directory, np,
4156 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4157 BTRFS_SNAPSHOT_FALLBACK_COPY |
4158 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4159 BTRFS_SNAPSHOT_RECURSIVE |
4160 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4161 if (r < 0) {
4162 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4163 goto finish;
ec16945e
LP
4164 }
4165
4166 free(arg_directory);
4167 arg_directory = np;
8a16a7b4 4168 np = NULL;
ec16945e 4169
17cbb288 4170 remove_directory = true;
30535c16
LP
4171
4172 } else {
4173 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4174 if (r == -EBUSY) {
4175 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4176 goto finish;
4177 }
4178 if (r < 0) {
4179 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4180 goto finish;
30535c16
LP
4181 }
4182
4183 if (arg_template) {
17cbb288
LP
4184 r = btrfs_subvol_snapshot(arg_template, arg_directory,
4185 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
4186 BTRFS_SNAPSHOT_FALLBACK_COPY |
4187 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
4188 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
4189 BTRFS_SNAPSHOT_RECURSIVE |
4190 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4191 if (r == -EEXIST) {
4192 if (!arg_quiet)
4193 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4194 } else if (r < 0) {
83521414 4195 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4196 goto finish;
4197 } else {
4198 if (!arg_quiet)
4199 log_info("Populated %s from template %s.", arg_directory, arg_template);
4200 }
4201 }
ec16945e
LP
4202 }
4203
7732f92b 4204 if (arg_start_mode == START_BOOT) {
1b9e5b12 4205 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4206 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4207 r = -EINVAL;
1b9e5b12
LP
4208 goto finish;
4209 }
4210 } else {
4211 const char *p;
4212
16fb773e
LP
4213 p = strjoina(arg_directory, "/usr/");
4214 if (laccess(p, F_OK) < 0) {
4215 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 4216 r = -EINVAL;
1b9e5b12 4217 goto finish;
1b9e5b12
LP
4218 }
4219 }
ec16945e 4220
6b9132a9 4221 } else {
ec16945e
LP
4222 assert(arg_image);
4223 assert(!arg_template);
4224
0f3be6ca
LP
4225 if (arg_ephemeral) {
4226 _cleanup_free_ char *np = NULL;
4227
4228 r = tempfn_random(arg_image, "machine.", &np);
4229 if (r < 0) {
4230 log_error_errno(r, "Failed to generate name for image snapshot: %m");
4231 goto finish;
4232 }
4233
4234 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4235 if (r < 0) {
4236 r = log_error_errno(r, "Failed to create image lock: %m");
4237 goto finish;
4238 }
4239
4240 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL);
4241 if (r < 0) {
4242 r = log_error_errno(r, "Failed to copy image file: %m");
4243 goto finish;
4244 }
4245
4246 free(arg_image);
4247 arg_image = np;
4248 np = NULL;
4249
4250 remove_image = true;
4251 } else {
4252 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4253 if (r == -EBUSY) {
4254 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4255 goto finish;
4256 }
4257 if (r < 0) {
4258 r = log_error_errno(r, "Failed to create image lock: %m");
4259 goto finish;
4260 }
30535c16
LP
4261 }
4262
c67b0082 4263 if (!mkdtemp(tmprootdir)) {
0f3be6ca 4264 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 4265 goto finish;
1b9e5b12 4266 }
6b9132a9 4267
c67b0082
LP
4268 remove_tmprootdir = true;
4269
4270 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
4271 if (!arg_directory) {
4272 r = log_oom();
4273 goto finish;
6b9132a9 4274 }
88213476 4275
1b9e5b12
LP
4276 image_fd = setup_image(&device_path, &loop_nr);
4277 if (image_fd < 0) {
4278 r = image_fd;
842f3b0f
LP
4279 goto finish;
4280 }
1b9e5b12 4281
4d9f07b4
LP
4282 r = dissect_image(image_fd,
4283 &root_device, &root_device_rw,
4284 &home_device, &home_device_rw,
4285 &srv_device, &srv_device_rw,
a6bc7db9 4286 &esp_device,
4d9f07b4 4287 &secondary);
1b9e5b12
LP
4288 if (r < 0)
4289 goto finish;
0f3be6ca
LP
4290
4291 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
4292 if (remove_image && unlink(arg_image) >= 0)
4293 remove_image = false;
842f3b0f 4294 }
842f3b0f 4295
5a8af538
LP
4296 r = custom_mounts_prepare();
4297 if (r < 0)
4298 goto finish;
4299
03cfe0d5
LP
4300 interactive =
4301 isatty(STDIN_FILENO) > 0 &&
4302 isatty(STDOUT_FILENO) > 0;
9c857b9d 4303
db7feb7e
LP
4304 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4305 if (master < 0) {
ec16945e 4306 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4307 goto finish;
4308 }
4309
611b312b
LP
4310 r = ptsname_malloc(master, &console);
4311 if (r < 0) {
4312 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4313 goto finish;
68b02049
DW
4314 }
4315
4316 if (arg_selinux_apifs_context) {
4317 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4318 if (r < 0)
4319 goto finish;
a258bf26
LP
4320 }
4321
a258bf26 4322 if (unlockpt(master) < 0) {
ec16945e 4323 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4324 goto finish;
4325 }
4326
9c857b9d
LP
4327 if (!arg_quiet)
4328 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4329 arg_machine, arg_image ?: arg_directory);
4330
72c0a2c2 4331 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4332
03cfe0d5
LP
4333 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4334 r = log_error_errno(errno, "Failed to become subreaper: %m");
4335 goto finish;
4336 }
4337
d87be9b0 4338 for (;;) {
b0067625
ZJS
4339 r = run(master,
4340 console,
4341 root_device, root_device_rw,
4342 home_device, home_device_rw,
4343 srv_device, srv_device_rw,
4344 esp_device,
4345 interactive, secondary,
4346 fds,
4347 veth_name, &veth_created,
4348 &exposed,
4349 &pid, &ret);
4350 if (r <= 0)
d87be9b0 4351 break;
d87be9b0 4352 }
88213476
LP
4353
4354finish:
af4ec430 4355 sd_notify(false,
2a49b612
ZJS
4356 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
4357 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 4358
9444b1f2 4359 if (pid > 0)
c67b0082 4360 (void) kill(pid, SIGKILL);
88213476 4361
503546da 4362 /* Try to flush whatever is still queued in the pty */
6a0f896b 4363 if (master >= 0) {
59f448cf 4364 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
6a0f896b
LP
4365 master = safe_close(master);
4366 }
4367
4368 if (pid > 0)
4369 (void) wait_for_terminate(pid, NULL);
503546da 4370
03cfe0d5
LP
4371 loop_remove(loop_nr, &image_fd);
4372
17cbb288 4373 if (remove_directory && arg_directory) {
ec16945e
LP
4374 int k;
4375
17cbb288 4376 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 4377 if (k < 0)
17cbb288 4378 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
4379 }
4380
0f3be6ca
LP
4381 if (remove_image && arg_image) {
4382 if (unlink(arg_image) < 0)
4383 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
4384 }
4385
c67b0082
LP
4386 if (remove_tmprootdir) {
4387 if (rmdir(tmprootdir) < 0)
4388 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
4389 }
4390
785890ac
LP
4391 if (arg_machine) {
4392 const char *p;
4393
63c372cb 4394 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4395 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4396 }
4397
7a8f6325 4398 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4399
4400 if (veth_created)
4401 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4402 (void) remove_bridge(arg_network_zone);
f757855e 4403
04d391da 4404 free(arg_directory);
ec16945e
LP
4405 free(arg_template);
4406 free(arg_image);
7027ff61 4407 free(arg_machine);
c74e630d 4408 free(arg_user);
5f932eb9 4409 free(arg_chdir);
c74e630d 4410 strv_free(arg_setenv);
f757855e 4411 free(arg_network_bridge);
c74e630d
LP
4412 strv_free(arg_network_interfaces);
4413 strv_free(arg_network_macvlan);
4bbfe7ad 4414 strv_free(arg_network_ipvlan);
f6d6bad1 4415 strv_free(arg_network_veth_extra);
f757855e
LP
4416 strv_free(arg_parameters);
4417 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4418 expose_port_free_all(arg_expose_ports);
6d0b55c2 4419
ec16945e 4420 return r < 0 ? EXIT_FAILURE : ret;
88213476 4421}