]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #6266 from keszybz/drop-autotools
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e 20#ifdef HAVE_BLKID
6b5cf3ea 21#include <blkid.h>
8fe0087e 22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
6916b164 41#include <sys/wait.h>
8fe0087e 42#include <unistd.h>
1b9e5b12 43
b053cd5f 44#include "sd-bus.h"
1f0cd86b 45#include "sd-daemon.h"
1f0cd86b 46#include "sd-id128.h"
8fe0087e 47
b5efdb8a 48#include "alloc-util.h"
8fe0087e
LP
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
b053cd5f 53#include "bus-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
2d845785 59#include "dissect-image.h"
8fe0087e 60#include "env-util.h"
3ffd4af2 61#include "fd-util.h"
842f3b0f 62#include "fdset.h"
a5c32cff 63#include "fileio.h"
f97b34a6 64#include "format-util.h"
f4f15635 65#include "fs-util.h"
1b9e5b12 66#include "gpt.h"
4623e8e6 67#include "hexdecoct.h"
8fe0087e 68#include "hostname-util.h"
910fd145 69#include "id128-util.h"
8fe0087e 70#include "log.h"
2d845785 71#include "loop-util.h"
8fe0087e 72#include "loopback-setup.h"
1b9cebf6 73#include "machine-image.h"
8fe0087e
LP
74#include "macro.h"
75#include "missing.h"
76#include "mkdir.h"
4349cd7c 77#include "mount-util.h"
8fe0087e 78#include "netlink-util.h"
07630cea
LP
79#include "nspawn-cgroup.h"
80#include "nspawn-expose-ports.h"
81#include "nspawn-mount.h"
82#include "nspawn-network.h"
7336138e 83#include "nspawn-patch-uid.h"
07630cea 84#include "nspawn-register.h"
910fd145 85#include "nspawn-seccomp.h"
07630cea
LP
86#include "nspawn-settings.h"
87#include "nspawn-setuid.h"
7732f92b 88#include "nspawn-stub-pid1.h"
6bedfcbb 89#include "parse-util.h"
8fe0087e 90#include "path-util.h"
0b452006 91#include "process-util.h"
8fe0087e
LP
92#include "ptyfwd.h"
93#include "random-util.h"
8869a0b4 94#include "raw-clone.h"
8fe0087e 95#include "rm-rf.h"
68b02049 96#include "selinux-util.h"
8fe0087e 97#include "signal-util.h"
2583fbea 98#include "socket-util.h"
8fcde012 99#include "stat-util.h"
15a5e950 100#include "stdio-util.h"
07630cea 101#include "string-util.h"
8fe0087e
LP
102#include "strv.h"
103#include "terminal-util.h"
104#include "udev-util.h"
affb60b1 105#include "umask-util.h"
b1d4f8e1 106#include "user-util.h"
8fe0087e 107#include "util.h"
e9642be2 108
0e7ac751 109/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
110 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
111 * may have their own allocation ranges too. */
0e7ac751
LP
112#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
113#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 114
9c1e04d0
AP
/* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
 * The init process inside the container can send messages to nspawn over it, following the sd_notify(3) protocol. */
118#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 119
2a49b612
ZJS
120#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* Final state of a container run. How the values are consumed is not visible in this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* first enumerator, value 0 */
        CONTAINER_REBOOTED,     /* second enumerator, value 1 */
} ContainerStatus;
126
57fb9fb5
LP
/* Journal linking mode, as selected with --link-journal= (and -j). The "try-" variants of the command line
 * option map to LINK_GUEST/LINK_HOST as well and are tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST,     /* --link-journal=guest and try-guest, also -j */
} LinkJournal;
88213476
LP
133
134static char *arg_directory = NULL;
ec16945e 135static char *arg_template = NULL;
5f932eb9 136static char *arg_chdir = NULL;
b53ede69
PW
137static char *arg_pivot_root_new = NULL;
138static char *arg_pivot_root_old = NULL;
687d0825 139static char *arg_user = NULL;
9444b1f2 140static sd_id128_t arg_uuid = {};
7027ff61 141static char *arg_machine = NULL;
c74e630d
LP
142static const char *arg_selinux_context = NULL;
143static const char *arg_selinux_apifs_context = NULL;
9444b1f2 144static const char *arg_slice = NULL;
ff01d048 145static bool arg_private_network = false;
bc2f673e 146static bool arg_read_only = false;
7732f92b 147static StartMode arg_start_mode = START_PID1;
ec16945e 148static bool arg_ephemeral = false;
57fb9fb5 149static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 150static bool arg_link_journal_try = false;
520e0d54 151static uint64_t arg_caps_retain =
50b52222
LP
152 (1ULL << CAP_AUDIT_CONTROL) |
153 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
154 (1ULL << CAP_CHOWN) |
155 (1ULL << CAP_DAC_OVERRIDE) |
156 (1ULL << CAP_DAC_READ_SEARCH) |
157 (1ULL << CAP_FOWNER) |
158 (1ULL << CAP_FSETID) |
159 (1ULL << CAP_IPC_OWNER) |
160 (1ULL << CAP_KILL) |
161 (1ULL << CAP_LEASE) |
162 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 163 (1ULL << CAP_MKNOD) |
5076f0cc
LP
164 (1ULL << CAP_NET_BIND_SERVICE) |
165 (1ULL << CAP_NET_BROADCAST) |
166 (1ULL << CAP_NET_RAW) |
5076f0cc 167 (1ULL << CAP_SETFCAP) |
50b52222 168 (1ULL << CAP_SETGID) |
5076f0cc
LP
169 (1ULL << CAP_SETPCAP) |
170 (1ULL << CAP_SETUID) |
171 (1ULL << CAP_SYS_ADMIN) |
50b52222 172 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
173 (1ULL << CAP_SYS_CHROOT) |
174 (1ULL << CAP_SYS_NICE) |
175 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 176 (1ULL << CAP_SYS_RESOURCE) |
50b52222 177 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
178static CustomMount *arg_custom_mounts = NULL;
179static unsigned arg_n_custom_mounts = 0;
f4889f65 180static char **arg_setenv = NULL;
284c0b91 181static bool arg_quiet = false;
eb91eb18 182static bool arg_register = true;
89f7c846 183static bool arg_keep_unit = false;
aa28aefe 184static char **arg_network_interfaces = NULL;
c74e630d 185static char **arg_network_macvlan = NULL;
4bbfe7ad 186static char **arg_network_ipvlan = NULL;
69c79d3c 187static bool arg_network_veth = false;
f6d6bad1 188static char **arg_network_veth_extra = NULL;
f757855e 189static char *arg_network_bridge = NULL;
22b28dfd 190static char *arg_network_zone = NULL;
050f7277 191static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 192static char *arg_image = NULL;
f757855e 193static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 194static ExposePort *arg_expose_ports = NULL;
f36933fe 195static char **arg_property = NULL;
0de7acce 196static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 197static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 198static bool arg_userns_chown = false;
c6c8f6e2 199static int arg_kill_signal = 0;
5da38d07 200static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
201static SettingsMask arg_settings_mask = 0;
202static int arg_settings_trusted = -1;
203static char **arg_parameters = NULL;
6aadfa4c 204static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 205static bool arg_notify_ready = false;
5a8ff0e6 206static bool arg_use_cgns = true;
0c582db0 207static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 208static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
209static void *arg_root_hash = NULL;
210static size_t arg_root_hash_size = 0;
88213476 211
601185b4 212static void help(void) {
88213476
LP
213 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
214 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
215 " -h --help Show this help\n"
216 " --version Print version string\n"
69c79d3c 217 " -q --quiet Do not show status information\n"
1b9e5b12 218 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
219 " --template=PATH Initialize root directory from template directory,\n"
220 " if missing\n"
221 " -x --ephemeral Run container with snapshot of root directory, and\n"
222 " remove it after exit\n"
223 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 224 " --root-hash=HASH Specify verity root hash\n"
7732f92b 225 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 226 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 227 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
228 " --pivot-root=PATH[:PATH]\n"
229 " Pivot root to given directory in the container\n"
a8828ed9 230 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 231 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 232 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 233 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 234 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 235 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 236 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 237 " Similar, but with user configured UID/GID range\n"
24597ee0 238 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
239 " --private-network Disable network in container\n"
240 " --network-interface=INTERFACE\n"
241 " Assign an existing network interface to the\n"
242 " container\n"
c74e630d
LP
243 " --network-macvlan=INTERFACE\n"
244 " Create a macvlan network interface based on an\n"
245 " existing network interface to the container\n"
4bbfe7ad
TG
246 " --network-ipvlan=INTERFACE\n"
247 " Create a ipvlan network interface based on an\n"
248 " existing network interface to the container\n"
a8eaaee7 249 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 250 " and container\n"
f6d6bad1
LP
251 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
252 " Add an additional virtual Ethernet link between\n"
253 " host and container\n"
ab046dde 254 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
255 " Add a virtual Ethernet connection to the container\n"
256 " and attach it to an existing bridge on the host\n"
257 " --network-zone=NAME Similar, but attach the new interface to an\n"
258 " an automatically managed bridge interface\n"
6d0b55c2 259 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 260 " Expose a container IP port on the host\n"
82adf6af
LP
261 " -Z --selinux-context=SECLABEL\n"
262 " Set the SELinux security context to be used by\n"
263 " processes in the container\n"
264 " -L --selinux-apifs-context=SECLABEL\n"
265 " Set the SELinux security context to be used by\n"
266 " API/tmpfs file systems in the container\n"
a8828ed9
DW
267 " --capability=CAP In addition to the default, retain specified\n"
268 " capability\n"
269 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 270 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
271 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
272 " host, try-guest, try-host\n"
574edc90 273 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 274 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
275 " --bind=PATH[:PATH[:OPTIONS]]\n"
276 " Bind mount a file or directory from the host into\n"
a8828ed9 277 " the container\n"
5e5bfa6e
EY
278 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
279 " Similar, but creates a read-only bind mount\n"
06c17c39 280 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
281 " --overlay=PATH[:PATH...]:PATH\n"
282 " Create an overlay mount from the host to \n"
283 " the container\n"
284 " --overlay-ro=PATH[:PATH...]:PATH\n"
285 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 286 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 287 " --register=BOOLEAN Register container as machine\n"
89f7c846 288 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 289 " the service unit nspawn is running in\n"
6d0b55c2 290 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 291 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 292 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 293 , program_invocation_short_name);
88213476
LP
294}
295
86c0dd4a 296static int custom_mount_check_all(void) {
5a8af538 297 unsigned i;
5a8af538 298
5a8af538
LP
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
0de7acce 302 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
303
304 if (arg_userns_chown) {
305 log_error("--private-users-chown may not be combined with custom root mounts.");
306 return -EINVAL;
307 } else if (arg_uid_shift == UID_INVALID) {
308 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
309 return -EINVAL;
310 }
825d5287 311 }
5a8af538
LP
312 }
313
314 return 0;
315}
316
0fd9563f 317static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 318 const char *e;
415fc41c 319 int r;
5da38d07 320
efdb0237
LP
321 /* Allow the user to control whether the unified hierarchy is used */
322 e = getenv("UNIFIED_CGROUP_HIERARCHY");
323 if (e) {
324 r = parse_boolean(e);
325 if (r < 0)
326 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
327 if (r > 0)
328 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
329 else
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 331
efdb0237
LP
332 return 0;
333 }
334
335 /* Otherwise inherit the default from the host system */
b4cccbc1
LP
336 r = cg_all_unified();
337 if (r < 0)
338 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
339 if (r > 0) {
a8725a06
ZJS
340 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
341 * routine only detects 231, so we'll have a false negative here for 230. */
342 r = systemd_installation_has_version(directory, 230);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine systemd version in container: %m");
345 if (r > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
347 else
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 349 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
350 /* Mixed cgroup hierarchy support was added in 233 */
351 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
352 if (r < 0)
353 return log_error_errno(r, "Failed to determine systemd version in container: %m");
354 if (r > 0)
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
356 else
357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
358 } else
5da38d07 359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 360
efdb0237
LP
361 return 0;
362}
363
0c582db0
LB
364static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
365 int r;
366
367 r = getenv_bool(name);
368 if (r == -ENXIO)
369 return;
370 if (r < 0)
371 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
372 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
373}
374
4f086aab
SU
375static void parse_mount_settings_env(void) {
376 int r;
377 const char *e;
378
379 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
380 if (!e)
381 return;
382
383 if (streq(e, "network")) {
384 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
385 return;
386 }
387
388 r = parse_boolean(e);
389 if (r < 0) {
390 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
391 return;
ab8ee0f2 392 }
4f086aab 393
ab8ee0f2
ZJS
394 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
395 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
396}
397
88213476
LP
398static int parse_argv(int argc, char *argv[]) {
399
a41fe3a2 400 enum {
acbeb427
ZJS
401 ARG_VERSION = 0x100,
402 ARG_PRIVATE_NETWORK,
bc2f673e 403 ARG_UUID,
5076f0cc 404 ARG_READ_ONLY,
57fb9fb5 405 ARG_CAPABILITY,
420c7379 406 ARG_DROP_CAPABILITY,
17fe0523
LP
407 ARG_LINK_JOURNAL,
408 ARG_BIND,
f4889f65 409 ARG_BIND_RO,
06c17c39 410 ARG_TMPFS,
5a8af538
LP
411 ARG_OVERLAY,
412 ARG_OVERLAY_RO,
eb91eb18 413 ARG_SHARE_SYSTEM,
89f7c846 414 ARG_REGISTER,
aa28aefe 415 ARG_KEEP_UNIT,
69c79d3c 416 ARG_NETWORK_INTERFACE,
c74e630d 417 ARG_NETWORK_MACVLAN,
4bbfe7ad 418 ARG_NETWORK_IPVLAN,
ab046dde 419 ARG_NETWORK_BRIDGE,
22b28dfd 420 ARG_NETWORK_ZONE,
f6d6bad1 421 ARG_NETWORK_VETH_EXTRA,
6afc95b7 422 ARG_PERSONALITY,
4d9f07b4 423 ARG_VOLATILE,
ec16945e 424 ARG_TEMPLATE,
f36933fe 425 ARG_PROPERTY,
6dac160c 426 ARG_PRIVATE_USERS,
c6c8f6e2 427 ARG_KILL_SIGNAL,
f757855e 428 ARG_SETTINGS,
5f932eb9 429 ARG_CHDIR,
b53ede69 430 ARG_PIVOT_ROOT,
7336138e 431 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 432 ARG_NOTIFY_READY,
4623e8e6 433 ARG_ROOT_HASH,
a41fe3a2
LP
434 };
435
88213476 436 static const struct option options[] = {
27eb8e90
ZJS
437 { "help", no_argument, NULL, 'h' },
438 { "version", no_argument, NULL, ARG_VERSION },
439 { "directory", required_argument, NULL, 'D' },
440 { "template", required_argument, NULL, ARG_TEMPLATE },
441 { "ephemeral", no_argument, NULL, 'x' },
442 { "user", required_argument, NULL, 'u' },
443 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
444 { "as-pid2", no_argument, NULL, 'a' },
445 { "boot", no_argument, NULL, 'b' },
446 { "uuid", required_argument, NULL, ARG_UUID },
447 { "read-only", no_argument, NULL, ARG_READ_ONLY },
448 { "capability", required_argument, NULL, ARG_CAPABILITY },
449 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
450 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
451 { "bind", required_argument, NULL, ARG_BIND },
452 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
453 { "tmpfs", required_argument, NULL, ARG_TMPFS },
454 { "overlay", required_argument, NULL, ARG_OVERLAY },
455 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
456 { "machine", required_argument, NULL, 'M' },
457 { "slice", required_argument, NULL, 'S' },
458 { "setenv", required_argument, NULL, 'E' },
459 { "selinux-context", required_argument, NULL, 'Z' },
460 { "selinux-apifs-context", required_argument, NULL, 'L' },
461 { "quiet", no_argument, NULL, 'q' },
462 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
463 { "register", required_argument, NULL, ARG_REGISTER },
464 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
465 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
466 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
467 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
468 { "network-veth", no_argument, NULL, 'n' },
469 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
470 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
471 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
472 { "personality", required_argument, NULL, ARG_PERSONALITY },
473 { "image", required_argument, NULL, 'i' },
474 { "volatile", optional_argument, NULL, ARG_VOLATILE },
475 { "port", required_argument, NULL, 'p' },
476 { "property", required_argument, NULL, ARG_PROPERTY },
477 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
478 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
479 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
480 { "settings", required_argument, NULL, ARG_SETTINGS },
481 { "chdir", required_argument, NULL, ARG_CHDIR },
b53ede69 482 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
27eb8e90 483 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
4623e8e6 484 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
eb9da376 485 {}
88213476
LP
486 };
487
9444b1f2 488 int c, r;
6aadfa4c 489 const char *p, *e;
a42c8b54 490 uint64_t plus = 0, minus = 0;
f757855e 491 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
492
493 assert(argc >= 0);
494 assert(argv);
495
2e1f244e 496 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
497
498 switch (c) {
499
500 case 'h':
601185b4
ZJS
501 help();
502 return 0;
88213476 503
acbeb427 504 case ARG_VERSION:
3f6fd1ba 505 return version();
acbeb427 506
88213476 507 case 'D':
0f03c2a4 508 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 509 if (r < 0)
0f03c2a4 510 return r;
ec16945e
LP
511 break;
512
513 case ARG_TEMPLATE:
0f03c2a4 514 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 515 if (r < 0)
0f03c2a4 516 return r;
88213476
LP
517 break;
518
1b9e5b12 519 case 'i':
0f03c2a4 520 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 521 if (r < 0)
0f03c2a4 522 return r;
ec16945e
LP
523 break;
524
525 case 'x':
526 arg_ephemeral = true;
1b9e5b12
LP
527 break;
528
687d0825 529 case 'u':
2fc09a9c
DM
530 r = free_and_strdup(&arg_user, optarg);
531 if (r < 0)
7027ff61 532 return log_oom();
687d0825 533
f757855e 534 arg_settings_mask |= SETTING_USER;
687d0825
MV
535 break;
536
22b28dfd
LP
537 case ARG_NETWORK_ZONE: {
538 char *j;
539
540 j = strappend("vz-", optarg);
541 if (!j)
542 return log_oom();
543
544 if (!ifname_valid(j)) {
545 log_error("Network zone name not valid: %s", j);
546 free(j);
547 return -EINVAL;
548 }
549
550 free(arg_network_zone);
551 arg_network_zone = j;
552
553 arg_network_veth = true;
554 arg_private_network = true;
555 arg_settings_mask |= SETTING_NETWORK;
556 break;
557 }
558
ab046dde 559 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
560
561 if (!ifname_valid(optarg)) {
562 log_error("Bridge interface name not valid: %s", optarg);
563 return -EINVAL;
564 }
565
f757855e
LP
566 r = free_and_strdup(&arg_network_bridge, optarg);
567 if (r < 0)
568 return log_oom();
ab046dde
TG
569
570 /* fall through */
571
0dfaa006 572 case 'n':
69c79d3c
LP
573 arg_network_veth = true;
574 arg_private_network = true;
f757855e 575 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
576 break;
577
f6d6bad1
LP
578 case ARG_NETWORK_VETH_EXTRA:
579 r = veth_extra_parse(&arg_network_veth_extra, optarg);
580 if (r < 0)
581 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
582
583 arg_private_network = true;
584 arg_settings_mask |= SETTING_NETWORK;
585 break;
586
aa28aefe 587 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
588
589 if (!ifname_valid(optarg)) {
590 log_error("Network interface name not valid: %s", optarg);
591 return -EINVAL;
592 }
593
c74e630d
LP
594 if (strv_extend(&arg_network_interfaces, optarg) < 0)
595 return log_oom();
596
597 arg_private_network = true;
f757855e 598 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
599 break;
600
601 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
602
603 if (!ifname_valid(optarg)) {
604 log_error("MACVLAN network interface name not valid: %s", optarg);
605 return -EINVAL;
606 }
607
c74e630d 608 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
609 return log_oom();
610
4bbfe7ad 611 arg_private_network = true;
f757855e 612 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
613 break;
614
615 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
616
617 if (!ifname_valid(optarg)) {
618 log_error("IPVLAN network interface name not valid: %s", optarg);
619 return -EINVAL;
620 }
621
4bbfe7ad
TG
622 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
623 return log_oom();
624
aa28aefe
LP
625 /* fall through */
626
ff01d048
LP
627 case ARG_PRIVATE_NETWORK:
628 arg_private_network = true;
f757855e 629 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
630 break;
631
0f0dbc46 632 case 'b':
7732f92b
LP
633 if (arg_start_mode == START_PID2) {
634 log_error("--boot and --as-pid2 may not be combined.");
635 return -EINVAL;
636 }
637
638 arg_start_mode = START_BOOT;
639 arg_settings_mask |= SETTING_START_MODE;
640 break;
641
642 case 'a':
643 if (arg_start_mode == START_BOOT) {
644 log_error("--boot and --as-pid2 may not be combined.");
645 return -EINVAL;
646 }
647
648 arg_start_mode = START_PID2;
649 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
650 break;
651
144f0fc0 652 case ARG_UUID:
9444b1f2 653 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
654 if (r < 0)
655 return log_error_errno(r, "Invalid UUID: %s", optarg);
656
657 if (sd_id128_is_null(arg_uuid)) {
658 log_error("Machine UUID may not be all zeroes.");
659 return -EINVAL;
aa96c6cb 660 }
f757855e
LP
661
662 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 663 break;
aa96c6cb 664
9444b1f2 665 case 'S':
c74e630d 666 arg_slice = optarg;
144f0fc0
LP
667 break;
668
7027ff61 669 case 'M':
c1521918 670 if (isempty(optarg))
97b11eed 671 arg_machine = mfree(arg_machine);
c1521918 672 else {
0c3c4284 673 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
674 log_error("Invalid machine name: %s", optarg);
675 return -EINVAL;
676 }
7027ff61 677
0c3c4284
LP
678 r = free_and_strdup(&arg_machine, optarg);
679 if (r < 0)
eb91eb18 680 return log_oom();
eb91eb18 681 }
9ce6d1b3 682 break;
7027ff61 683
82adf6af
LP
684 case 'Z':
685 arg_selinux_context = optarg;
a8828ed9
DW
686 break;
687
82adf6af
LP
688 case 'L':
689 arg_selinux_apifs_context = optarg;
a8828ed9
DW
690 break;
691
bc2f673e
LP
692 case ARG_READ_ONLY:
693 arg_read_only = true;
f757855e 694 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
695 break;
696
420c7379
LP
697 case ARG_CAPABILITY:
698 case ARG_DROP_CAPABILITY: {
6cbe4ed1 699 p = optarg;
9ed794a3 700 for (;;) {
6cbe4ed1 701 _cleanup_free_ char *t = NULL;
5076f0cc 702
6cbe4ed1
SS
703 r = extract_first_word(&p, &t, ",", 0);
704 if (r < 0)
705 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 706
6cbe4ed1
SS
707 if (r == 0)
708 break;
5076f0cc 709
39ed67d1
LP
710 if (streq(t, "all")) {
711 if (c == ARG_CAPABILITY)
a42c8b54 712 plus = (uint64_t) -1;
39ed67d1 713 else
a42c8b54 714 minus = (uint64_t) -1;
39ed67d1 715 } else {
2822da4f
LP
716 int cap;
717
718 cap = capability_from_name(t);
719 if (cap < 0) {
39ed67d1
LP
720 log_error("Failed to parse capability %s.", t);
721 return -EINVAL;
722 }
723
724 if (c == ARG_CAPABILITY)
a42c8b54 725 plus |= 1ULL << (uint64_t) cap;
39ed67d1 726 else
a42c8b54 727 minus |= 1ULL << (uint64_t) cap;
5076f0cc 728 }
5076f0cc
LP
729 }
730
f757855e 731 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
732 break;
733 }
734
57fb9fb5
LP
735 case 'j':
736 arg_link_journal = LINK_GUEST;
574edc90 737 arg_link_journal_try = true;
57fb9fb5
LP
738 break;
739
740 case ARG_LINK_JOURNAL:
53e438e3 741 if (streq(optarg, "auto")) {
57fb9fb5 742 arg_link_journal = LINK_AUTO;
53e438e3
LP
743 arg_link_journal_try = false;
744 } else if (streq(optarg, "no")) {
57fb9fb5 745 arg_link_journal = LINK_NO;
53e438e3
LP
746 arg_link_journal_try = false;
747 } else if (streq(optarg, "guest")) {
57fb9fb5 748 arg_link_journal = LINK_GUEST;
53e438e3
LP
749 arg_link_journal_try = false;
750 } else if (streq(optarg, "host")) {
57fb9fb5 751 arg_link_journal = LINK_HOST;
53e438e3
LP
752 arg_link_journal_try = false;
753 } else if (streq(optarg, "try-guest")) {
574edc90
MP
754 arg_link_journal = LINK_GUEST;
755 arg_link_journal_try = true;
756 } else if (streq(optarg, "try-host")) {
757 arg_link_journal = LINK_HOST;
758 arg_link_journal_try = true;
759 } else {
57fb9fb5
LP
760 log_error("Failed to parse link journal mode %s", optarg);
761 return -EINVAL;
762 }
763
764 break;
765
17fe0523 766 case ARG_BIND:
f757855e
LP
767 case ARG_BIND_RO:
768 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
769 if (r < 0)
770 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 771
f757855e 772 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 773 break;
06c17c39 774
f757855e
LP
775 case ARG_TMPFS:
776 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
777 if (r < 0)
778 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 779
f757855e 780 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 781 break;
5a8af538
LP
782
783 case ARG_OVERLAY:
ad85779a
LP
784 case ARG_OVERLAY_RO:
785 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
786 if (r == -EADDRNOTAVAIL)
787 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
788 if (r < 0)
789 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 790
f757855e 791 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 792 break;
06c17c39 793
a5f1cb3b 794 case 'E': {
f4889f65
LP
795 char **n;
796
797 if (!env_assignment_is_valid(optarg)) {
798 log_error("Environment variable assignment '%s' is not valid.", optarg);
799 return -EINVAL;
800 }
801
802 n = strv_env_set(arg_setenv, optarg);
803 if (!n)
804 return log_oom();
805
806 strv_free(arg_setenv);
807 arg_setenv = n;
f757855e
LP
808
809 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
810 break;
811 }
812
284c0b91
LP
813 case 'q':
814 arg_quiet = true;
815 break;
816
8a96d94e 817 case ARG_SHARE_SYSTEM:
a6b5216c 818 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
819 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
820 arg_clone_ns_flags = 0;
8a96d94e
LP
821 break;
822
eb91eb18
LP
823 case ARG_REGISTER:
824 r = parse_boolean(optarg);
825 if (r < 0) {
826 log_error("Failed to parse --register= argument: %s", optarg);
827 return r;
828 }
829
830 arg_register = r;
831 break;
832
89f7c846
LP
833 case ARG_KEEP_UNIT:
834 arg_keep_unit = true;
835 break;
836
6afc95b7
LP
837 case ARG_PERSONALITY:
838
ac45f971 839 arg_personality = personality_from_string(optarg);
050f7277 840 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
841 log_error("Unknown or unsupported personality '%s'.", optarg);
842 return -EINVAL;
843 }
844
f757855e 845 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
846 break;
847
4d9f07b4
LP
848 case ARG_VOLATILE:
849
850 if (!optarg)
f757855e 851 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 852 else {
f757855e 853 VolatileMode m;
4d9f07b4 854
f757855e
LP
855 m = volatile_mode_from_string(optarg);
856 if (m < 0) {
857 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 858 return -EINVAL;
f757855e
LP
859 } else
860 arg_volatile_mode = m;
6d0b55c2
LP
861 }
862
f757855e
LP
863 arg_settings_mask |= SETTING_VOLATILE_MODE;
864 break;
6d0b55c2 865
f757855e
LP
866 case 'p':
867 r = expose_port_parse(&arg_expose_ports, optarg);
868 if (r == -EEXIST)
869 return log_error_errno(r, "Duplicate port specification: %s", optarg);
870 if (r < 0)
871 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 872
f757855e 873 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 874 break;
6d0b55c2 875
f36933fe
LP
876 case ARG_PROPERTY:
877 if (strv_extend(&arg_property, optarg) < 0)
878 return log_oom();
879
880 break;
881
ae209204
ZJS
882 case ARG_PRIVATE_USERS: {
883 int boolean = -1;
0de7acce 884
ae209204
ZJS
885 if (!optarg)
886 boolean = true;
887 else if (!in_charset(optarg, DIGITS))
888 /* do *not* parse numbers as booleans */
889 boolean = parse_boolean(optarg);
890
891 if (boolean == false) {
0de7acce
LP
892 /* no: User namespacing off */
893 arg_userns_mode = USER_NAMESPACE_NO;
894 arg_uid_shift = UID_INVALID;
895 arg_uid_range = UINT32_C(0x10000);
ae209204 896 } else if (boolean == true) {
0de7acce
LP
897 /* yes: User namespacing on, UID range is read from root dir */
898 arg_userns_mode = USER_NAMESPACE_FIXED;
899 arg_uid_shift = UID_INVALID;
900 arg_uid_range = UINT32_C(0x10000);
901 } else if (streq(optarg, "pick")) {
902 /* pick: User namespacing on, UID range is picked randomly */
903 arg_userns_mode = USER_NAMESPACE_PICK;
904 arg_uid_shift = UID_INVALID;
905 arg_uid_range = UINT32_C(0x10000);
906 } else {
6c2058b3 907 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
908 const char *range, *shift;
909
0de7acce
LP
910 /* anything else: User namespacing on, UID range is explicitly configured */
911
6dac160c
LP
912 range = strchr(optarg, ':');
913 if (range) {
6c2058b3
ZJS
914 buffer = strndup(optarg, range - optarg);
915 if (!buffer)
916 return log_oom();
917 shift = buffer;
6dac160c
LP
918
919 range++;
bfd292ec
ZJS
920 r = safe_atou32(range, &arg_uid_range);
921 if (r < 0)
be715731 922 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
923 } else
924 shift = optarg;
925
be715731
ZJS
926 r = parse_uid(shift, &arg_uid_shift);
927 if (r < 0)
928 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
929
930 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
931 }
932
be715731
ZJS
933 if (arg_uid_range <= 0) {
934 log_error("UID range cannot be 0.");
935 return -EINVAL;
936 }
937
0de7acce 938 arg_settings_mask |= SETTING_USERNS;
6dac160c 939 break;
ae209204 940 }
6dac160c 941
0de7acce 942 case 'U':
ccabee0d
LP
943 if (userns_supported()) {
944 arg_userns_mode = USER_NAMESPACE_PICK;
945 arg_uid_shift = UID_INVALID;
946 arg_uid_range = UINT32_C(0x10000);
947
948 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
949 }
950
7336138e
LP
951 break;
952
0de7acce 953 case ARG_PRIVATE_USERS_CHOWN:
19aac838 954 arg_userns_chown = true;
0de7acce
LP
955
956 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
957 break;
958
c6c8f6e2
LP
959 case ARG_KILL_SIGNAL:
960 arg_kill_signal = signal_from_string_try_harder(optarg);
961 if (arg_kill_signal < 0) {
962 log_error("Cannot parse signal: %s", optarg);
963 return -EINVAL;
964 }
965
f757855e
LP
966 arg_settings_mask |= SETTING_KILL_SIGNAL;
967 break;
968
969 case ARG_SETTINGS:
970
971 /* no → do not read files
972 * yes → read files, do not override cmdline, trust only subset
973 * override → read files, override cmdline, trust only subset
974 * trusted → read files, do not override cmdline, trust all
975 */
976
977 r = parse_boolean(optarg);
978 if (r < 0) {
979 if (streq(optarg, "trusted")) {
980 mask_all_settings = false;
981 mask_no_settings = false;
982 arg_settings_trusted = true;
983
984 } else if (streq(optarg, "override")) {
985 mask_all_settings = false;
986 mask_no_settings = true;
987 arg_settings_trusted = -1;
988 } else
989 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
990 } else if (r > 0) {
991 /* yes */
992 mask_all_settings = false;
993 mask_no_settings = false;
994 arg_settings_trusted = -1;
995 } else {
996 /* no */
997 mask_all_settings = true;
998 mask_no_settings = false;
999 arg_settings_trusted = false;
1000 }
1001
c6c8f6e2
LP
1002 break;
1003
5f932eb9
LP
1004 case ARG_CHDIR:
1005 if (!path_is_absolute(optarg)) {
1006 log_error("Working directory %s is not an absolute path.", optarg);
1007 return -EINVAL;
1008 }
1009
1010 r = free_and_strdup(&arg_chdir, optarg);
1011 if (r < 0)
1012 return log_oom();
1013
1014 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1015 break;
1016
b53ede69
PW
1017 case ARG_PIVOT_ROOT:
1018 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1019 if (r < 0)
1020 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1021
1022 arg_settings_mask |= SETTING_PIVOT_ROOT;
1023 break;
1024
9c1e04d0
AP
1025 case ARG_NOTIFY_READY:
1026 r = parse_boolean(optarg);
1027 if (r < 0) {
1028 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1029 return -EINVAL;
1030 }
1031 arg_notify_ready = r;
1032 arg_settings_mask |= SETTING_NOTIFY_READY;
1033 break;
1034
4623e8e6
LP
1035 case ARG_ROOT_HASH: {
1036 void *k;
1037 size_t l;
1038
1039 r = unhexmem(optarg, strlen(optarg), &k, &l);
1040 if (r < 0)
1041 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1042 if (l < sizeof(sd_id128_t)) {
1043 log_error("Root hash must be at least 128bit long: %s", optarg);
1044 free(k);
1045 return -EINVAL;
1046 }
1047
1048 free(arg_root_hash);
1049 arg_root_hash = k;
1050 arg_root_hash_size = l;
1051 break;
1052 }
1053
88213476
LP
1054 case '?':
1055 return -EINVAL;
1056
1057 default:
eb9da376 1058 assert_not_reached("Unhandled option");
88213476 1059 }
88213476 1060
0c582db0
LB
1061 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1062 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1063 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1064 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1065
4f086aab
SU
1066 if (arg_userns_mode != USER_NAMESPACE_NO)
1067 arg_mount_settings |= MOUNT_USE_USERNS;
1068
1069 if (arg_private_network)
1070 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1071
1072 parse_mount_settings_env();
1073
48a8d337
LB
1074 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1075 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1076 arg_register = false;
0c582db0
LB
1077 if (arg_start_mode != START_PID1) {
1078 log_error("--boot cannot be used without namespacing.");
1079 return -EINVAL;
1080 }
1081 }
eb91eb18 1082
0de7acce 1083 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1084 arg_userns_chown = true;
1085
cd2dfc6f
LP
1086 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
1087 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846
LP
1088 return -EINVAL;
1089 }
1090
1b9e5b12
LP
1091 if (arg_directory && arg_image) {
1092 log_error("--directory= and --image= may not be combined.");
1093 return -EINVAL;
1094 }
1095
ec16945e
LP
1096 if (arg_template && arg_image) {
1097 log_error("--template= and --image= may not be combined.");
1098 return -EINVAL;
1099 }
1100
8cd328d8
LP
1101 if (arg_ephemeral && arg_template && !arg_directory) {
1102 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1103 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1104 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1105 * --directory=". */
1106
1107 arg_directory = arg_template;
1108 arg_template = NULL;
1109 }
1110
ec16945e
LP
1111 if (arg_template && !(arg_directory || arg_machine)) {
1112 log_error("--template= needs --directory= or --machine=.");
1113 return -EINVAL;
1114 }
1115
1116 if (arg_ephemeral && arg_template) {
1117 log_error("--ephemeral and --template= may not be combined.");
1118 return -EINVAL;
1119 }
1120
df9a75e4
LP
1121 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1122 log_error("--ephemeral and --link-journal= may not be combined.");
1123 return -EINVAL;
1124 }
1125
ccabee0d 1126 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1127 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1128 return -EOPNOTSUPP;
1129 }
1130
1131 if (arg_userns_chown && arg_read_only) {
1132 log_error("--read-only and --private-users-chown may not be combined.");
1133 return -EINVAL;
1134 }
f757855e 1135
22b28dfd
LP
1136 if (arg_network_bridge && arg_network_zone) {
1137 log_error("--network-bridge= and --network-zone= may not be combined.");
1138 return -EINVAL;
1139 }
1140
f757855e
LP
1141 if (argc > optind) {
1142 arg_parameters = strv_copy(argv + optind);
1143 if (!arg_parameters)
1144 return log_oom();
1145
7732f92b 1146 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1147 }
1148
1149 /* Load all settings from .nspawn files */
1150 if (mask_no_settings)
1151 arg_settings_mask = 0;
1152
1153 /* Don't load any settings from .nspawn files */
1154 if (mask_all_settings)
1155 arg_settings_mask = _SETTINGS_MASK_ALL;
1156
520e0d54 1157 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1158
399e391f
ZJS
1159 r = cg_unified_flush();
1160 if (r < 0)
1161 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1162
6aadfa4c
ILG
1163 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1164 if (e)
1165 arg_container_service_name = e;
1166
5a8ff0e6
CB
1167 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1168 if (r < 0)
1169 arg_use_cgns = cg_ns_supported();
1170 else
1171 arg_use_cgns = r;
1172
86c0dd4a
LP
1173 r = custom_mount_check_all();
1174 if (r < 0)
1175 return r;
1176
f757855e
LP
1177 return 1;
1178}
1179
1180static int verify_arguments(void) {
4f086aab
SU
1181 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1182 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1183 return -EINVAL;
1184 }
1185
1186 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1187 log_error("Cannot combine --private-users with read-write mounts.");
1188 return -EINVAL;
1189 }
f757855e
LP
1190
1191 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1192 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1193 return -EINVAL;
1194 }
1195
6d0b55c2
LP
1196 if (arg_expose_ports && !arg_private_network) {
1197 log_error("Cannot use --port= without private networking.");
1198 return -EINVAL;
1199 }
1200
1c1ea217
EV
1201#ifndef HAVE_LIBIPTC
1202 if (arg_expose_ports) {
1203 log_error("--port= is not supported, compiled without libiptc support.");
1204 return -EOPNOTSUPP;
1205 }
1206#endif
1207
7732f92b 1208 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1209 arg_kill_signal = SIGRTMIN+3;
1210
f757855e 1211 return 0;
88213476
LP
1212}
1213
03cfe0d5
LP
1214static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1215 assert(p);
1216
0de7acce 1217 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1218 return 0;
1219
1220 if (uid == UID_INVALID && gid == GID_INVALID)
1221 return 0;
1222
1223 if (uid != UID_INVALID) {
1224 uid += arg_uid_shift;
1225
1226 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1227 return -EOVERFLOW;
1228 }
1229
1230 if (gid != GID_INVALID) {
1231 gid += (gid_t) arg_uid_shift;
1232
1233 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1234 return -EOVERFLOW;
1235 }
1236
1237 if (lchown(p, uid, gid) < 0)
1238 return -errno;
b12afc8c
LP
1239
1240 return 0;
1241}
1242
03cfe0d5
LP
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands it to
 * userns_lchown() with the given uid/gid.
 *
 * An already existing directory is left untouched (no chown) and reported as success. Returns 0 on
 * success, -errno if mkdir() fails for another reason, or the error from userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1255
e58a1277 1256static int setup_timezone(const char *dest) {
03cfe0d5
LP
1257 _cleanup_free_ char *p = NULL, *q = NULL;
1258 const char *where, *check, *what;
d4036145
LP
1259 char *z, *y;
1260 int r;
f8440af5 1261
e58a1277
LP
1262 assert(dest);
1263
1264 /* Fix the timezone, if possible */
d4036145
LP
1265 r = readlink_malloc("/etc/localtime", &p);
1266 if (r < 0) {
0b493a02
MP
1267 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1268 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1269 * with a symbolic link to a time zone data file.
0b493a02
MP
1270 *
1271 * Example:
21dc0227 1272 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1273 */
d4036145
LP
1274 return 0;
1275 }
1276
1277 z = path_startswith(p, "../usr/share/zoneinfo/");
1278 if (!z)
1279 z = path_startswith(p, "/usr/share/zoneinfo/");
1280 if (!z) {
1281 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1282 return 0;
1283 }
1284
03cfe0d5 1285 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1286 r = readlink_malloc(where, &q);
1287 if (r >= 0) {
1288 y = path_startswith(q, "../usr/share/zoneinfo/");
1289 if (!y)
1290 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1291
d4036145
LP
1292 /* Already pointing to the right place? Then do nothing .. */
1293 if (y && streq(y, z))
1294 return 0;
1295 }
1296
03cfe0d5 1297 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1298 check = prefix_roota(dest, check);
03cfe0d5 1299 if (laccess(check, F_OK) < 0) {
d4036145
LP
1300 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1301 return 0;
1302 }
68fb0892 1303
8ccf7e9e
LP
1304 if (unlink(where) < 0 && errno != ENOENT) {
1305 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1306 errno,
1307 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1308 return 0;
1309 }
4d9f07b4 1310
03cfe0d5 1311 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1312 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1313 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1314 errno,
1315 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1316 return 0;
1317 }
e58a1277 1318
03cfe0d5
LP
1319 r = userns_lchown(where, 0, 0);
1320 if (r < 0)
1321 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1322
e58a1277 1323 return 0;
88213476
LP
1324}
1325
7357272e 1326static int resolved_listening(void) {
b053cd5f 1327 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1328 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1329 int r;
1330
7357272e 1331 /* Check if resolved is listening */
b053cd5f
LP
1332
1333 r = sd_bus_open_system(&bus);
1334 if (r < 0)
1335 return r;
1336
7357272e
DM
1337 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1338 if (r <= 0)
1339 return r;
1340
1341 r = sd_bus_get_property_string(bus,
1342 "org.freedesktop.resolve1",
1343 "/org/freedesktop/resolve1",
1344 "org.freedesktop.resolve1.Manager",
1345 "DNSStubListener",
1346 NULL,
1347 &dns_stub_listener_mode);
1348 if (r < 0)
1349 return r;
1350
1351 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1352}
1353
2547bb41 1354static int setup_resolv_conf(const char *dest) {
87447ae4
LP
1355 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1356 const char *where;
1357 int r, found;
2547bb41
LP
1358
1359 assert(dest);
1360
1361 if (arg_private_network)
1362 return 0;
1363
87447ae4
LP
1364 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1365 if (r < 0) {
1366 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1367 return 0;
1368 }
1369
1370 where = strjoina(etc, "/resolv.conf");
1371 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1372 if (found < 0) {
1373 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1374 return 0;
1375 }
79d80fc1 1376
b053cd5f 1377 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
7357272e 1378 resolved_listening() > 0) {
87447ae4 1379
3539724c
LP
1380 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1381 * container, so that the container can use the host's resolver. Given that network namespacing is
1382 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1383 * advantage that the container will be able to follow the host's DNS server configuration changes
1384 * transparently. */
1385
87447ae4
LP
1386 if (found == 0) /* missing? */
1387 (void) touch(resolved);
5367354d 1388
87447ae4 1389 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
60e76d48 1390 if (r >= 0)
87447ae4 1391 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1392 }
1393
1394 /* If that didn't work, let's copy the file */
1c876927 1395 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1396 if (r < 0) {
3539724c
LP
1397 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1398 * resolved or something similar runs inside and the symlink points there.
68a313c5 1399 *
3539724c 1400 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1401 */
87447ae4 1402 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1403 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1404 return 0;
1405 }
2547bb41 1406
03cfe0d5
LP
1407 r = userns_lchown(where, 0, 0);
1408 if (r < 0)
3539724c 1409 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1410
2547bb41
LP
1411 return 0;
1412}
1413
04bc4a3f 1414static int setup_boot_id(const char *dest) {
3bbaff3e 1415 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1416 const char *from, *to;
04bc4a3f
LP
1417 int r;
1418
04bc4a3f
LP
1419 /* Generate a new randomized boot ID, so that each boot-up of
1420 * the container gets a new one */
1421
03cfe0d5
LP
1422 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1423 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1424
1425 r = sd_id128_randomize(&rnd);
f647962d
MS
1426 if (r < 0)
1427 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1428
15b1248a 1429 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1430 if (r < 0)
1431 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1432
60e76d48
ZJS
1433 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1434 if (r >= 0)
1435 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1436 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1437
3bbaff3e 1438 (void) unlink(from);
04bc4a3f
LP
1439 return r;
1440}
1441
e58a1277 1442static int copy_devnodes(const char *dest) {
88213476
LP
1443
1444 static const char devnodes[] =
1445 "null\0"
1446 "zero\0"
1447 "full\0"
1448 "random\0"
1449 "urandom\0"
85614d66
TG
1450 "tty\0"
1451 "net/tun\0";
88213476
LP
1452
1453 const char *d;
e58a1277 1454 int r = 0;
7fd1b19b 1455 _cleanup_umask_ mode_t u;
a258bf26
LP
1456
1457 assert(dest);
124640f1
LP
1458
1459 u = umask(0000);
88213476 1460
03cfe0d5
LP
1461 /* Create /dev/net, so that we can create /dev/net/tun in it */
1462 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1463 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1464
88213476 1465 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1466 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1467 struct stat st;
88213476 1468
7f112f50 1469 from = strappend("/dev/", d);
03cfe0d5 1470 to = prefix_root(dest, from);
88213476
LP
1471
1472 if (stat(from, &st) < 0) {
1473
4a62c710
MS
1474 if (errno != ENOENT)
1475 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1476
a258bf26 1477 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1478
03cfe0d5 1479 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1480 return -EIO;
a258bf26 1481
85614d66 1482 } else {
81f5049b 1483 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1484 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1485 if (errno == EEXIST)
8dbf71ec 1486 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1487 if (errno != EPERM)
1488 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1489
1490 /* Some systems abusively restrict mknod but
1491 * allow bind mounts. */
1492 r = touch(to);
1493 if (r < 0)
1494 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1495 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1496 if (r < 0)
1497 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1498 }
6278cf60 1499
03cfe0d5
LP
1500 r = userns_lchown(to, 0, 0);
1501 if (r < 0)
1502 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1503 }
88213476
LP
1504 }
1505
e58a1277
LP
1506 return r;
1507}
88213476 1508
03cfe0d5
LP
1509static int setup_pts(const char *dest) {
1510 _cleanup_free_ char *options = NULL;
1511 const char *p;
709f6e46 1512 int r;
03cfe0d5
LP
1513
1514#ifdef HAVE_SELINUX
1515 if (arg_selinux_apifs_context)
1516 (void) asprintf(&options,
3dce8915 1517 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1518 arg_uid_shift + TTY_GID,
1519 arg_selinux_apifs_context);
1520 else
1521#endif
1522 (void) asprintf(&options,
3dce8915 1523 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1524 arg_uid_shift + TTY_GID);
f2d88580 1525
03cfe0d5 1526 if (!options)
f2d88580
LP
1527 return log_oom();
1528
03cfe0d5 1529 /* Mount /dev/pts itself */
cc9fce65 1530 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1531 if (mkdir(p, 0755) < 0)
1532 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1533 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1534 if (r < 0)
1535 return r;
709f6e46
MS
1536 r = userns_lchown(p, 0, 0);
1537 if (r < 0)
1538 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1539
1540 /* Create /dev/ptmx symlink */
1541 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1542 if (symlink("pts/ptmx", p) < 0)
1543 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1544 r = userns_lchown(p, 0, 0);
1545 if (r < 0)
1546 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1547
03cfe0d5
LP
1548 /* And fix /dev/pts/ptmx ownership */
1549 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1550 r = userns_lchown(p, 0, 0);
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1553
f2d88580
LP
1554 return 0;
1555}
1556
e58a1277 1557static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1558 _cleanup_umask_ mode_t u;
1559 const char *to;
e58a1277 1560 int r;
e58a1277
LP
1561
1562 assert(dest);
1563 assert(console);
1564
1565 u = umask(0000);
1566
03cfe0d5 1567 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1568 if (r < 0)
1569 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1570
a258bf26
LP
1571 /* We need to bind mount the right tty to /dev/console since
1572 * ptys can only exist on pts file systems. To have something
81f5049b 1573 * to bind mount things on we create a empty regular file. */
a258bf26 1574
03cfe0d5 1575 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1576 r = touch(to);
1577 if (r < 0)
1578 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1579
60e76d48 1580 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1581}
1582
1583static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1584 const char *from, *to;
7fd1b19b 1585 _cleanup_umask_ mode_t u;
d9603714 1586 int fd, r;
e58a1277 1587
e58a1277 1588 assert(kmsg_socket >= 0);
a258bf26 1589
e58a1277 1590 u = umask(0000);
a258bf26 1591
03cfe0d5 1592 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1593 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1594 * on the reading side behave very similar to /proc/kmsg,
1595 * their writing side behaves differently from /dev/kmsg in
1596 * that writing blocks when nothing is reading. In order to
1597 * avoid any problems with containers deadlocking due to this
1598 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1599 from = prefix_roota(dest, "/run/kmsg");
1600 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1601
4a62c710 1602 if (mkfifo(from, 0600) < 0)
03cfe0d5 1603 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1604 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1605 if (r < 0)
1606 return r;
e58a1277
LP
1607
1608 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1609 if (fd < 0)
1610 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1611
e58a1277
LP
1612 /* Store away the fd in the socket, so that it stays open as
1613 * long as we run the child */
3ee897d6 1614 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1615 safe_close(fd);
e58a1277 1616
d9603714
DH
1617 if (r < 0)
1618 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1619
03cfe0d5
LP
1620 /* And now make the FIFO unavailable as /run/kmsg... */
1621 (void) unlink(from);
1622
25ea79fe 1623 return 0;
88213476
LP
1624}
1625
1c4baffc 1626static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1627 union in_addr_union *exposed = userdata;
1628
1629 assert(rtnl);
1630 assert(m);
1631 assert(exposed);
1632
7a8f6325 1633 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1634 return 0;
1635}
1636
3a74cea5 1637static int setup_hostname(void) {
3a74cea5 1638
0c582db0 1639 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1640 return 0;
1641
605f81a8 1642 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1643 return -errno;
3a74cea5 1644
7027ff61 1645 return 0;
3a74cea5
LP
1646}
1647
57fb9fb5 1648static int setup_journal(const char *directory) {
e01ff70a 1649 sd_id128_t this_id;
0f5e1382 1650 _cleanup_free_ char *d = NULL;
e01ff70a 1651 const char *p, *q;
8054d749 1652 bool try;
e01ff70a 1653 char id[33];
57fb9fb5
LP
1654 int r;
1655
df9a75e4
LP
1656 /* Don't link journals in ephemeral mode */
1657 if (arg_ephemeral)
1658 return 0;
1659
8054d749
LP
1660 if (arg_link_journal == LINK_NO)
1661 return 0;
1662
1663 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1664
4d680aee 1665 r = sd_id128_get_machine(&this_id);
f647962d
MS
1666 if (r < 0)
1667 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1668
e01ff70a 1669 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1670 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1671 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1672 if (try)
4d680aee 1673 return 0;
df9a75e4 1674 return -EEXIST;
4d680aee
ZJS
1675 }
1676
03cfe0d5
LP
1677 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1678 if (r < 0)
1679 return log_error_errno(r, "Failed to create /var: %m");
1680
1681 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1682 if (r < 0)
1683 return log_error_errno(r, "Failed to create /var/log: %m");
1684
1685 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1686 if (r < 0)
1687 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1688
e01ff70a
MS
1689 (void) sd_id128_to_string(arg_uuid, id);
1690
03cfe0d5
LP
1691 p = strjoina("/var/log/journal/", id);
1692 q = prefix_roota(directory, p);
27407a01 1693
e1873695 1694 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1695 if (try)
1696 return 0;
27407a01 1697
8054d749
LP
1698 log_error("%s: already a mount point, refusing to use for journal", p);
1699 return -EEXIST;
57fb9fb5
LP
1700 }
1701
e1873695 1702 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1703 if (try)
1704 return 0;
57fb9fb5 1705
8054d749
LP
1706 log_error("%s: already a mount point, refusing to use for journal", q);
1707 return -EEXIST;
57fb9fb5
LP
1708 }
1709
1710 r = readlink_and_make_absolute(p, &d);
1711 if (r >= 0) {
1712 if ((arg_link_journal == LINK_GUEST ||
1713 arg_link_journal == LINK_AUTO) &&
1714 path_equal(d, q)) {
1715
03cfe0d5 1716 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1717 if (r < 0)
709f6e46 1718 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1719 return 0;
57fb9fb5
LP
1720 }
1721
4a62c710
MS
1722 if (unlink(p) < 0)
1723 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1724 } else if (r == -EINVAL) {
1725
1726 if (arg_link_journal == LINK_GUEST &&
1727 rmdir(p) < 0) {
1728
27407a01
ZJS
1729 if (errno == ENOTDIR) {
1730 log_error("%s already exists and is neither a symlink nor a directory", p);
1731 return r;
4314d33f
MS
1732 } else
1733 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1734 }
4314d33f
MS
1735 } else if (r != -ENOENT)
1736 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1737
1738 if (arg_link_journal == LINK_GUEST) {
1739
1740 if (symlink(q, p) < 0) {
8054d749 1741 if (try) {
56f64d95 1742 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1743 return 0;
4314d33f
MS
1744 } else
1745 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1746 }
1747
03cfe0d5 1748 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1749 if (r < 0)
709f6e46 1750 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1751 return 0;
57fb9fb5
LP
1752 }
1753
1754 if (arg_link_journal == LINK_HOST) {
ccddd104 1755 /* don't create parents here — if the host doesn't have
574edc90 1756 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1757
1758 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1759 if (try) {
56f64d95 1760 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1761 return 0;
4314d33f
MS
1762 } else
1763 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1764 }
1765
27407a01
ZJS
1766 } else if (access(p, F_OK) < 0)
1767 return 0;
57fb9fb5 1768
cdb2b9d0
LP
1769 if (dir_is_empty(q) == 0)
1770 log_warning("%s is not empty, proceeding anyway.", q);
1771
03cfe0d5 1772 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1773 if (r < 0)
1774 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1775
60e76d48
ZJS
1776 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1777 if (r < 0)
4a62c710 1778 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1779
27407a01 1780 return 0;
57fb9fb5
LP
1781}
1782
88213476 1783static int drop_capabilities(void) {
520e0d54 1784 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1785}
1786
db999e0f
LP
1787static int reset_audit_loginuid(void) {
1788 _cleanup_free_ char *p = NULL;
1789 int r;
1790
0c582db0 1791 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1792 return 0;
1793
1794 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1795 if (r == -ENOENT)
db999e0f 1796 return 0;
f647962d
MS
1797 if (r < 0)
1798 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1799
1800 /* Already reset? */
1801 if (streq(p, "4294967295"))
1802 return 0;
1803
ad118bda 1804 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1805 if (r < 0) {
10a87006
LP
1806 log_error_errno(r,
1807 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1808 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1809 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1810 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1811 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1812
db999e0f 1813 sleep(5);
77b6e194 1814 }
db999e0f
LP
1815
1816 return 0;
77b6e194
LP
1817}
1818
24fb1112 1819
785890ac
LP
1820static int setup_propagate(const char *root) {
1821 const char *p, *q;
709f6e46 1822 int r;
785890ac
LP
1823
1824 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1825 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1826 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1827 (void) mkdir_p(p, 0600);
1828
709f6e46
MS
1829 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1830 if (r < 0)
1831 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1832
709f6e46
MS
1833 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1834 if (r < 0)
1835 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1836
709f6e46
MS
1837 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1840
03cfe0d5 1841 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1842 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1843 if (r < 0)
1844 return r;
785890ac 1845
60e76d48
ZJS
1846 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1847 if (r < 0)
1848 return r;
785890ac 1849
19caffac
AC
1850 /* machined will MS_MOVE into that directory, and that's only
1851 * supported for non-shared mounts. */
60e76d48 1852 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1853}
1854
317feb4d 1855static int setup_machine_id(const char *directory) {
691675ba
LP
1856 const char *etc_machine_id;
1857 sd_id128_t id;
3bbaff3e 1858 int r;
e01ff70a 1859
317feb4d
LP
1860 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1861 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1862 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1863 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1864 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1865 * container behaves nicely). */
1866
e01ff70a
MS
1867 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1868
691675ba 1869 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
1870 if (r < 0) {
1871 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1872 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 1873
317feb4d
LP
1874 if (sd_id128_is_null(arg_uuid)) {
1875 r = sd_id128_randomize(&arg_uuid);
1876 if (r < 0)
1877 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1878 }
1879 } else {
1880 if (sd_id128_is_null(id)) {
1881 log_error("Machine ID in container image is zero, refusing.");
1882 return -EINVAL;
1883 }
e01ff70a 1884
317feb4d
LP
1885 arg_uuid = id;
1886 }
691675ba 1887
e01ff70a
MS
1888 return 0;
1889}
1890
7336138e
LP
1891static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1892 int r;
1893
1894 assert(directory);
1895
0de7acce 1896 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
1897 return 0;
1898
1899 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1900 if (r == -EOPNOTSUPP)
1901 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1902 if (r == -EBADE)
1903 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1904 if (r < 0)
1905 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1906 if (r == 0)
1907 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1908 else
1909 log_debug("Patched directory tree to match UID/GID range.");
1910
1911 return r;
1912}
1913
113cea80 1914/*
6d416b9c
LS
1915 * Return values:
1916 * < 0 : wait_for_terminate() failed to get the state of the
1917 * container, the container was terminated by a signal, or
1918 * failed for an unknown reason. No change is made to the
1919 * container argument.
1920 * > 0 : The program executed in the container terminated with an
1921 * error. The exit code of the program executed in the
919699ec
LP
1922 * container is returned. The container argument has been set
1923 * to CONTAINER_TERMINATED.
6d416b9c
LS
1924 * 0 : The container is being rebooted, has been shut down or exited
1925 * successfully. The container argument has been set to either
1926 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 1927 *
6d416b9c
LS
1928 * That is, success is indicated by a return value of zero, and an
1929 * error is indicated by a non-zero value.
113cea80
DH
1930 */
1931static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 1932 siginfo_t status;
919699ec 1933 int r;
113cea80
DH
1934
1935 r = wait_for_terminate(pid, &status);
f647962d
MS
1936 if (r < 0)
1937 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
1938
1939 switch (status.si_code) {
fddbb89c 1940
113cea80 1941 case CLD_EXITED:
b5a2179b 1942 if (status.si_status == 0)
919699ec 1943 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 1944 else
919699ec 1945 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 1946
919699ec
LP
1947 *container = CONTAINER_TERMINATED;
1948 return status.si_status;
113cea80
DH
1949
1950 case CLD_KILLED:
1951 if (status.si_status == SIGINT) {
919699ec 1952 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 1953 *container = CONTAINER_TERMINATED;
919699ec
LP
1954 return 0;
1955
113cea80 1956 } else if (status.si_status == SIGHUP) {
919699ec 1957 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 1958 *container = CONTAINER_REBOOTED;
919699ec 1959 return 0;
113cea80 1960 }
919699ec 1961
ec251fe7 1962 /* fall through */
113cea80
DH
1963
1964 case CLD_DUMPED:
fddbb89c 1965 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 1966 return -EIO;
113cea80
DH
1967
1968 default:
fddbb89c 1969 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 1970 return -EIO;
113cea80 1971 }
113cea80
DH
1972}
1973
023fb90b
LP
1974static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1975 pid_t pid;
1976
4a0b58c4 1977 pid = PTR_TO_PID(userdata);
023fb90b 1978 if (pid > 0) {
c6c8f6e2 1979 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
1980 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1981 sd_event_source_set_userdata(s, NULL);
1982 return 0;
1983 }
1984 }
1985
1986 sd_event_exit(sd_event_source_get_event(s), 0);
1987 return 0;
1988}
1989
6916b164
AU
1990static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
1991 for (;;) {
1992 siginfo_t si = {};
1993 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
1994 return log_error_errno(errno, "Failed to waitid(): %m");
1995 if (si.si_pid == 0) /* No pending children. */
1996 break;
1997 if (si.si_pid == PTR_TO_PID(userdata)) {
1998 /* The main process we care for has exited. Return from
1999 * signal handler but leave the zombie. */
2000 sd_event_exit(sd_event_source_get_event(s), 0);
2001 break;
2002 }
2003 /* Reap all other children. */
2004 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2005 }
2006
2007 return 0;
2008}
2009
ec16945e 2010static int determine_names(void) {
1b9cebf6 2011 int r;
ec16945e 2012
c1521918
LP
2013 if (arg_template && !arg_directory && arg_machine) {
2014
2015 /* If --template= was specified then we should not
2016 * search for a machine, but instead create a new one
2017 * in /var/lib/machine. */
2018
605405c6 2019 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2020 if (!arg_directory)
2021 return log_oom();
2022 }
2023
ec16945e 2024 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2025 if (arg_machine) {
2026 _cleanup_(image_unrefp) Image *i = NULL;
2027
2028 r = image_find(arg_machine, &i);
2029 if (r < 0)
2030 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2031 if (r == 0) {
35bca925 2032 log_error("No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2033 return -ENOENT;
2034 }
2035
aceac2f0 2036 if (i->type == IMAGE_RAW)
0f03c2a4 2037 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2038 else
0f03c2a4 2039 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2040 if (r < 0)
0f3be6ca 2041 return log_oom();
1b9cebf6 2042
aee327b8
LP
2043 if (!arg_ephemeral)
2044 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2045 } else
ec16945e
LP
2046 arg_directory = get_current_dir_name();
2047
0f3be6ca 2048 if (!arg_directory && !arg_image) {
1b9cebf6 2049 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2050 return -EINVAL;
2051 }
2052 }
2053
2054 if (!arg_machine) {
4827ab48 2055
b9ba4dab
LP
2056 if (arg_directory && path_equal(arg_directory, "/"))
2057 arg_machine = gethostname_malloc();
4827ab48
LP
2058 else {
2059 if (arg_image) {
2060 char *e;
2061
2062 arg_machine = strdup(basename(arg_image));
2063
2064 /* Truncate suffix if there is one */
2065 e = endswith(arg_machine, ".raw");
2066 if (e)
2067 *e = 0;
2068 } else
2069 arg_machine = strdup(basename(arg_directory));
2070 }
ec16945e
LP
2071 if (!arg_machine)
2072 return log_oom();
2073
ae691c1d 2074 hostname_cleanup(arg_machine);
ec16945e
LP
2075 if (!machine_name_is_valid(arg_machine)) {
2076 log_error("Failed to determine machine name automatically, please use -M.");
2077 return -EINVAL;
2078 }
b9ba4dab
LP
2079
2080 if (arg_ephemeral) {
2081 char *b;
2082
2083 /* Add a random suffix when this is an
2084 * ephemeral machine, so that we can run many
2085 * instances at once without manually having
2086 * to specify -M each time. */
2087
2088 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2089 return log_oom();
2090
2091 free(arg_machine);
2092 arg_machine = b;
2093 }
ec16945e
LP
2094 }
2095
2096 return 0;
2097}
2098
/* Replaces the path stored in *p by the result of chase_symlinks() on it (called with a NULL root and
 * the given 'flags'). The old string is freed. A NULL *p is left alone.
 *
 * Returns 0 on success, or the logged negative errno-style value if resolving fails (in which case *p
 * is unchanged). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        free(*p);
        *p = resolved;

        return 0;
}
2117
03cfe0d5 2118static int determine_uid_shift(const char *directory) {
6dac160c
LP
2119 int r;
2120
0de7acce 2121 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2122 arg_uid_shift = 0;
6dac160c 2123 return 0;
03cfe0d5 2124 }
6dac160c
LP
2125
2126 if (arg_uid_shift == UID_INVALID) {
2127 struct stat st;
2128
03cfe0d5 2129 r = stat(directory, &st);
6dac160c 2130 if (r < 0)
03cfe0d5 2131 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2132
2133 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2134
2135 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2136 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2137 return -EINVAL;
2138 }
2139
2140 arg_uid_range = UINT32_C(0x10000);
2141 }
2142
2143 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2144 log_error("UID base too high for UID range.");
2145 return -EINVAL;
2146 }
2147
6dac160c
LP
2148 return 0;
2149}
2150
03cfe0d5
LP
2151static int inner_child(
2152 Barrier *barrier,
2153 const char *directory,
2154 bool secondary,
2155 int kmsg_socket,
2156 int rtnl_socket,
f757855e 2157 FDSet *fds) {
69c79d3c 2158
03cfe0d5 2159 _cleanup_free_ char *home = NULL;
e01ff70a 2160 char as_uuid[37];
6aadfa4c 2161 unsigned n_env = 1;
03cfe0d5
LP
2162 const char *envp[] = {
2163 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2164 NULL, /* container */
03cfe0d5
LP
2165 NULL, /* TERM */
2166 NULL, /* HOME */
2167 NULL, /* USER */
2168 NULL, /* LOGNAME */
2169 NULL, /* container_uuid */
2170 NULL, /* LISTEN_FDS */
2171 NULL, /* LISTEN_PID */
9c1e04d0 2172 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2173 NULL
2174 };
1a68e1e5 2175 const char *exec_target;
88213476 2176
2371271c 2177 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2178 int r;
88213476 2179
03cfe0d5
LP
2180 assert(barrier);
2181 assert(directory);
2182 assert(kmsg_socket >= 0);
88213476 2183
0de7acce 2184 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2185 /* Tell the parent, that it now can write the UID map. */
2186 (void) barrier_place(barrier); /* #1 */
7027ff61 2187
03cfe0d5
LP
2188 /* Wait until the parent wrote the UID map */
2189 if (!barrier_place_and_sync(barrier)) { /* #2 */
2190 log_error("Parent died too early");
2191 return -ESRCH;
2192 }
88213476
LP
2193 }
2194
6d66bd3b
EV
2195 r = reset_uid_gid();
2196 if (r < 0)
2197 return log_error_errno(r, "Couldn't become new root: %m");
2198
0de7acce 2199 r = mount_all(NULL,
4f086aab 2200 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2201 arg_uid_shift,
2202 arg_uid_range,
2203 arg_selinux_apifs_context);
2204
03cfe0d5
LP
2205 if (r < 0)
2206 return r;
2207
4f086aab 2208 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2209 if (r < 0)
2210 return r;
2211
03cfe0d5
LP
2212 /* Wait until we are cgroup-ified, so that we
2213 * can mount the right cgroup path writable */
2214 if (!barrier_place_and_sync(barrier)) { /* #3 */
2215 log_error("Parent died too early");
2216 return -ESRCH;
88213476
LP
2217 }
2218
5a8ff0e6 2219 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2220 r = unshare(CLONE_NEWCGROUP);
2221 if (r < 0)
2222 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2223 r = mount_cgroups(
2224 "",
2225 arg_unified_cgroup_hierarchy,
2226 arg_userns_mode != USER_NAMESPACE_NO,
2227 arg_uid_shift,
2228 arg_uid_range,
5a8ff0e6 2229 arg_selinux_apifs_context,
ada54120 2230 true);
0996ef00
CB
2231 if (r < 0)
2232 return r;
2233 } else {
2234 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2235 if (r < 0)
2236 return r;
2237 }
ec16945e 2238
03cfe0d5
LP
2239 r = setup_boot_id(NULL);
2240 if (r < 0)
2241 return r;
ec16945e 2242
03cfe0d5
LP
2243 r = setup_kmsg(NULL, kmsg_socket);
2244 if (r < 0)
2245 return r;
2246 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2247
03cfe0d5 2248 umask(0022);
30535c16 2249
03cfe0d5
LP
2250 if (setsid() < 0)
2251 return log_error_errno(errno, "setsid() failed: %m");
2252
2253 if (arg_private_network)
2254 loopback_setup();
2255
7a8f6325
LP
2256 if (arg_expose_ports) {
2257 r = expose_port_send_rtnl(rtnl_socket);
2258 if (r < 0)
2259 return r;
2260 rtnl_socket = safe_close(rtnl_socket);
2261 }
03cfe0d5 2262
709f6e46
MS
2263 r = drop_capabilities();
2264 if (r < 0)
2265 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2266
2267 setup_hostname();
2268
050f7277 2269 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2270 if (personality(arg_personality) < 0)
2271 return log_error_errno(errno, "personality() failed: %m");
2272 } else if (secondary) {
2273 if (personality(PER_LINUX32) < 0)
2274 return log_error_errno(errno, "personality() failed: %m");
2275 }
2276
2277#ifdef HAVE_SELINUX
2278 if (arg_selinux_context)
2ed96880 2279 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2280 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2281#endif
2282
ee645080 2283 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2284 if (r < 0)
2285 return r;
2286
6aadfa4c
ILG
2287 /* LXC sets container=lxc, so follow the scheme here */
2288 envp[n_env++] = strjoina("container=", arg_container_service_name);
2289
03cfe0d5
LP
2290 envp[n_env] = strv_find_prefix(environ, "TERM=");
2291 if (envp[n_env])
313cefa1 2292 n_env++;
03cfe0d5
LP
2293
2294 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2295 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2296 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2297 return log_oom();
2298
3bbaff3e 2299 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2300
691675ba 2301 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2302 return log_oom();
03cfe0d5
LP
2303
2304 if (fdset_size(fds) > 0) {
2305 r = fdset_cloexec(fds, false);
2306 if (r < 0)
2307 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2308
2309 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2310 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2311 return log_oom();
2312 }
9c1e04d0
AP
2313 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2314 return log_oom();
03cfe0d5 2315
2371271c
TG
2316 env_use = strv_env_merge(2, envp, arg_setenv);
2317 if (!env_use)
2318 return log_oom();
03cfe0d5
LP
2319
2320 /* Let the parent know that we are ready and
2321 * wait until the parent is ready with the
2322 * setup, too... */
2323 if (!barrier_place_and_sync(barrier)) { /* #4 */
2324 log_error("Parent died too early");
2325 return -ESRCH;
2326 }
2327
5f932eb9
LP
2328 if (arg_chdir)
2329 if (chdir(arg_chdir) < 0)
2330 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2331
7732f92b 2332 if (arg_start_mode == START_PID2) {
75bf701f 2333 r = stub_pid1(arg_uuid);
7732f92b
LP
2334 if (r < 0)
2335 return r;
2336 }
2337
03cfe0d5
LP
2338 /* Now, explicitly close the log, so that we
2339 * then can close all remaining fds. Closing
2340 * the log explicitly first has the benefit
2341 * that the logging subsystem knows about it,
2342 * and is thus ready to be reopened should we
2343 * need it again. Note that the other fds
2344 * closed here are at least the locking and
2345 * barrier fds. */
2346 log_close();
2347 (void) fdset_close_others(fds);
2348
7732f92b 2349 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2350 char **a;
2351 size_t m;
2352
2353 /* Automatically search for the init system */
2354
75f32f04
ZJS
2355 m = strv_length(arg_parameters);
2356 a = newa(char*, m + 2);
2357 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2358 a[1 + m] = NULL;
03cfe0d5 2359
ced58da7 2360 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2361 execve(a[0], a, env_use);
2362
ced58da7 2363 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2364 execve(a[0], a, env_use);
2365
ced58da7 2366 a[0] = (char*) "/sbin/init";
03cfe0d5 2367 execve(a[0], a, env_use);
ced58da7
LP
2368
2369 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5
PW
2370 } else if (!strv_isempty(arg_parameters)) {
2371 exec_target = arg_parameters[0];
f757855e 2372 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2373 } else {
5f932eb9 2374 if (!arg_chdir)
d929b0f9
ZJS
2375 /* If we cannot change the directory, we'll end up in /, that is expected. */
2376 (void) chdir(home ?: "/root");
5f932eb9 2377
03cfe0d5
LP
2378 execle("/bin/bash", "-bash", NULL, env_use);
2379 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2380
2381 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2382 }
2383
35607a8d 2384 r = -errno;
03cfe0d5 2385 (void) log_open();
1a68e1e5 2386 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2387}
2388
9c1e04d0
AP
2389static int setup_sd_notify_child(void) {
2390 static const int one = 1;
2391 int fd = -1;
2392 union sockaddr_union sa = {
2393 .sa.sa_family = AF_UNIX,
2394 };
2395 int r;
2396
2397 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2398 if (fd < 0)
2399 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2400
2401 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2402 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2403
2404 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2405 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2406 if (r < 0) {
2407 safe_close(fd);
2408 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2409 }
2410
adc7d9f0
EV
2411 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2412 if (r < 0) {
2413 safe_close(fd);
2414 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2415 }
2416
9c1e04d0
AP
2417 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2418 if (r < 0) {
2419 safe_close(fd);
2420 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2421 }
2422
2423 return fd;
2424}
2425
03cfe0d5
LP
2426static int outer_child(
2427 Barrier *barrier,
2428 const char *directory,
2429 const char *console,
2d845785 2430 DissectedImage *dissected_image,
03cfe0d5
LP
2431 bool interactive,
2432 bool secondary,
2433 int pid_socket,
e01ff70a 2434 int uuid_socket,
9c1e04d0 2435 int notify_socket,
03cfe0d5
LP
2436 int kmsg_socket,
2437 int rtnl_socket,
825d5287 2438 int uid_shift_socket,
f757855e 2439 FDSet *fds) {
03cfe0d5
LP
2440
2441 pid_t pid;
2442 ssize_t l;
2443 int r;
9c1e04d0 2444 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2445
2446 assert(barrier);
2447 assert(directory);
2448 assert(console);
2449 assert(pid_socket >= 0);
e01ff70a 2450 assert(uuid_socket >= 0);
9c1e04d0 2451 assert(notify_socket >= 0);
03cfe0d5
LP
2452 assert(kmsg_socket >= 0);
2453
2454 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2455 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2456
2457 if (interactive) {
2458 close_nointr(STDIN_FILENO);
2459 close_nointr(STDOUT_FILENO);
2460 close_nointr(STDERR_FILENO);
2461
2462 r = open_terminal(console, O_RDWR);
2463 if (r != STDIN_FILENO) {
2464 if (r >= 0) {
2465 safe_close(r);
2466 r = -EINVAL;
2467 }
2468
2469 return log_error_errno(r, "Failed to open console: %m");
2470 }
2471
2472 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2473 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2474 return log_error_errno(errno, "Failed to duplicate console: %m");
2475 }
2476
2477 r = reset_audit_loginuid();
2478 if (r < 0)
2479 return r;
2480
2481 /* Mark everything as slave, so that we still
2482 * receive mounts from the real root, but don't
2483 * propagate mounts to the real root. */
60e76d48
ZJS
2484 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2485 if (r < 0)
2486 return r;
03cfe0d5 2487
2d845785 2488 if (dissected_image) {
18b5886e 2489 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2490 if (r < 0)
2491 return r;
2492 }
03cfe0d5 2493
391567f4
LP
2494 r = determine_uid_shift(directory);
2495 if (r < 0)
2496 return r;
2497
0de7acce 2498 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2499 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2500 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2501 if (l < 0)
2502 return log_error_errno(errno, "Failed to send UID shift: %m");
2503 if (l != sizeof(arg_uid_shift)) {
2504 log_error("Short write while sending UID shift.");
2505 return -EIO;
2506 }
0e7ac751 2507
0de7acce 2508 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2509 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2510 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2511 * not it will pick a different one, and send it back to us. */
2512
2513 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2514 if (l < 0)
2515 return log_error_errno(errno, "Failed to recv UID shift: %m");
2516 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2517 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2518 return -EIO;
2519 }
2520 }
2521
2522 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2523 }
2524
03cfe0d5 2525 /* Turn directory into bind mount */
60e76d48
ZJS
2526 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2527 if (r < 0)
2528 return r;
03cfe0d5 2529
b53ede69
PW
2530 r = setup_pivot_root(
2531 directory,
2532 arg_pivot_root_new,
2533 arg_pivot_root_old);
2534 if (r < 0)
2535 return r;
2536
0de7acce
LP
2537 r = setup_volatile(
2538 directory,
2539 arg_volatile_mode,
2540 arg_userns_mode != USER_NAMESPACE_NO,
2541 arg_uid_shift,
2542 arg_uid_range,
2543 arg_selinux_context);
03cfe0d5
LP
2544 if (r < 0)
2545 return r;
2546
0de7acce
LP
2547 r = setup_volatile_state(
2548 directory,
2549 arg_volatile_mode,
2550 arg_userns_mode != USER_NAMESPACE_NO,
2551 arg_uid_shift,
2552 arg_uid_range,
2553 arg_selinux_context);
03cfe0d5
LP
2554 if (r < 0)
2555 return r;
2556
4ad14eff
LP
2557 /* Mark everything as shared so our mounts get propagated down. This is
2558 * required to make new bind mounts available in systemd services
2559 * inside the containter that create a new mount namespace.
2560 * See https://github.com/systemd/systemd/issues/3860
2561 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2562 * shared propagation mode. */
4ad14eff
LP
2563 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2564 if (r < 0)
2565 return r;
2566
2567 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2568 if (r < 0)
2569 return r;
2570
03cfe0d5
LP
2571 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2572 if (r < 0)
2573 return r;
2574
03cfe0d5 2575 if (arg_read_only) {
6b7c9f8b 2576 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2577 if (r < 0)
2578 return log_error_errno(r, "Failed to make tree read-only: %m");
2579 }
2580
0de7acce 2581 r = mount_all(directory,
4f086aab 2582 arg_mount_settings,
0de7acce
LP
2583 arg_uid_shift,
2584 arg_uid_range,
2585 arg_selinux_apifs_context);
03cfe0d5
LP
2586 if (r < 0)
2587 return r;
2588
07fa00f9
LP
2589 r = copy_devnodes(directory);
2590 if (r < 0)
03cfe0d5
LP
2591 return r;
2592
2593 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2594
07fa00f9
LP
2595 r = setup_pts(directory);
2596 if (r < 0)
03cfe0d5
LP
2597 return r;
2598
2599 r = setup_propagate(directory);
2600 if (r < 0)
2601 return r;
2602
2603 r = setup_dev_console(directory, console);
2604 if (r < 0)
2605 return r;
2606
520e0d54 2607 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
2608 if (r < 0)
2609 return r;
2610
2611 r = setup_timezone(directory);
2612 if (r < 0)
2613 return r;
2614
2615 r = setup_resolv_conf(directory);
2616 if (r < 0)
2617 return r;
2618
e01ff70a
MS
2619 r = setup_machine_id(directory);
2620 if (r < 0)
2621 return r;
2622
03cfe0d5
LP
2623 r = setup_journal(directory);
2624 if (r < 0)
2625 return r;
2626
0de7acce
LP
2627 r = mount_custom(
2628 directory,
2629 arg_custom_mounts,
2630 arg_n_custom_mounts,
2631 arg_userns_mode != USER_NAMESPACE_NO,
2632 arg_uid_shift,
2633 arg_uid_range,
2634 arg_selinux_apifs_context);
03cfe0d5
LP
2635 if (r < 0)
2636 return r;
2637
5a8ff0e6 2638 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2639 r = mount_cgroups(
2640 directory,
2641 arg_unified_cgroup_hierarchy,
2642 arg_userns_mode != USER_NAMESPACE_NO,
2643 arg_uid_shift,
2644 arg_uid_range,
5a8ff0e6 2645 arg_selinux_apifs_context,
ada54120 2646 false);
0996ef00
CB
2647 if (r < 0)
2648 return r;
2649 }
03cfe0d5
LP
2650
2651 r = mount_move_root(directory);
2652 if (r < 0)
2653 return log_error_errno(r, "Failed to move root directory: %m");
2654
9c1e04d0
AP
2655 fd = setup_sd_notify_child();
2656 if (fd < 0)
2657 return fd;
2658
03cfe0d5 2659 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2660 arg_clone_ns_flags |
03cfe0d5 2661 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2662 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2663 if (pid < 0)
2664 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2665 if (pid == 0) {
2666 pid_socket = safe_close(pid_socket);
e01ff70a 2667 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2668 notify_socket = safe_close(notify_socket);
825d5287 2669 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2670
2671 /* The inner child has all namespaces that are
2672 * requested, so that we all are owned by the user if
2673 * user namespaces are turned on. */
2674
f757855e 2675 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2676 if (r < 0)
2677 _exit(EXIT_FAILURE);
2678
2679 _exit(EXIT_SUCCESS);
2680 }
2681
2682 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2683 if (l < 0)
2684 return log_error_errno(errno, "Failed to send PID: %m");
2685 if (l != sizeof(pid)) {
2686 log_error("Short write while sending PID.");
2687 return -EIO;
2688 }
2689
e01ff70a
MS
2690 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2691 if (l < 0)
2692 return log_error_errno(errno, "Failed to send machine ID: %m");
2693 if (l != sizeof(arg_uuid)) {
2694 log_error("Short write while sending machine ID.");
2695 return -EIO;
2696 }
2697
9c1e04d0
AP
2698 l = send_one_fd(notify_socket, fd, 0);
2699 if (l < 0)
2700 return log_error_errno(errno, "Failed to send notify fd: %m");
2701
03cfe0d5 2702 pid_socket = safe_close(pid_socket);
e01ff70a 2703 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2704 notify_socket = safe_close(notify_socket);
327e26d6
KN
2705 kmsg_socket = safe_close(kmsg_socket);
2706 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2707
2708 return 0;
2709}
2710
0e7ac751
LP
2711static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2712 unsigned n_tries = 100;
2713 uid_t candidate;
2714 int r;
2715
2716 assert(shift);
2717 assert(ret_lock_file);
0de7acce 2718 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2719 assert(arg_uid_range == 0x10000U);
2720
2721 candidate = *shift;
2722
2723 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2724
2725 for (;;) {
2726 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2727 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2728
2729 if (--n_tries <= 0)
2730 return -EBUSY;
2731
2732 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2733 goto next;
2734 if ((candidate & UINT32_C(0xFFFF)) != 0)
2735 goto next;
2736
2737 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2738 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2739 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2740 goto next;
2741 if (r < 0)
2742 return r;
2743
2744 /* Make some superficial checks whether the range is currently known in the user database */
2745 if (getpwuid(candidate))
2746 goto next;
2747 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2748 goto next;
2749 if (getgrgid(candidate))
2750 goto next;
2751 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2752 goto next;
2753
2754 *ret_lock_file = lf;
2755 lf = (struct LockFile) LOCK_FILE_INIT;
2756 *shift = candidate;
2757 return 0;
2758
2759 next:
2760 random_bytes(&candidate, sizeof(candidate));
2761 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2762 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2763 }
2764}
2765
03cfe0d5
LP
2766static int setup_uid_map(pid_t pid) {
2767 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2768 int r;
2769
2770 assert(pid > 1);
2771
2772 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2773 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2774 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2775 if (r < 0)
2776 return log_error_errno(r, "Failed to write UID map: %m");
2777
2778 /* We always assign the same UID and GID ranges */
2779 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2780 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2781 if (r < 0)
2782 return log_error_errno(r, "Failed to write GID map: %m");
2783
2784 return 0;
2785}
2786
9c1e04d0 2787static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
2788 char buf[NOTIFY_BUFFER_MAX+1];
2789 char *p = NULL;
2790 struct iovec iovec = {
2791 .iov_base = buf,
2792 .iov_len = sizeof(buf)-1,
2793 };
2794 union {
2795 struct cmsghdr cmsghdr;
2796 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2797 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2798 } control = {};
2799 struct msghdr msghdr = {
2800 .msg_iov = &iovec,
2801 .msg_iovlen = 1,
2802 .msg_control = &control,
2803 .msg_controllen = sizeof(control),
2804 };
2805 struct cmsghdr *cmsg;
2806 struct ucred *ucred = NULL;
2807 ssize_t n;
2808 pid_t inner_child_pid;
2809 _cleanup_strv_free_ char **tags = NULL;
2810
2811 assert(userdata);
2812
2813 inner_child_pid = PTR_TO_PID(userdata);
2814
2815 if (revents != EPOLLIN) {
2816 log_warning("Got unexpected poll event for notify fd.");
2817 return 0;
2818 }
2819
2820 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2821 if (n < 0) {
2822 if (errno == EAGAIN || errno == EINTR)
2823 return 0;
2824
2825 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2826 }
2827 cmsg_close_all(&msghdr);
2828
2829 CMSG_FOREACH(cmsg, &msghdr) {
2830 if (cmsg->cmsg_level == SOL_SOCKET &&
2831 cmsg->cmsg_type == SCM_CREDENTIALS &&
2832 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2833
2834 ucred = (struct ucred*) CMSG_DATA(cmsg);
2835 }
2836 }
2837
2838 if (!ucred || ucred->pid != inner_child_pid) {
2839 log_warning("Received notify message without valid credentials. Ignoring.");
2840 return 0;
2841 }
2842
2843 if ((size_t) n >= sizeof(buf)) {
2844 log_warning("Received notify message exceeded maximum size. Ignoring.");
2845 return 0;
2846 }
2847
2848 buf[n] = 0;
2849 tags = strv_split(buf, "\n\r");
2850 if (!tags)
2851 return log_oom();
2852
2853 if (strv_find(tags, "READY=1"))
2854 sd_notifyf(false, "READY=1\n");
2855
2856 p = strv_find_startswith(tags, "STATUS=");
2857 if (p)
2858 sd_notifyf(false, "STATUS=Container running: %s", p);
2859
2860 return 0;
2861}
2862
5773024d 2863static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 2864 int r;
9c1e04d0 2865
5773024d 2866 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
2867 if (r < 0)
2868 return log_error_errno(r, "Failed to allocate notify event source: %m");
2869
5773024d 2870 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
2871
2872 return 0;
2873}
2874
f757855e
LP
2875static int load_settings(void) {
2876 _cleanup_(settings_freep) Settings *settings = NULL;
2877 _cleanup_fclose_ FILE *f = NULL;
2878 _cleanup_free_ char *p = NULL;
2879 const char *fn, *i;
2880 int r;
2881
2882 /* If all settings are masked, there's no point in looking for
2883 * the settings file */
2884 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2885 return 0;
2886
2887 fn = strjoina(arg_machine, ".nspawn");
2888
2889 /* We first look in the admin's directories in /etc and /run */
2890 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2891 _cleanup_free_ char *j = NULL;
2892
605405c6 2893 j = strjoin(i, "/", fn);
f757855e
LP
2894 if (!j)
2895 return log_oom();
2896
2897 f = fopen(j, "re");
2898 if (f) {
2899 p = j;
2900 j = NULL;
2901
b938cb90 2902 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2903 if (arg_settings_trusted < 0)
2904 arg_settings_trusted = true;
2905
2906 break;
2907 }
2908
2909 if (errno != ENOENT)
2910 return log_error_errno(errno, "Failed to open %s: %m", j);
2911 }
2912
2913 if (!f) {
2914 /* After that, let's look for a file next to the
2915 * actual image we shall boot. */
2916
2917 if (arg_image) {
2918 p = file_in_same_dir(arg_image, fn);
2919 if (!p)
2920 return log_oom();
2921 } else if (arg_directory) {
2922 p = file_in_same_dir(arg_directory, fn);
2923 if (!p)
2924 return log_oom();
2925 }
2926
2927 if (p) {
2928 f = fopen(p, "re");
2929 if (!f && errno != ENOENT)
2930 return log_error_errno(errno, "Failed to open %s: %m", p);
2931
b938cb90 2932 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2933 if (arg_settings_trusted < 0)
2934 arg_settings_trusted = false;
2935 }
2936 }
2937
2938 if (!f)
2939 return 0;
2940
2941 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2942
2943 r = settings_load(f, p, &settings);
2944 if (r < 0)
2945 return r;
2946
2947 /* Copy over bits from the settings, unless they have been
2948 * explicitly masked by command line switches. */
2949
7732f92b
LP
2950 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2951 settings->start_mode >= 0) {
2952 arg_start_mode = settings->start_mode;
f757855e
LP
2953
2954 strv_free(arg_parameters);
2955 arg_parameters = settings->parameters;
2956 settings->parameters = NULL;
2957 }
2958
b53ede69
PW
2959 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
2960 settings->pivot_root_new) {
2961 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
2962 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
2963 }
2964
5f932eb9
LP
2965 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2966 settings->working_directory) {
2967 free(arg_chdir);
2968 arg_chdir = settings->working_directory;
2969 settings->working_directory = NULL;
2970 }
2971
f757855e
LP
2972 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2973 settings->environment) {
2974 strv_free(arg_setenv);
2975 arg_setenv = settings->environment;
2976 settings->environment = NULL;
2977 }
2978
2979 if ((arg_settings_mask & SETTING_USER) == 0 &&
2980 settings->user) {
2981 free(arg_user);
2982 arg_user = settings->user;
2983 settings->user = NULL;
2984 }
2985
2986 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2987 uint64_t plus;
f757855e 2988
0e265674
LP
2989 plus = settings->capability;
2990 if (settings_private_network(settings))
2991 plus |= (1ULL << CAP_NET_ADMIN);
2992
2993 if (!arg_settings_trusted && plus != 0) {
2994 if (settings->capability != 0)
2995 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2996 } else
520e0d54 2997 arg_caps_retain |= plus;
f757855e 2998
520e0d54 2999 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3000 }
3001
3002 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3003 settings->kill_signal > 0)
3004 arg_kill_signal = settings->kill_signal;
3005
3006 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3007 settings->personality != PERSONALITY_INVALID)
3008 arg_personality = settings->personality;
3009
3010 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3011 !sd_id128_is_null(settings->machine_id)) {
3012
3013 if (!arg_settings_trusted)
3014 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3015 else
3016 arg_uuid = settings->machine_id;
3017 }
3018
3019 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3020 settings->read_only >= 0)
3021 arg_read_only = settings->read_only;
3022
3023 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3024 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3025 arg_volatile_mode = settings->volatile_mode;
3026
3027 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3028 settings->n_custom_mounts > 0) {
3029
3030 if (!arg_settings_trusted)
3031 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3032 else {
3033 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3034 arg_custom_mounts = settings->custom_mounts;
3035 arg_n_custom_mounts = settings->n_custom_mounts;
3036
3037 settings->custom_mounts = NULL;
3038 settings->n_custom_mounts = 0;
3039 }
3040 }
3041
3042 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3043 (settings->private_network >= 0 ||
3044 settings->network_veth >= 0 ||
3045 settings->network_bridge ||
22b28dfd 3046 settings->network_zone ||
f757855e
LP
3047 settings->network_interfaces ||
3048 settings->network_macvlan ||
f6d6bad1
LP
3049 settings->network_ipvlan ||
3050 settings->network_veth_extra)) {
f757855e
LP
3051
3052 if (!arg_settings_trusted)
3053 log_warning("Ignoring network settings, file %s is not trusted.", p);
3054 else {
f6d6bad1 3055 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3056 arg_private_network = settings_private_network(settings);
3057
f757855e
LP
3058 strv_free(arg_network_interfaces);
3059 arg_network_interfaces = settings->network_interfaces;
3060 settings->network_interfaces = NULL;
3061
3062 strv_free(arg_network_macvlan);
3063 arg_network_macvlan = settings->network_macvlan;
3064 settings->network_macvlan = NULL;
3065
3066 strv_free(arg_network_ipvlan);
3067 arg_network_ipvlan = settings->network_ipvlan;
3068 settings->network_ipvlan = NULL;
3069
f6d6bad1
LP
3070 strv_free(arg_network_veth_extra);
3071 arg_network_veth_extra = settings->network_veth_extra;
3072 settings->network_veth_extra = NULL;
3073
f757855e
LP
3074 free(arg_network_bridge);
3075 arg_network_bridge = settings->network_bridge;
3076 settings->network_bridge = NULL;
22b28dfd
LP
3077
3078 free(arg_network_zone);
3079 arg_network_zone = settings->network_zone;
3080 settings->network_zone = NULL;
f757855e
LP
3081 }
3082 }
3083
3084 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3085 settings->expose_ports) {
3086
3087 if (!arg_settings_trusted)
3088 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3089 else {
3090 expose_port_free_all(arg_expose_ports);
3091 arg_expose_ports = settings->expose_ports;
3092 settings->expose_ports = NULL;
3093 }
3094 }
3095
0de7acce
LP
3096 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3097 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3098
3099 if (!arg_settings_trusted)
3100 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3101 else {
3102 arg_userns_mode = settings->userns_mode;
3103 arg_uid_shift = settings->uid_shift;
3104 arg_uid_range = settings->uid_range;
3105 arg_userns_chown = settings->userns_chown;
3106 }
3107 }
3108
9c1e04d0
AP
3109 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3110 arg_notify_ready = settings->notify_ready;
3111
f757855e
LP
3112 return 0;
3113}
3114
b0067625
ZJS
3115static int run(int master,
3116 const char* console,
2d845785 3117 DissectedImage *dissected_image,
b0067625
ZJS
3118 bool interactive,
3119 bool secondary,
3120 FDSet *fds,
3121 char veth_name[IFNAMSIZ], bool *veth_created,
3122 union in_addr_union *exposed,
3123 pid_t *pid, int *ret) {
3124
3125 static const struct sigaction sa = {
3126 .sa_handler = nop_signal_handler,
e28c7cd0 3127 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3128 };
3129
3130 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3131 _cleanup_close_ int etc_passwd_lock = -1;
3132 _cleanup_close_pair_ int
3133 kmsg_socket_pair[2] = { -1, -1 },
3134 rtnl_socket_pair[2] = { -1, -1 },
3135 pid_socket_pair[2] = { -1, -1 },
3136 uuid_socket_pair[2] = { -1, -1 },
3137 notify_socket_pair[2] = { -1, -1 },
3138 uid_shift_socket_pair[2] = { -1, -1 };
3139 _cleanup_close_ int notify_socket= -1;
3140 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3141 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3142 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3143 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3144 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3145 ContainerStatus container_status = 0;
3146 char last_char = 0;
3147 int ifi = 0, r;
3148 ssize_t l;
3149 sigset_t mask_chld;
3150
3151 assert_se(sigemptyset(&mask_chld) == 0);
3152 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3153
3154 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3155 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3156 * check with getpwuid() if the specific user already exists. Note that /etc might be
3157 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3158 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3159 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3160 * really ours. */
3161
3162 etc_passwd_lock = take_etc_passwd_lock(NULL);
3163 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3164 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3165 }
3166
3167 r = barrier_create(&barrier);
3168 if (r < 0)
3169 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3170
3171 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3172 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3173
3174 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3175 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3176
3177 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3178 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3179
3180 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3181 return log_error_errno(errno, "Failed to create id socket pair: %m");
3182
3183 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3184 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3185
3186 if (arg_userns_mode != USER_NAMESPACE_NO)
3187 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3188 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3189
3190 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3191 * parent's blocking calls and give it a chance to call wait() and terminate. */
3192 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3193 if (r < 0)
3194 return log_error_errno(errno, "Failed to change the signal mask: %m");
3195
3196 r = sigaction(SIGCHLD, &sa, NULL);
3197 if (r < 0)
3198 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3199
3200 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3201 if (*pid < 0)
3202 return log_error_errno(errno, "clone() failed%s: %m",
3203 errno == EINVAL ?
3204 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3205
3206 if (*pid == 0) {
3207 /* The outer child only has a file system namespace. */
3208 barrier_set_role(&barrier, BARRIER_CHILD);
3209
3210 master = safe_close(master);
3211
3212 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3213 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3214 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3215 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3216 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3217 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3218
3219 (void) reset_all_signal_handlers();
3220 (void) reset_signal_mask();
3221
3222 r = outer_child(&barrier,
3223 arg_directory,
3224 console,
2d845785 3225 dissected_image,
b0067625
ZJS
3226 interactive,
3227 secondary,
3228 pid_socket_pair[1],
3229 uuid_socket_pair[1],
3230 notify_socket_pair[1],
3231 kmsg_socket_pair[1],
3232 rtnl_socket_pair[1],
3233 uid_shift_socket_pair[1],
3234 fds);
3235 if (r < 0)
3236 _exit(EXIT_FAILURE);
3237
3238 _exit(EXIT_SUCCESS);
3239 }
3240
3241 barrier_set_role(&barrier, BARRIER_PARENT);
3242
3243 fds = fdset_free(fds);
3244
3245 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3246 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3247 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3248 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3249 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3250 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3251
3252 if (arg_userns_mode != USER_NAMESPACE_NO) {
3253 /* The child just let us know the UID shift it might have read from the image. */
3254 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3255 if (l < 0)
3256 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3257 if (l != sizeof arg_uid_shift) {
3258 log_error("Short read while reading UID shift.");
3259 return -EIO;
3260 }
3261
3262 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3263 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3264 * image, but if that's already in use, pick a new one, and report back to the child,
3265 * which one we now picked. */
3266
3267 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3268 if (r < 0)
3269 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3270
3271 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3272 if (l < 0)
3273 return log_error_errno(errno, "Failed to send UID shift: %m");
3274 if (l != sizeof arg_uid_shift) {
3275 log_error("Short write while writing UID shift.");
3276 return -EIO;
3277 }
3278 }
3279 }
3280
3281 /* Wait for the outer child. */
3282 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3283 if (r != 0)
3284 return r < 0 ? r : -EIO;
3285
3286 /* And now retrieve the PID of the inner child. */
3287 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3288 if (l < 0)
3289 return log_error_errno(errno, "Failed to read inner child PID: %m");
3290 if (l != sizeof *pid) {
3291 log_error("Short read while reading inner child PID.");
3292 return -EIO;
3293 }
3294
3295 /* We also retrieve container UUID in case it was generated by outer child */
3296 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3297 if (l < 0)
3298 return log_error_errno(errno, "Failed to read container machine ID: %m");
3299 if (l != sizeof(arg_uuid)) {
3300 log_error("Short read while reading container machined ID.");
3301 return -EIO;
3302 }
3303
3304 /* We also retrieve the socket used for notifications generated by outer child */
3305 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3306 if (notify_socket < 0)
3307 return log_error_errno(notify_socket,
3308 "Failed to receive notification socket from the outer child: %m");
3309
3310 log_debug("Init process invoked as PID "PID_FMT, *pid);
3311
3312 if (arg_userns_mode != USER_NAMESPACE_NO) {
3313 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3314 log_error("Child died too early.");
3315 return -ESRCH;
3316 }
3317
3318 r = setup_uid_map(*pid);
3319 if (r < 0)
3320 return r;
3321
3322 (void) barrier_place(&barrier); /* #2 */
3323 }
3324
3325 if (arg_private_network) {
3326
3327 r = move_network_interfaces(*pid, arg_network_interfaces);
3328 if (r < 0)
3329 return r;
3330
3331 if (arg_network_veth) {
3332 r = setup_veth(arg_machine, *pid, veth_name,
3333 arg_network_bridge || arg_network_zone);
3334 if (r < 0)
3335 return r;
3336 else if (r > 0)
3337 ifi = r;
3338
3339 if (arg_network_bridge) {
3340 /* Add the interface to a bridge */
3341 r = setup_bridge(veth_name, arg_network_bridge, false);
3342 if (r < 0)
3343 return r;
3344 if (r > 0)
3345 ifi = r;
3346 } else if (arg_network_zone) {
3347 /* Add the interface to a bridge, possibly creating it */
3348 r = setup_bridge(veth_name, arg_network_zone, true);
3349 if (r < 0)
3350 return r;
3351 if (r > 0)
3352 ifi = r;
3353 }
3354 }
3355
3356 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3357 if (r < 0)
3358 return r;
3359
3360 /* We created the primary and extra veth links now; let's remember this, so that we know to
3361 remove them later on. Note that we don't bother with removing veth links that were created
3362 here when their setup failed half-way, because in that case the kernel should be able to
3363 remove them on its own, since they cannot be referenced by anything yet. */
3364 *veth_created = true;
3365
3366 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3367 if (r < 0)
3368 return r;
3369
3370 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3371 if (r < 0)
3372 return r;
3373 }
3374
3375 if (arg_register) {
3376 r = register_machine(
3377 arg_machine,
3378 *pid,
3379 arg_directory,
3380 arg_uuid,
3381 ifi,
3382 arg_slice,
3383 arg_custom_mounts, arg_n_custom_mounts,
3384 arg_kill_signal,
3385 arg_property,
3386 arg_keep_unit,
3387 arg_container_service_name);
3388 if (r < 0)
3389 return r;
cd2dfc6f
LP
3390 } else if (!arg_keep_unit) {
3391 r = allocate_scope(
3392 arg_machine,
3393 *pid,
3394 arg_slice,
3395 arg_custom_mounts, arg_n_custom_mounts,
3396 arg_kill_signal,
3397 arg_property);
3398 if (r < 0)
3399 return r;
3400
3401 } else if (arg_slice || arg_property)
3402 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3403
f0bef277 3404 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3405 if (r < 0)
3406 return r;
3407
3408 if (arg_keep_unit) {
3409 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3410 if (r < 0)
3411 return r;
3412 }
3413
3414 r = chown_cgroup(*pid, arg_uid_shift);
3415 if (r < 0)
3416 return r;
3417
3418 /* Notify the child that the parent is ready with all
3419 * its setup (including cgroup-ification), and that
3420 * the child can now hand over control to the code to
3421 * run inside the container. */
3422 (void) barrier_place(&barrier); /* #3 */
3423
3424 /* Block SIGCHLD here, before notifying child.
3425 * process_pty() will handle it with the other signals. */
3426 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3427
3428 /* Reset signal to default */
3429 r = default_signals(SIGCHLD, -1);
3430 if (r < 0)
3431 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3432
3433 r = sd_event_new(&event);
3434 if (r < 0)
3435 return log_error_errno(r, "Failed to get default event source: %m");
3436
5773024d 3437 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3438 if (r < 0)
3439 return r;
3440
3441 /* Let the child know that we are ready and wait that the child is completely ready now. */
3442 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3443 log_error("Child died too early.");
3444 return -ESRCH;
3445 }
3446
3447 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3448 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3449 etc_passwd_lock = safe_close(etc_passwd_lock);
3450
3451 sd_notifyf(false,
3452 "STATUS=Container running.\n"
3453 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3454 if (!arg_notify_ready)
3455 sd_notify(false, "READY=1\n");
3456
3457 if (arg_kill_signal > 0) {
3458 /* Try to kill the init system on SIGINT or SIGTERM */
3459 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3460 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3461 } else {
3462 /* Immediately exit */
3463 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3464 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3465 }
3466
6916b164
AU
3467 /* Exit when the child exits */
3468 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3469
3470 if (arg_expose_ports) {
3471 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3472 if (r < 0)
3473 return r;
3474
3475 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3476 }
3477
3478 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3479
3480 r = pty_forward_new(event, master,
3481 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3482 &forward);
3483 if (r < 0)
3484 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3485
3486 r = sd_event_loop(event);
3487 if (r < 0)
3488 return log_error_errno(r, "Failed to run event loop: %m");
3489
3490 pty_forward_get_last_char(forward, &last_char);
3491
3492 forward = pty_forward_free(forward);
3493
3494 if (!arg_quiet && last_char != '\n')
3495 putc('\n', stdout);
3496
3497 /* Kill if it is not dead yet anyway */
3498 if (arg_register && !arg_keep_unit)
3499 terminate_machine(*pid);
3500
3501 /* Normally redundant, but better safe than sorry */
c67b0082 3502 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3503
3504 r = wait_for_container(*pid, &container_status);
3505 *pid = 0;
3506
3507 if (r < 0)
3508 /* We failed to wait for the container, or the container exited abnormally. */
3509 return r;
3510 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3511 /* r > 0 → The container exited with a non-zero status.
3512 * As a special case, we need to replace 133 with a different value,
3513 * because 133 is special-cased in the service file to reboot the container.
3514 * otherwise → The container exited with zero status and a reboot was not requested.
3515 */
2a49b612 3516 if (r == EXIT_FORCE_RESTART)
27e29a1e 3517 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3518 *ret = r;
b0067625
ZJS
3519 return 0; /* finito */
3520 }
3521
3522 /* CONTAINER_REBOOTED, loop again */
3523
3524 if (arg_keep_unit) {
3525 /* Special handling if we are running as a service: instead of simply
3526 * restarting the machine we want to restart the entire service, so let's
3527 * inform systemd about this with the special exit code 133. The service
3528 * file uses RestartForceExitStatus=133 so that this results in a full
3529 * nspawn restart. This is necessary since we might have cgroup parameters
3530 * set we want to have flushed out. */
2a49b612
ZJS
3531 *ret = EXIT_FORCE_RESTART;
3532 return 0; /* finito */
b0067625
ZJS
3533 }
3534
3535 expose_port_flush(arg_expose_ports, exposed);
3536
3537 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3538 *veth_created = false;
3539 return 1; /* loop again */
3540}
3541
03cfe0d5
LP
3542int main(int argc, char *argv[]) {
3543
2d845785
LP
3544 _cleanup_free_ char *console = NULL;
3545 _cleanup_close_ int master = -1;
03cfe0d5 3546 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 3547 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 3548 char veth_name[IFNAMSIZ] = "";
17cbb288 3549 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 3550 pid_t pid = 0;
03cfe0d5
LP
3551 union in_addr_union exposed = {};
3552 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
3553 bool interactive, veth_created = false, remove_tmprootdir = false;
3554 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 3555 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
3556 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3557 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
3558
3559 log_parse_environment();
3560 log_open();
415fc41c 3561
7732f92b
LP
3562 /* Make sure rename_process() in the stub init process can work */
3563 saved_argv = argv;
3564 saved_argc = argc;
3565
03cfe0d5
LP
3566 r = parse_argv(argc, argv);
3567 if (r <= 0)
3568 goto finish;
3569
03cfe0d5
LP
3570 if (geteuid() != 0) {
3571 log_error("Need to be root.");
3572 r = -EPERM;
3573 goto finish;
3574 }
f757855e
LP
3575 r = determine_names();
3576 if (r < 0)
3577 goto finish;
3578
3579 r = load_settings();
3580 if (r < 0)
3581 goto finish;
3582
3583 r = verify_arguments();
3584 if (r < 0)
3585 goto finish;
03cfe0d5
LP
3586
3587 n_fd_passed = sd_listen_fds(false);
3588 if (n_fd_passed > 0) {
3589 r = fdset_new_listen_fds(&fds, false);
3590 if (r < 0) {
3591 log_error_errno(r, "Failed to collect file descriptors: %m");
3592 goto finish;
3593 }
3594 }
3595
3596 if (arg_directory) {
3597 assert(!arg_image);
3598
3599 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3600 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3601 r = -EINVAL;
3602 goto finish;
3603 }
3604
3605 if (arg_ephemeral) {
3606 _cleanup_free_ char *np = NULL;
3607
8d4aa2bb 3608 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
3609 if (r < 0)
3610 goto finish;
3611
03cfe0d5
LP
3612 /* If the specified path is a mount point we
3613 * generate the new snapshot immediately
3614 * inside it under a random name. However if
3615 * the specified is not a mount point we
3616 * create the new snapshot in the parent
3617 * directory, just next to it. */
e1873695 3618 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
3619 if (r < 0) {
3620 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3621 goto finish;
3622 }
3623 if (r > 0)
770b5ce4 3624 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3625 else
770b5ce4 3626 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 3627 if (r < 0) {
0f3be6ca 3628 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
3629 goto finish;
3630 }
3631
3632 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3633 if (r < 0) {
3634 log_error_errno(r, "Failed to lock %s: %m", np);
3635 goto finish;
3636 }
3637
17cbb288
LP
3638 r = btrfs_subvol_snapshot(arg_directory, np,
3639 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3640 BTRFS_SNAPSHOT_FALLBACK_COPY |
3641 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3642 BTRFS_SNAPSHOT_RECURSIVE |
3643 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3644 if (r < 0) {
3645 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3646 goto finish;
ec16945e
LP
3647 }
3648
3649 free(arg_directory);
3650 arg_directory = np;
8a16a7b4 3651 np = NULL;
ec16945e 3652
17cbb288 3653 remove_directory = true;
30535c16
LP
3654
3655 } else {
cb638b5e 3656 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
3657 if (r < 0)
3658 goto finish;
3659
30535c16
LP
3660 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3661 if (r == -EBUSY) {
3662 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3663 goto finish;
3664 }
3665 if (r < 0) {
3666 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3667 goto finish;
30535c16
LP
3668 }
3669
3670 if (arg_template) {
8d4aa2bb 3671 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
3672 if (r < 0)
3673 goto finish;
3674
17cbb288
LP
3675 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3676 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3677 BTRFS_SNAPSHOT_FALLBACK_COPY |
3678 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3679 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3680 BTRFS_SNAPSHOT_RECURSIVE |
3681 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3682 if (r == -EEXIST) {
3683 if (!arg_quiet)
3684 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3685 } else if (r < 0) {
83521414 3686 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3687 goto finish;
3688 } else {
3689 if (!arg_quiet)
3690 log_info("Populated %s from template %s.", arg_directory, arg_template);
3691 }
3692 }
ec16945e
LP
3693 }
3694
7732f92b 3695 if (arg_start_mode == START_BOOT) {
1b9e5b12 3696 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3697 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3698 r = -EINVAL;
1b9e5b12
LP
3699 goto finish;
3700 }
3701 } else {
3702 const char *p;
3703
16fb773e
LP
3704 p = strjoina(arg_directory, "/usr/");
3705 if (laccess(p, F_OK) < 0) {
3706 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3707 r = -EINVAL;
1b9e5b12 3708 goto finish;
1b9e5b12
LP
3709 }
3710 }
ec16945e 3711
6b9132a9 3712 } else {
ec16945e
LP
3713 assert(arg_image);
3714 assert(!arg_template);
3715
8d4aa2bb 3716 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
3717 if (r < 0)
3718 goto finish;
3719
0f3be6ca
LP
3720 if (arg_ephemeral) {
3721 _cleanup_free_ char *np = NULL;
3722
3723 r = tempfn_random(arg_image, "machine.", &np);
3724 if (r < 0) {
3725 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3726 goto finish;
3727 }
3728
3729 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3730 if (r < 0) {
3731 r = log_error_errno(r, "Failed to create image lock: %m");
3732 goto finish;
3733 }
3734
1c876927 3735 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
3736 if (r < 0) {
3737 r = log_error_errno(r, "Failed to copy image file: %m");
3738 goto finish;
3739 }
3740
3741 free(arg_image);
3742 arg_image = np;
3743 np = NULL;
3744
3745 remove_image = true;
3746 } else {
3747 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3748 if (r == -EBUSY) {
3749 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3750 goto finish;
3751 }
3752 if (r < 0) {
3753 r = log_error_errno(r, "Failed to create image lock: %m");
3754 goto finish;
3755 }
4623e8e6 3756
78ebe980
LP
3757 if (!arg_root_hash) {
3758 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3759 if (r < 0) {
3760 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3761 goto finish;
3762 }
3763 }
30535c16
LP
3764 }
3765
c67b0082 3766 if (!mkdtemp(tmprootdir)) {
0f3be6ca 3767 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 3768 goto finish;
1b9e5b12 3769 }
6b9132a9 3770
c67b0082
LP
3771 remove_tmprootdir = true;
3772
3773 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
3774 if (!arg_directory) {
3775 r = log_oom();
3776 goto finish;
6b9132a9 3777 }
88213476 3778
2d845785
LP
3779 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3780 if (r < 0) {
3781 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
3782 goto finish;
3783 }
1b9e5b12 3784
e0f9e7bd
LP
3785 r = dissect_image(
3786 loop->fd,
3787 arg_root_hash, arg_root_hash_size,
3788 DISSECT_IMAGE_REQUIRE_ROOT,
3789 &dissected_image);
2d845785
LP
3790 if (r == -ENOPKG) {
3791 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3792
3793 log_notice("Note that the disk image needs to\n"
3794 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3795 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3796 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3797 " d) or contain a file system without a partition table\n"
3798 "in order to be bootable with systemd-nspawn.");
1b9e5b12 3799 goto finish;
2d845785 3800 }
4623e8e6
LP
3801 if (r == -EADDRNOTAVAIL) {
3802 log_error_errno(r, "No root partition for specified root hash found.");
3803 goto finish;
3804 }
2d845785
LP
3805 if (r == -EOPNOTSUPP) {
3806 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3807 goto finish;
3808 }
3809 if (r < 0) {
3810 log_error_errno(r, "Failed to dissect image: %m");
842f3b0f
LP
3811 goto finish;
3812 }
1b9e5b12 3813
4623e8e6
LP
3814 if (!arg_root_hash && dissected_image->can_verity)
3815 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3816
3817 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
3818 if (r < 0)
3819 goto finish;
0f3be6ca
LP
3820
3821 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3822 if (remove_image && unlink(arg_image) >= 0)
3823 remove_image = false;
842f3b0f 3824 }
842f3b0f 3825
86c0dd4a 3826 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
3827 if (r < 0)
3828 goto finish;
bd15ab41
TH
3829
3830 r = detect_unified_cgroup_hierarchy(arg_directory);
3831 if (r < 0)
3832 goto finish;
5a8af538 3833
03cfe0d5
LP
3834 interactive =
3835 isatty(STDIN_FILENO) > 0 &&
3836 isatty(STDOUT_FILENO) > 0;
9c857b9d 3837
db7feb7e
LP
3838 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3839 if (master < 0) {
ec16945e 3840 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3841 goto finish;
3842 }
3843
611b312b
LP
3844 r = ptsname_malloc(master, &console);
3845 if (r < 0) {
3846 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3847 goto finish;
68b02049
DW
3848 }
3849
3850 if (arg_selinux_apifs_context) {
3851 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3852 if (r < 0)
3853 goto finish;
a258bf26
LP
3854 }
3855
a258bf26 3856 if (unlockpt(master) < 0) {
ec16945e 3857 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3858 goto finish;
3859 }
3860
9c857b9d
LP
3861 if (!arg_quiet)
3862 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3863 arg_machine, arg_image ?: arg_directory);
3864
72c0a2c2 3865 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3866
03cfe0d5
LP
3867 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3868 r = log_error_errno(errno, "Failed to become subreaper: %m");
3869 goto finish;
3870 }
3871
d87be9b0 3872 for (;;) {
b0067625
ZJS
3873 r = run(master,
3874 console,
2d845785 3875 dissected_image,
b0067625
ZJS
3876 interactive, secondary,
3877 fds,
3878 veth_name, &veth_created,
3879 &exposed,
3880 &pid, &ret);
3881 if (r <= 0)
d87be9b0 3882 break;
d87be9b0 3883 }
88213476
LP
3884
3885finish:
af4ec430 3886 sd_notify(false,
2a49b612
ZJS
3887 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3888 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 3889
9444b1f2 3890 if (pid > 0)
c67b0082 3891 (void) kill(pid, SIGKILL);
88213476 3892
503546da 3893 /* Try to flush whatever is still queued in the pty */
6a0f896b 3894 if (master >= 0) {
1c876927 3895 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
3896 master = safe_close(master);
3897 }
3898
3899 if (pid > 0)
3900 (void) wait_for_terminate(pid, NULL);
503546da 3901
17cbb288 3902 if (remove_directory && arg_directory) {
ec16945e
LP
3903 int k;
3904
17cbb288 3905 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 3906 if (k < 0)
17cbb288 3907 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
3908 }
3909
0f3be6ca
LP
3910 if (remove_image && arg_image) {
3911 if (unlink(arg_image) < 0)
3912 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3913 }
3914
c67b0082
LP
3915 if (remove_tmprootdir) {
3916 if (rmdir(tmprootdir) < 0)
3917 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3918 }
3919
785890ac
LP
3920 if (arg_machine) {
3921 const char *p;
3922
63c372cb 3923 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3924 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3925 }
3926
7a8f6325 3927 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
3928
3929 if (veth_created)
3930 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 3931 (void) remove_bridge(arg_network_zone);
f757855e 3932
04d391da 3933 free(arg_directory);
ec16945e
LP
3934 free(arg_template);
3935 free(arg_image);
7027ff61 3936 free(arg_machine);
c74e630d 3937 free(arg_user);
b53ede69
PW
3938 free(arg_pivot_root_new);
3939 free(arg_pivot_root_old);
5f932eb9 3940 free(arg_chdir);
c74e630d 3941 strv_free(arg_setenv);
f757855e 3942 free(arg_network_bridge);
c74e630d
LP
3943 strv_free(arg_network_interfaces);
3944 strv_free(arg_network_macvlan);
4bbfe7ad 3945 strv_free(arg_network_ipvlan);
f6d6bad1 3946 strv_free(arg_network_veth_extra);
f757855e
LP
3947 strv_free(arg_parameters);
3948 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3949 expose_port_free_all(arg_expose_ports);
4623e8e6 3950 free(arg_root_hash);
6d0b55c2 3951
ec16945e 3952 return r < 0 ? EXIT_FAILURE : ret;
88213476 3953}