]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
journalctl: add reference to sd-id128(3) to output (#5382)
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
6916b164 41#include <sys/wait.h>
8fe0087e 42#include <unistd.h>
1b9e5b12 43
b053cd5f 44#include "sd-bus.h"
1f0cd86b 45#include "sd-daemon.h"
1f0cd86b 46#include "sd-id128.h"
8fe0087e 47
b5efdb8a 48#include "alloc-util.h"
8fe0087e
LP
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
b053cd5f 53#include "bus-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
2d845785 59#include "dissect-image.h"
8fe0087e 60#include "env-util.h"
3ffd4af2 61#include "fd-util.h"
842f3b0f 62#include "fdset.h"
a5c32cff 63#include "fileio.h"
f97b34a6 64#include "format-util.h"
f4f15635 65#include "fs-util.h"
1b9e5b12 66#include "gpt.h"
4623e8e6 67#include "hexdecoct.h"
8fe0087e 68#include "hostname-util.h"
910fd145 69#include "id128-util.h"
8fe0087e 70#include "log.h"
2d845785 71#include "loop-util.h"
8fe0087e 72#include "loopback-setup.h"
1b9cebf6 73#include "machine-image.h"
8fe0087e
LP
74#include "macro.h"
75#include "missing.h"
76#include "mkdir.h"
4349cd7c 77#include "mount-util.h"
8fe0087e 78#include "netlink-util.h"
07630cea
LP
79#include "nspawn-cgroup.h"
80#include "nspawn-expose-ports.h"
81#include "nspawn-mount.h"
82#include "nspawn-network.h"
7336138e 83#include "nspawn-patch-uid.h"
07630cea 84#include "nspawn-register.h"
910fd145 85#include "nspawn-seccomp.h"
07630cea
LP
86#include "nspawn-settings.h"
87#include "nspawn-setuid.h"
7732f92b 88#include "nspawn-stub-pid1.h"
6bedfcbb 89#include "parse-util.h"
8fe0087e 90#include "path-util.h"
0b452006 91#include "process-util.h"
8fe0087e
LP
92#include "ptyfwd.h"
93#include "random-util.h"
8869a0b4 94#include "raw-clone.h"
8fe0087e 95#include "rm-rf.h"
68b02049 96#include "selinux-util.h"
8fe0087e 97#include "signal-util.h"
2583fbea 98#include "socket-util.h"
8fcde012 99#include "stat-util.h"
15a5e950 100#include "stdio-util.h"
07630cea 101#include "string-util.h"
8fe0087e
LP
102#include "strv.h"
103#include "terminal-util.h"
104#include "udev-util.h"
affb60b1 105#include "umask-util.h"
b1d4f8e1 106#include "user-util.h"
8fe0087e 107#include "util.h"
e9642be2 108
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
 * may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))

/* nspawn listens on the socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. This path is relative to the
 * container. The init process in the container can send messages to nspawn over this socket, following the
 * sd_notify(3) protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

#define EXIT_FORCE_RESTART 133
121
113cea80
DH
/* Status of a container, either terminated or rebooted.
 * NOTE(review): the code producing and consuming these values is outside this excerpt; confirm exact semantics. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
126
57fb9fb5
LP
/* Journal linking mode, selected on the command line with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host and --link-journal=try-host */
        LINK_GUEST      /* --link-journal=guest, --link-journal=try-guest and -j */
} LinkJournal;
88213476
LP
133
134static char *arg_directory = NULL;
ec16945e 135static char *arg_template = NULL;
5f932eb9 136static char *arg_chdir = NULL;
b53ede69
PW
137static char *arg_pivot_root_new = NULL;
138static char *arg_pivot_root_old = NULL;
687d0825 139static char *arg_user = NULL;
9444b1f2 140static sd_id128_t arg_uuid = {};
7027ff61 141static char *arg_machine = NULL;
c74e630d
LP
142static const char *arg_selinux_context = NULL;
143static const char *arg_selinux_apifs_context = NULL;
9444b1f2 144static const char *arg_slice = NULL;
ff01d048 145static bool arg_private_network = false;
bc2f673e 146static bool arg_read_only = false;
7732f92b 147static StartMode arg_start_mode = START_PID1;
ec16945e 148static bool arg_ephemeral = false;
57fb9fb5 149static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 150static bool arg_link_journal_try = false;
520e0d54 151static uint64_t arg_caps_retain =
50b52222
LP
152 (1ULL << CAP_AUDIT_CONTROL) |
153 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
154 (1ULL << CAP_CHOWN) |
155 (1ULL << CAP_DAC_OVERRIDE) |
156 (1ULL << CAP_DAC_READ_SEARCH) |
157 (1ULL << CAP_FOWNER) |
158 (1ULL << CAP_FSETID) |
159 (1ULL << CAP_IPC_OWNER) |
160 (1ULL << CAP_KILL) |
161 (1ULL << CAP_LEASE) |
162 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 163 (1ULL << CAP_MKNOD) |
5076f0cc
LP
164 (1ULL << CAP_NET_BIND_SERVICE) |
165 (1ULL << CAP_NET_BROADCAST) |
166 (1ULL << CAP_NET_RAW) |
5076f0cc 167 (1ULL << CAP_SETFCAP) |
50b52222 168 (1ULL << CAP_SETGID) |
5076f0cc
LP
169 (1ULL << CAP_SETPCAP) |
170 (1ULL << CAP_SETUID) |
171 (1ULL << CAP_SYS_ADMIN) |
50b52222 172 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
173 (1ULL << CAP_SYS_CHROOT) |
174 (1ULL << CAP_SYS_NICE) |
175 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 176 (1ULL << CAP_SYS_RESOURCE) |
50b52222 177 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
178static CustomMount *arg_custom_mounts = NULL;
179static unsigned arg_n_custom_mounts = 0;
f4889f65 180static char **arg_setenv = NULL;
284c0b91 181static bool arg_quiet = false;
eb91eb18 182static bool arg_register = true;
89f7c846 183static bool arg_keep_unit = false;
aa28aefe 184static char **arg_network_interfaces = NULL;
c74e630d 185static char **arg_network_macvlan = NULL;
4bbfe7ad 186static char **arg_network_ipvlan = NULL;
69c79d3c 187static bool arg_network_veth = false;
f6d6bad1 188static char **arg_network_veth_extra = NULL;
f757855e 189static char *arg_network_bridge = NULL;
22b28dfd 190static char *arg_network_zone = NULL;
050f7277 191static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 192static char *arg_image = NULL;
f757855e 193static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 194static ExposePort *arg_expose_ports = NULL;
f36933fe 195static char **arg_property = NULL;
0de7acce 196static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 197static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 198static bool arg_userns_chown = false;
c6c8f6e2 199static int arg_kill_signal = 0;
5da38d07 200static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
201static SettingsMask arg_settings_mask = 0;
202static int arg_settings_trusted = -1;
203static char **arg_parameters = NULL;
6aadfa4c 204static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 205static bool arg_notify_ready = false;
5a8ff0e6 206static bool arg_use_cgns = true;
0c582db0 207static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 208static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
209static void *arg_root_hash = NULL;
210static size_t arg_root_hash_size = 0;
88213476 211
601185b4 212static void help(void) {
88213476
LP
213 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
214 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
215 " -h --help Show this help\n"
216 " --version Print version string\n"
69c79d3c 217 " -q --quiet Do not show status information\n"
1b9e5b12 218 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
219 " --template=PATH Initialize root directory from template directory,\n"
220 " if missing\n"
221 " -x --ephemeral Run container with snapshot of root directory, and\n"
222 " remove it after exit\n"
223 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 224 " --root-hash=HASH Specify verity root hash\n"
7732f92b 225 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 226 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 227 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
228 " --pivot-root=PATH[:PATH]\n"
229 " Pivot root to given directory in the container\n"
a8828ed9 230 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 231 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 232 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 233 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 234 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 235 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 236 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 237 " Similar, but with user configured UID/GID range\n"
24597ee0 238 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
239 " --private-network Disable network in container\n"
240 " --network-interface=INTERFACE\n"
241 " Assign an existing network interface to the\n"
242 " container\n"
c74e630d
LP
243 " --network-macvlan=INTERFACE\n"
244 " Create a macvlan network interface based on an\n"
245 " existing network interface to the container\n"
4bbfe7ad
TG
246 " --network-ipvlan=INTERFACE\n"
247 " Create a ipvlan network interface based on an\n"
248 " existing network interface to the container\n"
a8eaaee7 249 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 250 " and container\n"
f6d6bad1
LP
251 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
252 " Add an additional virtual Ethernet link between\n"
253 " host and container\n"
ab046dde 254 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
255 " Add a virtual Ethernet connection to the container\n"
256 " and attach it to an existing bridge on the host\n"
257 " --network-zone=NAME Similar, but attach the new interface to an\n"
258 " an automatically managed bridge interface\n"
6d0b55c2 259 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 260 " Expose a container IP port on the host\n"
82adf6af
LP
261 " -Z --selinux-context=SECLABEL\n"
262 " Set the SELinux security context to be used by\n"
263 " processes in the container\n"
264 " -L --selinux-apifs-context=SECLABEL\n"
265 " Set the SELinux security context to be used by\n"
266 " API/tmpfs file systems in the container\n"
a8828ed9
DW
267 " --capability=CAP In addition to the default, retain specified\n"
268 " capability\n"
269 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 270 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
271 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
272 " host, try-guest, try-host\n"
574edc90 273 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 274 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
275 " --bind=PATH[:PATH[:OPTIONS]]\n"
276 " Bind mount a file or directory from the host into\n"
a8828ed9 277 " the container\n"
5e5bfa6e
EY
278 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
279 " Similar, but creates a read-only bind mount\n"
06c17c39 280 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
281 " --overlay=PATH[:PATH...]:PATH\n"
282 " Create an overlay mount from the host to \n"
283 " the container\n"
284 " --overlay-ro=PATH[:PATH...]:PATH\n"
285 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 286 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 287 " --register=BOOLEAN Register container as machine\n"
89f7c846 288 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 289 " the service unit nspawn is running in\n"
6d0b55c2 290 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 291 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 292 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 293 , program_invocation_short_name);
88213476
LP
294}
295
86c0dd4a 296static int custom_mount_check_all(void) {
5a8af538 297 unsigned i;
5a8af538 298
5a8af538
LP
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
0de7acce 302 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
303
304 if (arg_userns_chown) {
305 log_error("--private-users-chown may not be combined with custom root mounts.");
306 return -EINVAL;
307 } else if (arg_uid_shift == UID_INVALID) {
308 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
309 return -EINVAL;
310 }
825d5287 311 }
5a8af538
LP
312 }
313
314 return 0;
315}
316
0fd9563f 317static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 318 const char *e;
5da38d07
TH
319 int r, all_unified, systemd_unified;
320
efdb0237
LP
321 /* Allow the user to control whether the unified hierarchy is used */
322 e = getenv("UNIFIED_CGROUP_HIERARCHY");
323 if (e) {
324 r = parse_boolean(e);
325 if (r < 0)
326 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
327 if (r > 0)
328 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
329 else
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 331
efdb0237
LP
332 return 0;
333 }
334
98afd6af
ZJS
335 all_unified = cg_all_unified();
336 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
337
338 if (all_unified < 0 || systemd_unified < 0)
339 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
340 "Failed to determine whether the unified cgroups hierarchy is used: %m");
341
efdb0237 342 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
343 if (all_unified > 0) {
344 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
345 * routine only detects 231, so we'll have a false negative here for 230. */
346 r = systemd_installation_has_version(directory, 230);
347 if (r < 0)
348 return log_error_errno(r, "Failed to determine systemd version in container: %m");
349 if (r > 0)
350 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
351 else
352 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
353 } else if (systemd_unified > 0) {
354 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
355 r = systemd_installation_has_version(directory, 232);
356 if (r < 0)
357 return log_error_errno(r, "Failed to determine systemd version in container: %m");
358 if (r > 0)
359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
360 else
361 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
362 } else
5da38d07 363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 364
efdb0237
LP
365 return 0;
366}
367
0c582db0
LB
368static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
369 int r;
370
371 r = getenv_bool(name);
372 if (r == -ENXIO)
373 return;
374 if (r < 0)
375 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
376 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
377}
378
4f086aab
SU
379static void parse_mount_settings_env(void) {
380 int r;
381 const char *e;
382
383 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
384 if (!e)
385 return;
386
387 if (streq(e, "network")) {
388 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
389 return;
390 }
391
392 r = parse_boolean(e);
393 if (r < 0) {
394 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
395 return;
396 } else if (r > 0)
397 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
398 else
399 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
400
401 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
402}
403
88213476
LP
404static int parse_argv(int argc, char *argv[]) {
405
a41fe3a2 406 enum {
acbeb427
ZJS
407 ARG_VERSION = 0x100,
408 ARG_PRIVATE_NETWORK,
bc2f673e 409 ARG_UUID,
5076f0cc 410 ARG_READ_ONLY,
57fb9fb5 411 ARG_CAPABILITY,
420c7379 412 ARG_DROP_CAPABILITY,
17fe0523
LP
413 ARG_LINK_JOURNAL,
414 ARG_BIND,
f4889f65 415 ARG_BIND_RO,
06c17c39 416 ARG_TMPFS,
5a8af538
LP
417 ARG_OVERLAY,
418 ARG_OVERLAY_RO,
eb91eb18 419 ARG_SHARE_SYSTEM,
89f7c846 420 ARG_REGISTER,
aa28aefe 421 ARG_KEEP_UNIT,
69c79d3c 422 ARG_NETWORK_INTERFACE,
c74e630d 423 ARG_NETWORK_MACVLAN,
4bbfe7ad 424 ARG_NETWORK_IPVLAN,
ab046dde 425 ARG_NETWORK_BRIDGE,
22b28dfd 426 ARG_NETWORK_ZONE,
f6d6bad1 427 ARG_NETWORK_VETH_EXTRA,
6afc95b7 428 ARG_PERSONALITY,
4d9f07b4 429 ARG_VOLATILE,
ec16945e 430 ARG_TEMPLATE,
f36933fe 431 ARG_PROPERTY,
6dac160c 432 ARG_PRIVATE_USERS,
c6c8f6e2 433 ARG_KILL_SIGNAL,
f757855e 434 ARG_SETTINGS,
5f932eb9 435 ARG_CHDIR,
b53ede69 436 ARG_PIVOT_ROOT,
7336138e 437 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 438 ARG_NOTIFY_READY,
4623e8e6 439 ARG_ROOT_HASH,
a41fe3a2
LP
440 };
441
88213476 442 static const struct option options[] = {
27eb8e90
ZJS
443 { "help", no_argument, NULL, 'h' },
444 { "version", no_argument, NULL, ARG_VERSION },
445 { "directory", required_argument, NULL, 'D' },
446 { "template", required_argument, NULL, ARG_TEMPLATE },
447 { "ephemeral", no_argument, NULL, 'x' },
448 { "user", required_argument, NULL, 'u' },
449 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
450 { "as-pid2", no_argument, NULL, 'a' },
451 { "boot", no_argument, NULL, 'b' },
452 { "uuid", required_argument, NULL, ARG_UUID },
453 { "read-only", no_argument, NULL, ARG_READ_ONLY },
454 { "capability", required_argument, NULL, ARG_CAPABILITY },
455 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
456 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
457 { "bind", required_argument, NULL, ARG_BIND },
458 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
459 { "tmpfs", required_argument, NULL, ARG_TMPFS },
460 { "overlay", required_argument, NULL, ARG_OVERLAY },
461 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
462 { "machine", required_argument, NULL, 'M' },
463 { "slice", required_argument, NULL, 'S' },
464 { "setenv", required_argument, NULL, 'E' },
465 { "selinux-context", required_argument, NULL, 'Z' },
466 { "selinux-apifs-context", required_argument, NULL, 'L' },
467 { "quiet", no_argument, NULL, 'q' },
468 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
469 { "register", required_argument, NULL, ARG_REGISTER },
470 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
471 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
472 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
473 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
474 { "network-veth", no_argument, NULL, 'n' },
475 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
476 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
477 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
478 { "personality", required_argument, NULL, ARG_PERSONALITY },
479 { "image", required_argument, NULL, 'i' },
480 { "volatile", optional_argument, NULL, ARG_VOLATILE },
481 { "port", required_argument, NULL, 'p' },
482 { "property", required_argument, NULL, ARG_PROPERTY },
483 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
484 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
485 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
486 { "settings", required_argument, NULL, ARG_SETTINGS },
487 { "chdir", required_argument, NULL, ARG_CHDIR },
b53ede69 488 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
27eb8e90 489 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
4623e8e6 490 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
eb9da376 491 {}
88213476
LP
492 };
493
9444b1f2 494 int c, r;
6aadfa4c 495 const char *p, *e;
a42c8b54 496 uint64_t plus = 0, minus = 0;
f757855e 497 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
498
499 assert(argc >= 0);
500 assert(argv);
501
2e1f244e 502 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
503
504 switch (c) {
505
506 case 'h':
601185b4
ZJS
507 help();
508 return 0;
88213476 509
acbeb427 510 case ARG_VERSION:
3f6fd1ba 511 return version();
acbeb427 512
88213476 513 case 'D':
0f03c2a4 514 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 515 if (r < 0)
0f03c2a4 516 return r;
ec16945e
LP
517 break;
518
519 case ARG_TEMPLATE:
0f03c2a4 520 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 521 if (r < 0)
0f03c2a4 522 return r;
88213476
LP
523 break;
524
1b9e5b12 525 case 'i':
0f03c2a4 526 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 527 if (r < 0)
0f03c2a4 528 return r;
ec16945e
LP
529 break;
530
531 case 'x':
532 arg_ephemeral = true;
1b9e5b12
LP
533 break;
534
687d0825 535 case 'u':
2fc09a9c
DM
536 r = free_and_strdup(&arg_user, optarg);
537 if (r < 0)
7027ff61 538 return log_oom();
687d0825 539
f757855e 540 arg_settings_mask |= SETTING_USER;
687d0825
MV
541 break;
542
22b28dfd
LP
543 case ARG_NETWORK_ZONE: {
544 char *j;
545
546 j = strappend("vz-", optarg);
547 if (!j)
548 return log_oom();
549
550 if (!ifname_valid(j)) {
551 log_error("Network zone name not valid: %s", j);
552 free(j);
553 return -EINVAL;
554 }
555
556 free(arg_network_zone);
557 arg_network_zone = j;
558
559 arg_network_veth = true;
560 arg_private_network = true;
561 arg_settings_mask |= SETTING_NETWORK;
562 break;
563 }
564
ab046dde 565 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
566
567 if (!ifname_valid(optarg)) {
568 log_error("Bridge interface name not valid: %s", optarg);
569 return -EINVAL;
570 }
571
f757855e
LP
572 r = free_and_strdup(&arg_network_bridge, optarg);
573 if (r < 0)
574 return log_oom();
ab046dde
TG
575
576 /* fall through */
577
0dfaa006 578 case 'n':
69c79d3c
LP
579 arg_network_veth = true;
580 arg_private_network = true;
f757855e 581 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
582 break;
583
f6d6bad1
LP
584 case ARG_NETWORK_VETH_EXTRA:
585 r = veth_extra_parse(&arg_network_veth_extra, optarg);
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
588
589 arg_private_network = true;
590 arg_settings_mask |= SETTING_NETWORK;
591 break;
592
aa28aefe 593 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
594
595 if (!ifname_valid(optarg)) {
596 log_error("Network interface name not valid: %s", optarg);
597 return -EINVAL;
598 }
599
c74e630d
LP
600 if (strv_extend(&arg_network_interfaces, optarg) < 0)
601 return log_oom();
602
603 arg_private_network = true;
f757855e 604 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
605 break;
606
607 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
608
609 if (!ifname_valid(optarg)) {
610 log_error("MACVLAN network interface name not valid: %s", optarg);
611 return -EINVAL;
612 }
613
c74e630d 614 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
615 return log_oom();
616
4bbfe7ad 617 arg_private_network = true;
f757855e 618 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
619 break;
620
621 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
622
623 if (!ifname_valid(optarg)) {
624 log_error("IPVLAN network interface name not valid: %s", optarg);
625 return -EINVAL;
626 }
627
4bbfe7ad
TG
628 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
629 return log_oom();
630
aa28aefe
LP
631 /* fall through */
632
ff01d048
LP
633 case ARG_PRIVATE_NETWORK:
634 arg_private_network = true;
f757855e 635 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
636 break;
637
0f0dbc46 638 case 'b':
7732f92b
LP
639 if (arg_start_mode == START_PID2) {
640 log_error("--boot and --as-pid2 may not be combined.");
641 return -EINVAL;
642 }
643
644 arg_start_mode = START_BOOT;
645 arg_settings_mask |= SETTING_START_MODE;
646 break;
647
648 case 'a':
649 if (arg_start_mode == START_BOOT) {
650 log_error("--boot and --as-pid2 may not be combined.");
651 return -EINVAL;
652 }
653
654 arg_start_mode = START_PID2;
655 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
656 break;
657
144f0fc0 658 case ARG_UUID:
9444b1f2 659 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
660 if (r < 0)
661 return log_error_errno(r, "Invalid UUID: %s", optarg);
662
663 if (sd_id128_is_null(arg_uuid)) {
664 log_error("Machine UUID may not be all zeroes.");
665 return -EINVAL;
aa96c6cb 666 }
f757855e
LP
667
668 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 669 break;
aa96c6cb 670
9444b1f2 671 case 'S':
c74e630d 672 arg_slice = optarg;
144f0fc0
LP
673 break;
674
7027ff61 675 case 'M':
c1521918 676 if (isempty(optarg))
97b11eed 677 arg_machine = mfree(arg_machine);
c1521918 678 else {
0c3c4284 679 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
680 log_error("Invalid machine name: %s", optarg);
681 return -EINVAL;
682 }
7027ff61 683
0c3c4284
LP
684 r = free_and_strdup(&arg_machine, optarg);
685 if (r < 0)
eb91eb18 686 return log_oom();
eb91eb18 687 }
9ce6d1b3 688 break;
7027ff61 689
82adf6af
LP
690 case 'Z':
691 arg_selinux_context = optarg;
a8828ed9
DW
692 break;
693
82adf6af
LP
694 case 'L':
695 arg_selinux_apifs_context = optarg;
a8828ed9
DW
696 break;
697
bc2f673e
LP
698 case ARG_READ_ONLY:
699 arg_read_only = true;
f757855e 700 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
701 break;
702
420c7379
LP
703 case ARG_CAPABILITY:
704 case ARG_DROP_CAPABILITY: {
6cbe4ed1 705 p = optarg;
9ed794a3 706 for (;;) {
6cbe4ed1 707 _cleanup_free_ char *t = NULL;
5076f0cc 708
6cbe4ed1
SS
709 r = extract_first_word(&p, &t, ",", 0);
710 if (r < 0)
711 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 712
6cbe4ed1
SS
713 if (r == 0)
714 break;
5076f0cc 715
39ed67d1
LP
716 if (streq(t, "all")) {
717 if (c == ARG_CAPABILITY)
a42c8b54 718 plus = (uint64_t) -1;
39ed67d1 719 else
a42c8b54 720 minus = (uint64_t) -1;
39ed67d1 721 } else {
2822da4f
LP
722 int cap;
723
724 cap = capability_from_name(t);
725 if (cap < 0) {
39ed67d1
LP
726 log_error("Failed to parse capability %s.", t);
727 return -EINVAL;
728 }
729
730 if (c == ARG_CAPABILITY)
a42c8b54 731 plus |= 1ULL << (uint64_t) cap;
39ed67d1 732 else
a42c8b54 733 minus |= 1ULL << (uint64_t) cap;
5076f0cc 734 }
5076f0cc
LP
735 }
736
f757855e 737 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
738 break;
739 }
740
57fb9fb5
LP
741 case 'j':
742 arg_link_journal = LINK_GUEST;
574edc90 743 arg_link_journal_try = true;
57fb9fb5
LP
744 break;
745
746 case ARG_LINK_JOURNAL:
53e438e3 747 if (streq(optarg, "auto")) {
57fb9fb5 748 arg_link_journal = LINK_AUTO;
53e438e3
LP
749 arg_link_journal_try = false;
750 } else if (streq(optarg, "no")) {
57fb9fb5 751 arg_link_journal = LINK_NO;
53e438e3
LP
752 arg_link_journal_try = false;
753 } else if (streq(optarg, "guest")) {
57fb9fb5 754 arg_link_journal = LINK_GUEST;
53e438e3
LP
755 arg_link_journal_try = false;
756 } else if (streq(optarg, "host")) {
57fb9fb5 757 arg_link_journal = LINK_HOST;
53e438e3
LP
758 arg_link_journal_try = false;
759 } else if (streq(optarg, "try-guest")) {
574edc90
MP
760 arg_link_journal = LINK_GUEST;
761 arg_link_journal_try = true;
762 } else if (streq(optarg, "try-host")) {
763 arg_link_journal = LINK_HOST;
764 arg_link_journal_try = true;
765 } else {
57fb9fb5
LP
766 log_error("Failed to parse link journal mode %s", optarg);
767 return -EINVAL;
768 }
769
770 break;
771
17fe0523 772 case ARG_BIND:
f757855e
LP
773 case ARG_BIND_RO:
774 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
775 if (r < 0)
776 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 777
f757855e 778 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 779 break;
06c17c39 780
f757855e
LP
781 case ARG_TMPFS:
782 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
783 if (r < 0)
784 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 785
f757855e 786 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 787 break;
5a8af538
LP
788
789 case ARG_OVERLAY:
ad85779a
LP
790 case ARG_OVERLAY_RO:
791 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
792 if (r == -EADDRNOTAVAIL)
793 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
794 if (r < 0)
795 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 796
f757855e 797 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 798 break;
06c17c39 799
a5f1cb3b 800 case 'E': {
f4889f65
LP
801 char **n;
802
803 if (!env_assignment_is_valid(optarg)) {
804 log_error("Environment variable assignment '%s' is not valid.", optarg);
805 return -EINVAL;
806 }
807
808 n = strv_env_set(arg_setenv, optarg);
809 if (!n)
810 return log_oom();
811
812 strv_free(arg_setenv);
813 arg_setenv = n;
f757855e
LP
814
815 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
816 break;
817 }
818
284c0b91
LP
819 case 'q':
820 arg_quiet = true;
821 break;
822
8a96d94e 823 case ARG_SHARE_SYSTEM:
a6b5216c 824 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
825 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
826 arg_clone_ns_flags = 0;
8a96d94e
LP
827 break;
828
eb91eb18
LP
829 case ARG_REGISTER:
830 r = parse_boolean(optarg);
831 if (r < 0) {
832 log_error("Failed to parse --register= argument: %s", optarg);
833 return r;
834 }
835
836 arg_register = r;
837 break;
838
89f7c846
LP
839 case ARG_KEEP_UNIT:
840 arg_keep_unit = true;
841 break;
842
6afc95b7
LP
843 case ARG_PERSONALITY:
844
ac45f971 845 arg_personality = personality_from_string(optarg);
050f7277 846 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
847 log_error("Unknown or unsupported personality '%s'.", optarg);
848 return -EINVAL;
849 }
850
f757855e 851 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
852 break;
853
4d9f07b4
LP
854 case ARG_VOLATILE:
855
856 if (!optarg)
f757855e 857 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 858 else {
f757855e 859 VolatileMode m;
4d9f07b4 860
f757855e
LP
861 m = volatile_mode_from_string(optarg);
862 if (m < 0) {
863 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 864 return -EINVAL;
f757855e
LP
865 } else
866 arg_volatile_mode = m;
6d0b55c2
LP
867 }
868
f757855e
LP
869 arg_settings_mask |= SETTING_VOLATILE_MODE;
870 break;
6d0b55c2 871
f757855e
LP
872 case 'p':
873 r = expose_port_parse(&arg_expose_ports, optarg);
874 if (r == -EEXIST)
875 return log_error_errno(r, "Duplicate port specification: %s", optarg);
876 if (r < 0)
877 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 878
f757855e 879 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 880 break;
6d0b55c2 881
f36933fe
LP
882 case ARG_PROPERTY:
883 if (strv_extend(&arg_property, optarg) < 0)
884 return log_oom();
885
886 break;
887
ae209204
ZJS
888 case ARG_PRIVATE_USERS: {
889 int boolean = -1;
0de7acce 890
ae209204
ZJS
891 if (!optarg)
892 boolean = true;
893 else if (!in_charset(optarg, DIGITS))
894 /* do *not* parse numbers as booleans */
895 boolean = parse_boolean(optarg);
896
897 if (boolean == false) {
0de7acce
LP
898 /* no: User namespacing off */
899 arg_userns_mode = USER_NAMESPACE_NO;
900 arg_uid_shift = UID_INVALID;
901 arg_uid_range = UINT32_C(0x10000);
ae209204 902 } else if (boolean == true) {
0de7acce
LP
903 /* yes: User namespacing on, UID range is read from root dir */
904 arg_userns_mode = USER_NAMESPACE_FIXED;
905 arg_uid_shift = UID_INVALID;
906 arg_uid_range = UINT32_C(0x10000);
907 } else if (streq(optarg, "pick")) {
908 /* pick: User namespacing on, UID range is picked randomly */
909 arg_userns_mode = USER_NAMESPACE_PICK;
910 arg_uid_shift = UID_INVALID;
911 arg_uid_range = UINT32_C(0x10000);
912 } else {
6c2058b3 913 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
914 const char *range, *shift;
915
0de7acce
LP
916 /* anything else: User namespacing on, UID range is explicitly configured */
917
6dac160c
LP
918 range = strchr(optarg, ':');
919 if (range) {
6c2058b3
ZJS
920 buffer = strndup(optarg, range - optarg);
921 if (!buffer)
922 return log_oom();
923 shift = buffer;
6dac160c
LP
924
925 range++;
bfd292ec
ZJS
926 r = safe_atou32(range, &arg_uid_range);
927 if (r < 0)
be715731 928 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
929 } else
930 shift = optarg;
931
be715731
ZJS
932 r = parse_uid(shift, &arg_uid_shift);
933 if (r < 0)
934 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
935
936 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
937 }
938
be715731
ZJS
939 if (arg_uid_range <= 0) {
940 log_error("UID range cannot be 0.");
941 return -EINVAL;
942 }
943
0de7acce 944 arg_settings_mask |= SETTING_USERNS;
6dac160c 945 break;
ae209204 946 }
6dac160c 947
0de7acce 948 case 'U':
ccabee0d
LP
949 if (userns_supported()) {
950 arg_userns_mode = USER_NAMESPACE_PICK;
951 arg_uid_shift = UID_INVALID;
952 arg_uid_range = UINT32_C(0x10000);
953
954 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
955 }
956
7336138e
LP
957 break;
958
0de7acce 959 case ARG_PRIVATE_USERS_CHOWN:
19aac838 960 arg_userns_chown = true;
0de7acce
LP
961
962 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
963 break;
964
c6c8f6e2
LP
965 case ARG_KILL_SIGNAL:
966 arg_kill_signal = signal_from_string_try_harder(optarg);
967 if (arg_kill_signal < 0) {
968 log_error("Cannot parse signal: %s", optarg);
969 return -EINVAL;
970 }
971
f757855e
LP
972 arg_settings_mask |= SETTING_KILL_SIGNAL;
973 break;
974
975 case ARG_SETTINGS:
976
977 /* no → do not read files
978 * yes → read files, do not override cmdline, trust only subset
979 * override → read files, override cmdline, trust only subset
980 * trusted → read files, do not override cmdline, trust all
981 */
982
983 r = parse_boolean(optarg);
984 if (r < 0) {
985 if (streq(optarg, "trusted")) {
986 mask_all_settings = false;
987 mask_no_settings = false;
988 arg_settings_trusted = true;
989
990 } else if (streq(optarg, "override")) {
991 mask_all_settings = false;
992 mask_no_settings = true;
993 arg_settings_trusted = -1;
994 } else
995 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
996 } else if (r > 0) {
997 /* yes */
998 mask_all_settings = false;
999 mask_no_settings = false;
1000 arg_settings_trusted = -1;
1001 } else {
1002 /* no */
1003 mask_all_settings = true;
1004 mask_no_settings = false;
1005 arg_settings_trusted = false;
1006 }
1007
c6c8f6e2
LP
1008 break;
1009
5f932eb9
LP
1010 case ARG_CHDIR:
1011 if (!path_is_absolute(optarg)) {
1012 log_error("Working directory %s is not an absolute path.", optarg);
1013 return -EINVAL;
1014 }
1015
1016 r = free_and_strdup(&arg_chdir, optarg);
1017 if (r < 0)
1018 return log_oom();
1019
1020 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1021 break;
1022
b53ede69
PW
1023 case ARG_PIVOT_ROOT:
1024 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1025 if (r < 0)
1026 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1027
1028 arg_settings_mask |= SETTING_PIVOT_ROOT;
1029 break;
1030
9c1e04d0
AP
1031 case ARG_NOTIFY_READY:
1032 r = parse_boolean(optarg);
1033 if (r < 0) {
1034 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1035 return -EINVAL;
1036 }
1037 arg_notify_ready = r;
1038 arg_settings_mask |= SETTING_NOTIFY_READY;
1039 break;
1040
4623e8e6
LP
1041 case ARG_ROOT_HASH: {
1042 void *k;
1043 size_t l;
1044
1045 r = unhexmem(optarg, strlen(optarg), &k, &l);
1046 if (r < 0)
1047 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1048 if (l < sizeof(sd_id128_t)) {
1049 log_error("Root hash must be at least 128bit long: %s", optarg);
1050 free(k);
1051 return -EINVAL;
1052 }
1053
1054 free(arg_root_hash);
1055 arg_root_hash = k;
1056 arg_root_hash_size = l;
1057 break;
1058 }
1059
88213476
LP
1060 case '?':
1061 return -EINVAL;
1062
1063 default:
eb9da376 1064 assert_not_reached("Unhandled option");
88213476 1065 }
88213476 1066
0c582db0
LB
1067 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1068 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1069 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1070 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1071
4f086aab
SU
1072 if (arg_userns_mode != USER_NAMESPACE_NO)
1073 arg_mount_settings |= MOUNT_USE_USERNS;
1074
1075 if (arg_private_network)
1076 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1077
1078 parse_mount_settings_env();
1079
48a8d337
LB
1080 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1081 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1082 arg_register = false;
0c582db0
LB
1083 if (arg_start_mode != START_PID1) {
1084 log_error("--boot cannot be used without namespacing.");
1085 return -EINVAL;
1086 }
1087 }
eb91eb18 1088
0de7acce 1089 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1090 arg_userns_chown = true;
1091
89f7c846
LP
1092 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1093 log_error("--keep-unit may not be used when invoked from a user session.");
1094 return -EINVAL;
1095 }
1096
1b9e5b12
LP
1097 if (arg_directory && arg_image) {
1098 log_error("--directory= and --image= may not be combined.");
1099 return -EINVAL;
1100 }
1101
ec16945e
LP
1102 if (arg_template && arg_image) {
1103 log_error("--template= and --image= may not be combined.");
1104 return -EINVAL;
1105 }
1106
8cd328d8
LP
1107 if (arg_ephemeral && arg_template && !arg_directory) {
1108 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1109 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1110 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1111 * --directory=". */
1112
1113 arg_directory = arg_template;
1114 arg_template = NULL;
1115 }
1116
ec16945e
LP
1117 if (arg_template && !(arg_directory || arg_machine)) {
1118 log_error("--template= needs --directory= or --machine=.");
1119 return -EINVAL;
1120 }
1121
1122 if (arg_ephemeral && arg_template) {
1123 log_error("--ephemeral and --template= may not be combined.");
1124 return -EINVAL;
1125 }
1126
df9a75e4
LP
1127 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1128 log_error("--ephemeral and --link-journal= may not be combined.");
1129 return -EINVAL;
1130 }
1131
ccabee0d 1132 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1133 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1134 return -EOPNOTSUPP;
1135 }
1136
1137 if (arg_userns_chown && arg_read_only) {
1138 log_error("--read-only and --private-users-chown may not be combined.");
1139 return -EINVAL;
1140 }
f757855e 1141
22b28dfd
LP
1142 if (arg_network_bridge && arg_network_zone) {
1143 log_error("--network-bridge= and --network-zone= may not be combined.");
1144 return -EINVAL;
1145 }
1146
f757855e
LP
1147 if (argc > optind) {
1148 arg_parameters = strv_copy(argv + optind);
1149 if (!arg_parameters)
1150 return log_oom();
1151
7732f92b 1152 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1153 }
1154
1155 /* Load all settings from .nspawn files */
1156 if (mask_no_settings)
1157 arg_settings_mask = 0;
1158
1159 /* Don't load any settings from .nspawn files */
1160 if (mask_all_settings)
1161 arg_settings_mask = _SETTINGS_MASK_ALL;
1162
520e0d54 1163 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1164
6aadfa4c
ILG
1165 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1166 if (e)
1167 arg_container_service_name = e;
1168
5a8ff0e6
CB
1169 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1170 if (r < 0)
1171 arg_use_cgns = cg_ns_supported();
1172 else
1173 arg_use_cgns = r;
1174
86c0dd4a
LP
1175 r = custom_mount_check_all();
1176 if (r < 0)
1177 return r;
1178
f757855e
LP
1179 return 1;
1180}
1181
1182static int verify_arguments(void) {
4f086aab
SU
1183 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1184 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1185 return -EINVAL;
1186 }
1187
1188 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1189 log_error("Cannot combine --private-users with read-write mounts.");
1190 return -EINVAL;
1191 }
f757855e
LP
1192
1193 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1194 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1195 return -EINVAL;
1196 }
1197
6d0b55c2
LP
1198 if (arg_expose_ports && !arg_private_network) {
1199 log_error("Cannot use --port= without private networking.");
1200 return -EINVAL;
1201 }
1202
1c1ea217
EV
1203#ifndef HAVE_LIBIPTC
1204 if (arg_expose_ports) {
1205 log_error("--port= is not supported, compiled without libiptc support.");
1206 return -EOPNOTSUPP;
1207 }
1208#endif
1209
7732f92b 1210 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1211 arg_kill_signal = SIGRTMIN+3;
1212
f757855e 1213 return 0;
88213476
LP
1214}
1215
03cfe0d5
LP
1216static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1217 assert(p);
1218
0de7acce 1219 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1220 return 0;
1221
1222 if (uid == UID_INVALID && gid == GID_INVALID)
1223 return 0;
1224
1225 if (uid != UID_INVALID) {
1226 uid += arg_uid_shift;
1227
1228 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1229 return -EOVERFLOW;
1230 }
1231
1232 if (gid != GID_INVALID) {
1233 gid += (gid_t) arg_uid_shift;
1234
1235 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1236 return -EOVERFLOW;
1237 }
1238
1239 if (lchown(p, uid, gid) < 0)
1240 return -errno;
b12afc8c
LP
1241
1242 return 0;
1243}
1244
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it was newly created, hands
 * it to userns_lchown() with the given uid/gid.
 *
 * A directory that already exists is left untouched (no ownership change) and reported as success.
 * Returns 0 on success, negative errno on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1257
e58a1277 1258static int setup_timezone(const char *dest) {
03cfe0d5
LP
1259 _cleanup_free_ char *p = NULL, *q = NULL;
1260 const char *where, *check, *what;
d4036145
LP
1261 char *z, *y;
1262 int r;
f8440af5 1263
e58a1277
LP
1264 assert(dest);
1265
1266 /* Fix the timezone, if possible */
d4036145
LP
1267 r = readlink_malloc("/etc/localtime", &p);
1268 if (r < 0) {
0b493a02
MP
1269 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1270 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1271 * with a symbolic link to a time zone data file.
0b493a02
MP
1272 *
1273 * Example:
21dc0227 1274 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1275 */
d4036145
LP
1276 return 0;
1277 }
1278
1279 z = path_startswith(p, "../usr/share/zoneinfo/");
1280 if (!z)
1281 z = path_startswith(p, "/usr/share/zoneinfo/");
1282 if (!z) {
1283 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1284 return 0;
1285 }
1286
03cfe0d5 1287 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1288 r = readlink_malloc(where, &q);
1289 if (r >= 0) {
1290 y = path_startswith(q, "../usr/share/zoneinfo/");
1291 if (!y)
1292 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1293
d4036145
LP
1294 /* Already pointing to the right place? Then do nothing .. */
1295 if (y && streq(y, z))
1296 return 0;
1297 }
1298
03cfe0d5 1299 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1300 check = prefix_roota(dest, check);
03cfe0d5 1301 if (laccess(check, F_OK) < 0) {
d4036145
LP
1302 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1303 return 0;
1304 }
68fb0892 1305
8ccf7e9e
LP
1306 if (unlink(where) < 0 && errno != ENOENT) {
1307 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1308 errno,
1309 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1310 return 0;
1311 }
4d9f07b4 1312
03cfe0d5 1313 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1314 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1315 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1316 errno,
1317 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1318 return 0;
1319 }
e58a1277 1320
03cfe0d5
LP
1321 r = userns_lchown(where, 0, 0);
1322 if (r < 0)
1323 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1324
e58a1277 1325 return 0;
88213476
LP
1326}
1327
b053cd5f
LP
1328static int resolved_running(void) {
1329 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1330 int r;
1331
1332 /* Check if resolved is running */
1333
1334 r = sd_bus_open_system(&bus);
1335 if (r < 0)
1336 return r;
1337
1338 return bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1339}
1340
2547bb41 1341static int setup_resolv_conf(const char *dest) {
87447ae4
LP
1342 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1343 const char *where;
1344 int r, found;
2547bb41
LP
1345
1346 assert(dest);
1347
1348 if (arg_private_network)
1349 return 0;
1350
87447ae4
LP
1351 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1352 if (r < 0) {
1353 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1354 return 0;
1355 }
1356
1357 where = strjoina(etc, "/resolv.conf");
1358 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1359 if (found < 0) {
1360 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1361 return 0;
1362 }
79d80fc1 1363
b053cd5f
LP
1364 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
1365 resolved_running() > 0) {
87447ae4 1366
3539724c
LP
1367 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1368 * container, so that the container can use the host's resolver. Given that network namespacing is
1369 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1370 * advantage that the container will be able to follow the host's DNS server configuration changes
1371 * transparently. */
1372
87447ae4
LP
1373 if (found == 0) /* missing? */
1374 (void) touch(resolved);
5367354d 1375
87447ae4 1376 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
60e76d48 1377 if (r >= 0)
87447ae4 1378 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1379 }
1380
1381 /* If that didn't work, let's copy the file */
1c876927 1382 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1383 if (r < 0) {
3539724c
LP
1384 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1385 * resolved or something similar runs inside and the symlink points there.
68a313c5 1386 *
3539724c 1387 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1388 */
87447ae4 1389 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1390 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1391 return 0;
1392 }
2547bb41 1393
03cfe0d5
LP
1394 r = userns_lchown(where, 0, 0);
1395 if (r < 0)
3539724c 1396 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1397
2547bb41
LP
1398 return 0;
1399}
1400
04bc4a3f 1401static int setup_boot_id(const char *dest) {
3bbaff3e 1402 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1403 const char *from, *to;
04bc4a3f
LP
1404 int r;
1405
04bc4a3f
LP
1406 /* Generate a new randomized boot ID, so that each boot-up of
1407 * the container gets a new one */
1408
03cfe0d5
LP
1409 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1410 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1411
1412 r = sd_id128_randomize(&rnd);
f647962d
MS
1413 if (r < 0)
1414 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1415
15b1248a 1416 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1417 if (r < 0)
1418 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1419
60e76d48
ZJS
1420 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1421 if (r >= 0)
1422 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1423 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1424
3bbaff3e 1425 (void) unlink(from);
04bc4a3f
LP
1426 return r;
1427}
1428
e58a1277 1429static int copy_devnodes(const char *dest) {
88213476
LP
1430
1431 static const char devnodes[] =
1432 "null\0"
1433 "zero\0"
1434 "full\0"
1435 "random\0"
1436 "urandom\0"
85614d66
TG
1437 "tty\0"
1438 "net/tun\0";
88213476
LP
1439
1440 const char *d;
e58a1277 1441 int r = 0;
7fd1b19b 1442 _cleanup_umask_ mode_t u;
a258bf26
LP
1443
1444 assert(dest);
124640f1
LP
1445
1446 u = umask(0000);
88213476 1447
03cfe0d5
LP
1448 /* Create /dev/net, so that we can create /dev/net/tun in it */
1449 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1450 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1451
88213476 1452 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1453 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1454 struct stat st;
88213476 1455
7f112f50 1456 from = strappend("/dev/", d);
03cfe0d5 1457 to = prefix_root(dest, from);
88213476
LP
1458
1459 if (stat(from, &st) < 0) {
1460
4a62c710
MS
1461 if (errno != ENOENT)
1462 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1463
a258bf26 1464 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1465
03cfe0d5 1466 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1467 return -EIO;
a258bf26 1468
85614d66 1469 } else {
81f5049b 1470 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1471 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1472 if (errno == EEXIST)
8dbf71ec 1473 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1474 if (errno != EPERM)
1475 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1476
1477 /* Some systems abusively restrict mknod but
1478 * allow bind mounts. */
1479 r = touch(to);
1480 if (r < 0)
1481 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1482 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1483 if (r < 0)
1484 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1485 }
6278cf60 1486
03cfe0d5
LP
1487 r = userns_lchown(to, 0, 0);
1488 if (r < 0)
1489 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1490 }
88213476
LP
1491 }
1492
e58a1277
LP
1493 return r;
1494}
88213476 1495
03cfe0d5
LP
1496static int setup_pts(const char *dest) {
1497 _cleanup_free_ char *options = NULL;
1498 const char *p;
709f6e46 1499 int r;
03cfe0d5
LP
1500
1501#ifdef HAVE_SELINUX
1502 if (arg_selinux_apifs_context)
1503 (void) asprintf(&options,
3dce8915 1504 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1505 arg_uid_shift + TTY_GID,
1506 arg_selinux_apifs_context);
1507 else
1508#endif
1509 (void) asprintf(&options,
3dce8915 1510 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1511 arg_uid_shift + TTY_GID);
f2d88580 1512
03cfe0d5 1513 if (!options)
f2d88580
LP
1514 return log_oom();
1515
03cfe0d5 1516 /* Mount /dev/pts itself */
cc9fce65 1517 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1518 if (mkdir(p, 0755) < 0)
1519 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1520 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1521 if (r < 0)
1522 return r;
709f6e46
MS
1523 r = userns_lchown(p, 0, 0);
1524 if (r < 0)
1525 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1526
1527 /* Create /dev/ptmx symlink */
1528 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1529 if (symlink("pts/ptmx", p) < 0)
1530 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1531 r = userns_lchown(p, 0, 0);
1532 if (r < 0)
1533 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1534
03cfe0d5
LP
1535 /* And fix /dev/pts/ptmx ownership */
1536 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1537 r = userns_lchown(p, 0, 0);
1538 if (r < 0)
1539 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1540
f2d88580
LP
1541 return 0;
1542}
1543
e58a1277 1544static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1545 _cleanup_umask_ mode_t u;
1546 const char *to;
e58a1277 1547 int r;
e58a1277
LP
1548
1549 assert(dest);
1550 assert(console);
1551
1552 u = umask(0000);
1553
03cfe0d5 1554 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1555 if (r < 0)
1556 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1557
a258bf26
LP
1558 /* We need to bind mount the right tty to /dev/console since
1559 * ptys can only exist on pts file systems. To have something
81f5049b 1560 * to bind mount things on we create a empty regular file. */
a258bf26 1561
03cfe0d5 1562 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1563 r = touch(to);
1564 if (r < 0)
1565 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1566
60e76d48 1567 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1568}
1569
1570static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1571 const char *from, *to;
7fd1b19b 1572 _cleanup_umask_ mode_t u;
d9603714 1573 int fd, r;
e58a1277 1574
e58a1277 1575 assert(kmsg_socket >= 0);
a258bf26 1576
e58a1277 1577 u = umask(0000);
a258bf26 1578
03cfe0d5 1579 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1580 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1581 * on the reading side behave very similar to /proc/kmsg,
1582 * their writing side behaves differently from /dev/kmsg in
1583 * that writing blocks when nothing is reading. In order to
1584 * avoid any problems with containers deadlocking due to this
1585 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1586 from = prefix_roota(dest, "/run/kmsg");
1587 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1588
4a62c710 1589 if (mkfifo(from, 0600) < 0)
03cfe0d5 1590 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1591 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1592 if (r < 0)
1593 return r;
e58a1277
LP
1594
1595 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1596 if (fd < 0)
1597 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1598
e58a1277
LP
1599 /* Store away the fd in the socket, so that it stays open as
1600 * long as we run the child */
3ee897d6 1601 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1602 safe_close(fd);
e58a1277 1603
d9603714
DH
1604 if (r < 0)
1605 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1606
03cfe0d5
LP
1607 /* And now make the FIFO unavailable as /run/kmsg... */
1608 (void) unlink(from);
1609
25ea79fe 1610 return 0;
88213476
LP
1611}
1612
1c4baffc 1613static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1614 union in_addr_union *exposed = userdata;
1615
1616 assert(rtnl);
1617 assert(m);
1618 assert(exposed);
1619
7a8f6325 1620 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1621 return 0;
1622}
1623
3a74cea5 1624static int setup_hostname(void) {
3a74cea5 1625
0c582db0 1626 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1627 return 0;
1628
605f81a8 1629 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1630 return -errno;
3a74cea5 1631
7027ff61 1632 return 0;
3a74cea5
LP
1633}
1634
57fb9fb5 1635static int setup_journal(const char *directory) {
e01ff70a 1636 sd_id128_t this_id;
0f5e1382 1637 _cleanup_free_ char *d = NULL;
e01ff70a 1638 const char *p, *q;
8054d749 1639 bool try;
e01ff70a 1640 char id[33];
57fb9fb5
LP
1641 int r;
1642
df9a75e4
LP
1643 /* Don't link journals in ephemeral mode */
1644 if (arg_ephemeral)
1645 return 0;
1646
8054d749
LP
1647 if (arg_link_journal == LINK_NO)
1648 return 0;
1649
1650 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1651
4d680aee 1652 r = sd_id128_get_machine(&this_id);
f647962d
MS
1653 if (r < 0)
1654 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1655
e01ff70a 1656 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1657 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1658 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1659 if (try)
4d680aee 1660 return 0;
df9a75e4 1661 return -EEXIST;
4d680aee
ZJS
1662 }
1663
03cfe0d5
LP
1664 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1665 if (r < 0)
1666 return log_error_errno(r, "Failed to create /var: %m");
1667
1668 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1669 if (r < 0)
1670 return log_error_errno(r, "Failed to create /var/log: %m");
1671
1672 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1673 if (r < 0)
1674 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1675
e01ff70a
MS
1676 (void) sd_id128_to_string(arg_uuid, id);
1677
03cfe0d5
LP
1678 p = strjoina("/var/log/journal/", id);
1679 q = prefix_roota(directory, p);
27407a01 1680
e1873695 1681 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1682 if (try)
1683 return 0;
27407a01 1684
8054d749
LP
1685 log_error("%s: already a mount point, refusing to use for journal", p);
1686 return -EEXIST;
57fb9fb5
LP
1687 }
1688
e1873695 1689 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1690 if (try)
1691 return 0;
57fb9fb5 1692
8054d749
LP
1693 log_error("%s: already a mount point, refusing to use for journal", q);
1694 return -EEXIST;
57fb9fb5
LP
1695 }
1696
1697 r = readlink_and_make_absolute(p, &d);
1698 if (r >= 0) {
1699 if ((arg_link_journal == LINK_GUEST ||
1700 arg_link_journal == LINK_AUTO) &&
1701 path_equal(d, q)) {
1702
03cfe0d5 1703 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1704 if (r < 0)
709f6e46 1705 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1706 return 0;
57fb9fb5
LP
1707 }
1708
4a62c710
MS
1709 if (unlink(p) < 0)
1710 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1711 } else if (r == -EINVAL) {
1712
1713 if (arg_link_journal == LINK_GUEST &&
1714 rmdir(p) < 0) {
1715
27407a01
ZJS
1716 if (errno == ENOTDIR) {
1717 log_error("%s already exists and is neither a symlink nor a directory", p);
1718 return r;
4314d33f
MS
1719 } else
1720 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1721 }
4314d33f
MS
1722 } else if (r != -ENOENT)
1723 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1724
1725 if (arg_link_journal == LINK_GUEST) {
1726
1727 if (symlink(q, p) < 0) {
8054d749 1728 if (try) {
56f64d95 1729 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1730 return 0;
4314d33f
MS
1731 } else
1732 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1733 }
1734
03cfe0d5 1735 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1736 if (r < 0)
709f6e46 1737 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1738 return 0;
57fb9fb5
LP
1739 }
1740
1741 if (arg_link_journal == LINK_HOST) {
ccddd104 1742 /* don't create parents here — if the host doesn't have
574edc90 1743 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1744
1745 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1746 if (try) {
56f64d95 1747 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1748 return 0;
4314d33f
MS
1749 } else
1750 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1751 }
1752
27407a01
ZJS
1753 } else if (access(p, F_OK) < 0)
1754 return 0;
57fb9fb5 1755
cdb2b9d0
LP
1756 if (dir_is_empty(q) == 0)
1757 log_warning("%s is not empty, proceeding anyway.", q);
1758
03cfe0d5 1759 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1760 if (r < 0)
1761 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1762
60e76d48
ZJS
1763 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1764 if (r < 0)
4a62c710 1765 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1766
27407a01 1767 return 0;
57fb9fb5
LP
1768}
1769
88213476 1770static int drop_capabilities(void) {
520e0d54 1771 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1772}
1773
db999e0f
LP
1774static int reset_audit_loginuid(void) {
1775 _cleanup_free_ char *p = NULL;
1776 int r;
1777
0c582db0 1778 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1779 return 0;
1780
1781 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1782 if (r == -ENOENT)
db999e0f 1783 return 0;
f647962d
MS
1784 if (r < 0)
1785 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1786
1787 /* Already reset? */
1788 if (streq(p, "4294967295"))
1789 return 0;
1790
ad118bda 1791 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1792 if (r < 0) {
10a87006
LP
1793 log_error_errno(r,
1794 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1795 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1796 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1797 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1798 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1799
db999e0f 1800 sleep(5);
77b6e194 1801 }
db999e0f
LP
1802
1803 return 0;
77b6e194
LP
1804}
1805
24fb1112 1806
785890ac
LP
1807static int setup_propagate(const char *root) {
1808 const char *p, *q;
709f6e46 1809 int r;
785890ac
LP
1810
1811 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1812 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1813 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1814 (void) mkdir_p(p, 0600);
1815
709f6e46
MS
1816 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1817 if (r < 0)
1818 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1819
709f6e46
MS
1820 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1821 if (r < 0)
1822 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1823
709f6e46
MS
1824 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1825 if (r < 0)
1826 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1827
03cfe0d5 1828 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1829 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1830 if (r < 0)
1831 return r;
785890ac 1832
60e76d48
ZJS
1833 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1834 if (r < 0)
1835 return r;
785890ac 1836
19caffac
AC
1837 /* machined will MS_MOVE into that directory, and that's only
1838 * supported for non-shared mounts. */
60e76d48 1839 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1840}
1841
317feb4d 1842static int setup_machine_id(const char *directory) {
691675ba
LP
1843 const char *etc_machine_id;
1844 sd_id128_t id;
3bbaff3e 1845 int r;
e01ff70a 1846
317feb4d
LP
1847 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1848 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1849 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1850 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1851 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1852 * container behaves nicely). */
1853
e01ff70a
MS
1854 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1855
691675ba 1856 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
1857 if (r < 0) {
1858 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1859 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 1860
317feb4d
LP
1861 if (sd_id128_is_null(arg_uuid)) {
1862 r = sd_id128_randomize(&arg_uuid);
1863 if (r < 0)
1864 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1865 }
1866 } else {
1867 if (sd_id128_is_null(id)) {
1868 log_error("Machine ID in container image is zero, refusing.");
1869 return -EINVAL;
1870 }
e01ff70a 1871
317feb4d
LP
1872 arg_uuid = id;
1873 }
691675ba 1874
e01ff70a
MS
1875 return 0;
1876}
1877
7336138e
LP
1878static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1879 int r;
1880
1881 assert(directory);
1882
0de7acce 1883 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
1884 return 0;
1885
1886 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1887 if (r == -EOPNOTSUPP)
1888 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1889 if (r == -EBADE)
1890 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1891 if (r < 0)
1892 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1893 if (r == 0)
1894 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1895 else
1896 log_debug("Patched directory tree to match UID/GID range.");
1897
1898 return r;
1899}
1900
113cea80 1901/*
6d416b9c
LS
1902 * Return values:
1903 * < 0 : wait_for_terminate() failed to get the state of the
1904 * container, the container was terminated by a signal, or
1905 * failed for an unknown reason. No change is made to the
1906 * container argument.
1907 * > 0 : The program executed in the container terminated with an
1908 * error. The exit code of the program executed in the
919699ec
LP
1909 * container is returned. The container argument has been set
1910 * to CONTAINER_TERMINATED.
6d416b9c
LS
1911 * 0 : The container is being rebooted, has been shut down or exited
1912 * successfully. The container argument has been set to either
1913 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 1914 *
6d416b9c
LS
1915 * That is, success is indicated by a return value of zero, and an
1916 * error is indicated by a non-zero value.
113cea80
DH
1917 */
1918static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 1919 siginfo_t status;
919699ec 1920 int r;
113cea80
DH
1921
1922 r = wait_for_terminate(pid, &status);
f647962d
MS
1923 if (r < 0)
1924 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
1925
1926 switch (status.si_code) {
fddbb89c 1927
113cea80 1928 case CLD_EXITED:
b5a2179b 1929 if (status.si_status == 0)
919699ec 1930 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 1931 else
919699ec 1932 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 1933
919699ec
LP
1934 *container = CONTAINER_TERMINATED;
1935 return status.si_status;
113cea80
DH
1936
1937 case CLD_KILLED:
1938 if (status.si_status == SIGINT) {
919699ec 1939 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 1940 *container = CONTAINER_TERMINATED;
919699ec
LP
1941 return 0;
1942
113cea80 1943 } else if (status.si_status == SIGHUP) {
919699ec 1944 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 1945 *container = CONTAINER_REBOOTED;
919699ec 1946 return 0;
113cea80 1947 }
919699ec 1948
ec251fe7 1949 /* fall through */
113cea80
DH
1950
1951 case CLD_DUMPED:
fddbb89c 1952 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 1953 return -EIO;
113cea80
DH
1954
1955 default:
fddbb89c 1956 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 1957 return -EIO;
113cea80 1958 }
113cea80
DH
1959}
1960
023fb90b
LP
1961static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1962 pid_t pid;
1963
4a0b58c4 1964 pid = PTR_TO_PID(userdata);
023fb90b 1965 if (pid > 0) {
c6c8f6e2 1966 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
1967 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1968 sd_event_source_set_userdata(s, NULL);
1969 return 0;
1970 }
1971 }
1972
1973 sd_event_exit(sd_event_source_get_event(s), 0);
1974 return 0;
1975}
1976
6916b164
AU
1977static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
1978 for (;;) {
1979 siginfo_t si = {};
1980 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
1981 return log_error_errno(errno, "Failed to waitid(): %m");
1982 if (si.si_pid == 0) /* No pending children. */
1983 break;
1984 if (si.si_pid == PTR_TO_PID(userdata)) {
1985 /* The main process we care for has exited. Return from
1986 * signal handler but leave the zombie. */
1987 sd_event_exit(sd_event_source_get_event(s), 0);
1988 break;
1989 }
1990 /* Reap all other children. */
1991 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
1992 }
1993
1994 return 0;
1995}
1996
ec16945e 1997static int determine_names(void) {
1b9cebf6 1998 int r;
ec16945e 1999
c1521918
LP
2000 if (arg_template && !arg_directory && arg_machine) {
2001
2002 /* If --template= was specified then we should not
2003 * search for a machine, but instead create a new one
2004 * in /var/lib/machine. */
2005
605405c6 2006 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2007 if (!arg_directory)
2008 return log_oom();
2009 }
2010
ec16945e 2011 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2012 if (arg_machine) {
2013 _cleanup_(image_unrefp) Image *i = NULL;
2014
2015 r = image_find(arg_machine, &i);
2016 if (r < 0)
2017 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2018 if (r == 0) {
1b9cebf6
LP
2019 log_error("No image for machine '%s': %m", arg_machine);
2020 return -ENOENT;
2021 }
2022
aceac2f0 2023 if (i->type == IMAGE_RAW)
0f03c2a4 2024 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2025 else
0f03c2a4 2026 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2027 if (r < 0)
0f3be6ca 2028 return log_oom();
1b9cebf6 2029
aee327b8
LP
2030 if (!arg_ephemeral)
2031 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2032 } else
ec16945e
LP
2033 arg_directory = get_current_dir_name();
2034
0f3be6ca 2035 if (!arg_directory && !arg_image) {
1b9cebf6 2036 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2037 return -EINVAL;
2038 }
2039 }
2040
2041 if (!arg_machine) {
4827ab48 2042
b9ba4dab
LP
2043 if (arg_directory && path_equal(arg_directory, "/"))
2044 arg_machine = gethostname_malloc();
4827ab48
LP
2045 else {
2046 if (arg_image) {
2047 char *e;
2048
2049 arg_machine = strdup(basename(arg_image));
2050
2051 /* Truncate suffix if there is one */
2052 e = endswith(arg_machine, ".raw");
2053 if (e)
2054 *e = 0;
2055 } else
2056 arg_machine = strdup(basename(arg_directory));
2057 }
ec16945e
LP
2058 if (!arg_machine)
2059 return log_oom();
2060
ae691c1d 2061 hostname_cleanup(arg_machine);
ec16945e
LP
2062 if (!machine_name_is_valid(arg_machine)) {
2063 log_error("Failed to determine machine name automatically, please use -M.");
2064 return -EINVAL;
2065 }
b9ba4dab
LP
2066
2067 if (arg_ephemeral) {
2068 char *b;
2069
2070 /* Add a random suffix when this is an
2071 * ephemeral machine, so that we can run many
2072 * instances at once without manually having
2073 * to specify -M each time. */
2074
2075 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2076 return log_oom();
2077
2078 free(arg_machine);
2079 arg_machine = b;
2080 }
ec16945e
LP
2081 }
2082
2083 return 0;
2084}
2085
/* Replaces the path stored in *p by the result of chase_symlinks() on it (called with a NULL root and
 * the given 'flags'). A NULL *p is left alone. On failure *p is not modified.
 *
 * Returns 0 on success (or if there was nothing to do), a negative errno-style value otherwise. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Swap the old string for the resolved one. */
        free(*p);
        *p = resolved;

        return 0;
}
2104
03cfe0d5 2105static int determine_uid_shift(const char *directory) {
6dac160c
LP
2106 int r;
2107
0de7acce 2108 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2109 arg_uid_shift = 0;
6dac160c 2110 return 0;
03cfe0d5 2111 }
6dac160c
LP
2112
2113 if (arg_uid_shift == UID_INVALID) {
2114 struct stat st;
2115
03cfe0d5 2116 r = stat(directory, &st);
6dac160c 2117 if (r < 0)
03cfe0d5 2118 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2119
2120 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2121
2122 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2123 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2124 return -EINVAL;
2125 }
2126
2127 arg_uid_range = UINT32_C(0x10000);
2128 }
2129
2130 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2131 log_error("UID base too high for UID range.");
2132 return -EINVAL;
2133 }
2134
6dac160c
LP
2135 return 0;
2136}
2137
03cfe0d5
LP
2138static int inner_child(
2139 Barrier *barrier,
2140 const char *directory,
2141 bool secondary,
2142 int kmsg_socket,
2143 int rtnl_socket,
f757855e 2144 FDSet *fds) {
69c79d3c 2145
03cfe0d5 2146 _cleanup_free_ char *home = NULL;
e01ff70a 2147 char as_uuid[37];
6aadfa4c 2148 unsigned n_env = 1;
03cfe0d5
LP
2149 const char *envp[] = {
2150 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2151 NULL, /* container */
03cfe0d5
LP
2152 NULL, /* TERM */
2153 NULL, /* HOME */
2154 NULL, /* USER */
2155 NULL, /* LOGNAME */
2156 NULL, /* container_uuid */
2157 NULL, /* LISTEN_FDS */
2158 NULL, /* LISTEN_PID */
9c1e04d0 2159 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2160 NULL
2161 };
1a68e1e5 2162 const char *exec_target;
88213476 2163
2371271c 2164 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2165 int r;
88213476 2166
03cfe0d5
LP
2167 assert(barrier);
2168 assert(directory);
2169 assert(kmsg_socket >= 0);
88213476 2170
efdb0237
LP
2171 cg_unified_flush();
2172
0de7acce 2173 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2174 /* Tell the parent, that it now can write the UID map. */
2175 (void) barrier_place(barrier); /* #1 */
7027ff61 2176
03cfe0d5
LP
2177 /* Wait until the parent wrote the UID map */
2178 if (!barrier_place_and_sync(barrier)) { /* #2 */
2179 log_error("Parent died too early");
2180 return -ESRCH;
2181 }
88213476
LP
2182 }
2183
6d66bd3b
EV
2184 r = reset_uid_gid();
2185 if (r < 0)
2186 return log_error_errno(r, "Couldn't become new root: %m");
2187
0de7acce 2188 r = mount_all(NULL,
4f086aab 2189 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2190 arg_uid_shift,
2191 arg_uid_range,
2192 arg_selinux_apifs_context);
2193
03cfe0d5
LP
2194 if (r < 0)
2195 return r;
2196
4f086aab 2197 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2198 if (r < 0)
2199 return r;
2200
03cfe0d5
LP
2201 /* Wait until we are cgroup-ified, so that we
2202 * can mount the right cgroup path writable */
2203 if (!barrier_place_and_sync(barrier)) { /* #3 */
2204 log_error("Parent died too early");
2205 return -ESRCH;
88213476
LP
2206 }
2207
5a8ff0e6 2208 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2209 r = unshare(CLONE_NEWCGROUP);
2210 if (r < 0)
2211 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2212 r = mount_cgroups(
2213 "",
2214 arg_unified_cgroup_hierarchy,
2215 arg_userns_mode != USER_NAMESPACE_NO,
2216 arg_uid_shift,
2217 arg_uid_range,
5a8ff0e6 2218 arg_selinux_apifs_context,
ada54120 2219 true);
0996ef00
CB
2220 if (r < 0)
2221 return r;
2222 } else {
2223 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2224 if (r < 0)
2225 return r;
2226 }
ec16945e 2227
03cfe0d5
LP
2228 r = setup_boot_id(NULL);
2229 if (r < 0)
2230 return r;
ec16945e 2231
03cfe0d5
LP
2232 r = setup_kmsg(NULL, kmsg_socket);
2233 if (r < 0)
2234 return r;
2235 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2236
03cfe0d5 2237 umask(0022);
30535c16 2238
03cfe0d5
LP
2239 if (setsid() < 0)
2240 return log_error_errno(errno, "setsid() failed: %m");
2241
2242 if (arg_private_network)
2243 loopback_setup();
2244
7a8f6325
LP
2245 if (arg_expose_ports) {
2246 r = expose_port_send_rtnl(rtnl_socket);
2247 if (r < 0)
2248 return r;
2249 rtnl_socket = safe_close(rtnl_socket);
2250 }
03cfe0d5 2251
709f6e46
MS
2252 r = drop_capabilities();
2253 if (r < 0)
2254 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2255
2256 setup_hostname();
2257
050f7277 2258 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2259 if (personality(arg_personality) < 0)
2260 return log_error_errno(errno, "personality() failed: %m");
2261 } else if (secondary) {
2262 if (personality(PER_LINUX32) < 0)
2263 return log_error_errno(errno, "personality() failed: %m");
2264 }
2265
2266#ifdef HAVE_SELINUX
2267 if (arg_selinux_context)
2ed96880 2268 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2269 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2270#endif
2271
ee645080 2272 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2273 if (r < 0)
2274 return r;
2275
6aadfa4c
ILG
2276 /* LXC sets container=lxc, so follow the scheme here */
2277 envp[n_env++] = strjoina("container=", arg_container_service_name);
2278
03cfe0d5
LP
2279 envp[n_env] = strv_find_prefix(environ, "TERM=");
2280 if (envp[n_env])
313cefa1 2281 n_env++;
03cfe0d5
LP
2282
2283 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2284 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2285 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2286 return log_oom();
2287
3bbaff3e 2288 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2289
691675ba 2290 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2291 return log_oom();
03cfe0d5
LP
2292
2293 if (fdset_size(fds) > 0) {
2294 r = fdset_cloexec(fds, false);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2297
2298 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2299 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2300 return log_oom();
2301 }
9c1e04d0
AP
2302 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2303 return log_oom();
03cfe0d5 2304
2371271c
TG
2305 env_use = strv_env_merge(2, envp, arg_setenv);
2306 if (!env_use)
2307 return log_oom();
03cfe0d5
LP
2308
2309 /* Let the parent know that we are ready and
2310 * wait until the parent is ready with the
2311 * setup, too... */
2312 if (!barrier_place_and_sync(barrier)) { /* #4 */
2313 log_error("Parent died too early");
2314 return -ESRCH;
2315 }
2316
5f932eb9
LP
2317 if (arg_chdir)
2318 if (chdir(arg_chdir) < 0)
2319 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2320
7732f92b 2321 if (arg_start_mode == START_PID2) {
75bf701f 2322 r = stub_pid1(arg_uuid);
7732f92b
LP
2323 if (r < 0)
2324 return r;
2325 }
2326
03cfe0d5
LP
2327 /* Now, explicitly close the log, so that we
2328 * then can close all remaining fds. Closing
2329 * the log explicitly first has the benefit
2330 * that the logging subsystem knows about it,
2331 * and is thus ready to be reopened should we
2332 * need it again. Note that the other fds
2333 * closed here are at least the locking and
2334 * barrier fds. */
2335 log_close();
2336 (void) fdset_close_others(fds);
2337
7732f92b 2338 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2339 char **a;
2340 size_t m;
2341
2342 /* Automatically search for the init system */
2343
75f32f04
ZJS
2344 m = strv_length(arg_parameters);
2345 a = newa(char*, m + 2);
2346 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2347 a[1 + m] = NULL;
03cfe0d5 2348
ced58da7 2349 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2350 execve(a[0], a, env_use);
2351
ced58da7 2352 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2353 execve(a[0], a, env_use);
2354
ced58da7 2355 a[0] = (char*) "/sbin/init";
03cfe0d5 2356 execve(a[0], a, env_use);
ced58da7
LP
2357
2358 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5
PW
2359 } else if (!strv_isempty(arg_parameters)) {
2360 exec_target = arg_parameters[0];
f757855e 2361 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2362 } else {
5f932eb9 2363 if (!arg_chdir)
d929b0f9
ZJS
2364 /* If we cannot change the directory, we'll end up in /, that is expected. */
2365 (void) chdir(home ?: "/root");
5f932eb9 2366
03cfe0d5
LP
2367 execle("/bin/bash", "-bash", NULL, env_use);
2368 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2369
2370 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2371 }
2372
35607a8d 2373 r = -errno;
03cfe0d5 2374 (void) log_open();
1a68e1e5 2375 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2376}
2377
9c1e04d0
AP
2378static int setup_sd_notify_child(void) {
2379 static const int one = 1;
2380 int fd = -1;
2381 union sockaddr_union sa = {
2382 .sa.sa_family = AF_UNIX,
2383 };
2384 int r;
2385
2386 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2387 if (fd < 0)
2388 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2389
2390 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2391 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2392
2393 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2394 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2395 if (r < 0) {
2396 safe_close(fd);
2397 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2398 }
2399
adc7d9f0
EV
2400 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2401 if (r < 0) {
2402 safe_close(fd);
2403 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2404 }
2405
9c1e04d0
AP
2406 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2407 if (r < 0) {
2408 safe_close(fd);
2409 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2410 }
2411
2412 return fd;
2413}
2414
03cfe0d5
LP
2415static int outer_child(
2416 Barrier *barrier,
2417 const char *directory,
2418 const char *console,
2d845785 2419 DissectedImage *dissected_image,
03cfe0d5
LP
2420 bool interactive,
2421 bool secondary,
2422 int pid_socket,
e01ff70a 2423 int uuid_socket,
9c1e04d0 2424 int notify_socket,
03cfe0d5
LP
2425 int kmsg_socket,
2426 int rtnl_socket,
825d5287 2427 int uid_shift_socket,
f757855e 2428 FDSet *fds) {
03cfe0d5
LP
2429
2430 pid_t pid;
2431 ssize_t l;
2432 int r;
9c1e04d0 2433 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2434
2435 assert(barrier);
2436 assert(directory);
2437 assert(console);
2438 assert(pid_socket >= 0);
e01ff70a 2439 assert(uuid_socket >= 0);
9c1e04d0 2440 assert(notify_socket >= 0);
03cfe0d5
LP
2441 assert(kmsg_socket >= 0);
2442
efdb0237
LP
2443 cg_unified_flush();
2444
03cfe0d5
LP
2445 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2446 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2447
2448 if (interactive) {
2449 close_nointr(STDIN_FILENO);
2450 close_nointr(STDOUT_FILENO);
2451 close_nointr(STDERR_FILENO);
2452
2453 r = open_terminal(console, O_RDWR);
2454 if (r != STDIN_FILENO) {
2455 if (r >= 0) {
2456 safe_close(r);
2457 r = -EINVAL;
2458 }
2459
2460 return log_error_errno(r, "Failed to open console: %m");
2461 }
2462
2463 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2464 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2465 return log_error_errno(errno, "Failed to duplicate console: %m");
2466 }
2467
2468 r = reset_audit_loginuid();
2469 if (r < 0)
2470 return r;
2471
2472 /* Mark everything as slave, so that we still
2473 * receive mounts from the real root, but don't
2474 * propagate mounts to the real root. */
60e76d48
ZJS
2475 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2476 if (r < 0)
2477 return r;
03cfe0d5 2478
2d845785 2479 if (dissected_image) {
18b5886e 2480 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2481 if (r < 0)
2482 return r;
2483 }
03cfe0d5 2484
391567f4
LP
2485 r = determine_uid_shift(directory);
2486 if (r < 0)
2487 return r;
2488
0fd9563f
ZJS
2489 r = detect_unified_cgroup_hierarchy(directory);
2490 if (r < 0)
2491 return r;
2492
0de7acce 2493 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2494 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2495 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2496 if (l < 0)
2497 return log_error_errno(errno, "Failed to send UID shift: %m");
2498 if (l != sizeof(arg_uid_shift)) {
2499 log_error("Short write while sending UID shift.");
2500 return -EIO;
2501 }
0e7ac751 2502
0de7acce 2503 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2504 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2505 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2506 * not it will pick a different one, and send it back to us. */
2507
2508 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2509 if (l < 0)
2510 return log_error_errno(errno, "Failed to recv UID shift: %m");
2511 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2512 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2513 return -EIO;
2514 }
2515 }
2516
2517 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2518 }
2519
03cfe0d5 2520 /* Turn directory into bind mount */
60e76d48
ZJS
2521 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2522 if (r < 0)
2523 return r;
03cfe0d5 2524
b53ede69
PW
2525 r = setup_pivot_root(
2526 directory,
2527 arg_pivot_root_new,
2528 arg_pivot_root_old);
2529 if (r < 0)
2530 return r;
2531
0de7acce
LP
2532 r = setup_volatile(
2533 directory,
2534 arg_volatile_mode,
2535 arg_userns_mode != USER_NAMESPACE_NO,
2536 arg_uid_shift,
2537 arg_uid_range,
2538 arg_selinux_context);
03cfe0d5
LP
2539 if (r < 0)
2540 return r;
2541
0de7acce
LP
2542 r = setup_volatile_state(
2543 directory,
2544 arg_volatile_mode,
2545 arg_userns_mode != USER_NAMESPACE_NO,
2546 arg_uid_shift,
2547 arg_uid_range,
2548 arg_selinux_context);
03cfe0d5
LP
2549 if (r < 0)
2550 return r;
2551
4ad14eff
LP
2552 /* Mark everything as shared so our mounts get propagated down. This is
2553 * required to make new bind mounts available in systemd services
2554 * inside the containter that create a new mount namespace.
2555 * See https://github.com/systemd/systemd/issues/3860
2556 * Further submounts (such as /dev) done after this will inherit the
2557 * shared propagation mode.*/
2558 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2559 if (r < 0)
2560 return r;
2561
2562 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2563 if (r < 0)
2564 return r;
2565
03cfe0d5
LP
2566 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2567 if (r < 0)
2568 return r;
2569
03cfe0d5 2570 if (arg_read_only) {
6b7c9f8b 2571 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2572 if (r < 0)
2573 return log_error_errno(r, "Failed to make tree read-only: %m");
2574 }
2575
0de7acce 2576 r = mount_all(directory,
4f086aab 2577 arg_mount_settings,
0de7acce
LP
2578 arg_uid_shift,
2579 arg_uid_range,
2580 arg_selinux_apifs_context);
03cfe0d5
LP
2581 if (r < 0)
2582 return r;
2583
07fa00f9
LP
2584 r = copy_devnodes(directory);
2585 if (r < 0)
03cfe0d5
LP
2586 return r;
2587
2588 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2589
07fa00f9
LP
2590 r = setup_pts(directory);
2591 if (r < 0)
03cfe0d5
LP
2592 return r;
2593
2594 r = setup_propagate(directory);
2595 if (r < 0)
2596 return r;
2597
2598 r = setup_dev_console(directory, console);
2599 if (r < 0)
2600 return r;
2601
520e0d54 2602 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
2603 if (r < 0)
2604 return r;
2605
2606 r = setup_timezone(directory);
2607 if (r < 0)
2608 return r;
2609
2610 r = setup_resolv_conf(directory);
2611 if (r < 0)
2612 return r;
2613
e01ff70a
MS
2614 r = setup_machine_id(directory);
2615 if (r < 0)
2616 return r;
2617
03cfe0d5
LP
2618 r = setup_journal(directory);
2619 if (r < 0)
2620 return r;
2621
0de7acce
LP
2622 r = mount_custom(
2623 directory,
2624 arg_custom_mounts,
2625 arg_n_custom_mounts,
2626 arg_userns_mode != USER_NAMESPACE_NO,
2627 arg_uid_shift,
2628 arg_uid_range,
2629 arg_selinux_apifs_context);
03cfe0d5
LP
2630 if (r < 0)
2631 return r;
2632
5a8ff0e6 2633 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2634 r = mount_cgroups(
2635 directory,
2636 arg_unified_cgroup_hierarchy,
2637 arg_userns_mode != USER_NAMESPACE_NO,
2638 arg_uid_shift,
2639 arg_uid_range,
5a8ff0e6 2640 arg_selinux_apifs_context,
ada54120 2641 false);
0996ef00
CB
2642 if (r < 0)
2643 return r;
2644 }
03cfe0d5
LP
2645
2646 r = mount_move_root(directory);
2647 if (r < 0)
2648 return log_error_errno(r, "Failed to move root directory: %m");
2649
9c1e04d0
AP
2650 fd = setup_sd_notify_child();
2651 if (fd < 0)
2652 return fd;
2653
03cfe0d5 2654 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2655 arg_clone_ns_flags |
03cfe0d5 2656 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2657 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2658 if (pid < 0)
2659 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2660 if (pid == 0) {
2661 pid_socket = safe_close(pid_socket);
e01ff70a 2662 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2663 notify_socket = safe_close(notify_socket);
825d5287 2664 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2665
2666 /* The inner child has all namespaces that are
2667 * requested, so that we all are owned by the user if
2668 * user namespaces are turned on. */
2669
f757855e 2670 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2671 if (r < 0)
2672 _exit(EXIT_FAILURE);
2673
2674 _exit(EXIT_SUCCESS);
2675 }
2676
2677 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2678 if (l < 0)
2679 return log_error_errno(errno, "Failed to send PID: %m");
2680 if (l != sizeof(pid)) {
2681 log_error("Short write while sending PID.");
2682 return -EIO;
2683 }
2684
e01ff70a
MS
2685 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2686 if (l < 0)
2687 return log_error_errno(errno, "Failed to send machine ID: %m");
2688 if (l != sizeof(arg_uuid)) {
2689 log_error("Short write while sending machine ID.");
2690 return -EIO;
2691 }
2692
9c1e04d0
AP
2693 l = send_one_fd(notify_socket, fd, 0);
2694 if (l < 0)
2695 return log_error_errno(errno, "Failed to send notify fd: %m");
2696
03cfe0d5 2697 pid_socket = safe_close(pid_socket);
e01ff70a 2698 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2699 notify_socket = safe_close(notify_socket);
327e26d6
KN
2700 kmsg_socket = safe_close(kmsg_socket);
2701 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2702
2703 return 0;
2704}
2705
0e7ac751
LP
2706static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2707 unsigned n_tries = 100;
2708 uid_t candidate;
2709 int r;
2710
2711 assert(shift);
2712 assert(ret_lock_file);
0de7acce 2713 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2714 assert(arg_uid_range == 0x10000U);
2715
2716 candidate = *shift;
2717
2718 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2719
2720 for (;;) {
2721 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2722 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2723
2724 if (--n_tries <= 0)
2725 return -EBUSY;
2726
2727 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2728 goto next;
2729 if ((candidate & UINT32_C(0xFFFF)) != 0)
2730 goto next;
2731
2732 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2733 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2734 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2735 goto next;
2736 if (r < 0)
2737 return r;
2738
2739 /* Make some superficial checks whether the range is currently known in the user database */
2740 if (getpwuid(candidate))
2741 goto next;
2742 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2743 goto next;
2744 if (getgrgid(candidate))
2745 goto next;
2746 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2747 goto next;
2748
2749 *ret_lock_file = lf;
2750 lf = (struct LockFile) LOCK_FILE_INIT;
2751 *shift = candidate;
2752 return 0;
2753
2754 next:
2755 random_bytes(&candidate, sizeof(candidate));
2756 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2757 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2758 }
2759}
2760
03cfe0d5
LP
2761static int setup_uid_map(pid_t pid) {
2762 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2763 int r;
2764
2765 assert(pid > 1);
2766
2767 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2768 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2769 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to write UID map: %m");
2772
2773 /* We always assign the same UID and GID ranges */
2774 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2775 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2776 if (r < 0)
2777 return log_error_errno(r, "Failed to write GID map: %m");
2778
2779 return 0;
2780}
2781
9c1e04d0 2782static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
2783 char buf[NOTIFY_BUFFER_MAX+1];
2784 char *p = NULL;
2785 struct iovec iovec = {
2786 .iov_base = buf,
2787 .iov_len = sizeof(buf)-1,
2788 };
2789 union {
2790 struct cmsghdr cmsghdr;
2791 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2792 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2793 } control = {};
2794 struct msghdr msghdr = {
2795 .msg_iov = &iovec,
2796 .msg_iovlen = 1,
2797 .msg_control = &control,
2798 .msg_controllen = sizeof(control),
2799 };
2800 struct cmsghdr *cmsg;
2801 struct ucred *ucred = NULL;
2802 ssize_t n;
2803 pid_t inner_child_pid;
2804 _cleanup_strv_free_ char **tags = NULL;
2805
2806 assert(userdata);
2807
2808 inner_child_pid = PTR_TO_PID(userdata);
2809
2810 if (revents != EPOLLIN) {
2811 log_warning("Got unexpected poll event for notify fd.");
2812 return 0;
2813 }
2814
2815 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2816 if (n < 0) {
2817 if (errno == EAGAIN || errno == EINTR)
2818 return 0;
2819
2820 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2821 }
2822 cmsg_close_all(&msghdr);
2823
2824 CMSG_FOREACH(cmsg, &msghdr) {
2825 if (cmsg->cmsg_level == SOL_SOCKET &&
2826 cmsg->cmsg_type == SCM_CREDENTIALS &&
2827 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2828
2829 ucred = (struct ucred*) CMSG_DATA(cmsg);
2830 }
2831 }
2832
2833 if (!ucred || ucred->pid != inner_child_pid) {
2834 log_warning("Received notify message without valid credentials. Ignoring.");
2835 return 0;
2836 }
2837
2838 if ((size_t) n >= sizeof(buf)) {
2839 log_warning("Received notify message exceeded maximum size. Ignoring.");
2840 return 0;
2841 }
2842
2843 buf[n] = 0;
2844 tags = strv_split(buf, "\n\r");
2845 if (!tags)
2846 return log_oom();
2847
2848 if (strv_find(tags, "READY=1"))
2849 sd_notifyf(false, "READY=1\n");
2850
2851 p = strv_find_startswith(tags, "STATUS=");
2852 if (p)
2853 sd_notifyf(false, "STATUS=Container running: %s", p);
2854
2855 return 0;
2856}
2857
5773024d 2858static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 2859 int r;
9c1e04d0 2860
5773024d 2861 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
2862 if (r < 0)
2863 return log_error_errno(r, "Failed to allocate notify event source: %m");
2864
5773024d 2865 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
2866
2867 return 0;
2868}
2869
f757855e
LP
2870static int load_settings(void) {
2871 _cleanup_(settings_freep) Settings *settings = NULL;
2872 _cleanup_fclose_ FILE *f = NULL;
2873 _cleanup_free_ char *p = NULL;
2874 const char *fn, *i;
2875 int r;
2876
2877 /* If all settings are masked, there's no point in looking for
2878 * the settings file */
2879 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2880 return 0;
2881
2882 fn = strjoina(arg_machine, ".nspawn");
2883
2884 /* We first look in the admin's directories in /etc and /run */
2885 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2886 _cleanup_free_ char *j = NULL;
2887
605405c6 2888 j = strjoin(i, "/", fn);
f757855e
LP
2889 if (!j)
2890 return log_oom();
2891
2892 f = fopen(j, "re");
2893 if (f) {
2894 p = j;
2895 j = NULL;
2896
b938cb90 2897 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2898 if (arg_settings_trusted < 0)
2899 arg_settings_trusted = true;
2900
2901 break;
2902 }
2903
2904 if (errno != ENOENT)
2905 return log_error_errno(errno, "Failed to open %s: %m", j);
2906 }
2907
2908 if (!f) {
2909 /* After that, let's look for a file next to the
2910 * actual image we shall boot. */
2911
2912 if (arg_image) {
2913 p = file_in_same_dir(arg_image, fn);
2914 if (!p)
2915 return log_oom();
2916 } else if (arg_directory) {
2917 p = file_in_same_dir(arg_directory, fn);
2918 if (!p)
2919 return log_oom();
2920 }
2921
2922 if (p) {
2923 f = fopen(p, "re");
2924 if (!f && errno != ENOENT)
2925 return log_error_errno(errno, "Failed to open %s: %m", p);
2926
b938cb90 2927 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2928 if (arg_settings_trusted < 0)
2929 arg_settings_trusted = false;
2930 }
2931 }
2932
2933 if (!f)
2934 return 0;
2935
2936 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2937
2938 r = settings_load(f, p, &settings);
2939 if (r < 0)
2940 return r;
2941
2942 /* Copy over bits from the settings, unless they have been
2943 * explicitly masked by command line switches. */
2944
7732f92b
LP
2945 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2946 settings->start_mode >= 0) {
2947 arg_start_mode = settings->start_mode;
f757855e
LP
2948
2949 strv_free(arg_parameters);
2950 arg_parameters = settings->parameters;
2951 settings->parameters = NULL;
2952 }
2953
b53ede69
PW
2954 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
2955 settings->pivot_root_new) {
2956 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
2957 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
2958 }
2959
5f932eb9
LP
2960 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2961 settings->working_directory) {
2962 free(arg_chdir);
2963 arg_chdir = settings->working_directory;
2964 settings->working_directory = NULL;
2965 }
2966
f757855e
LP
2967 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2968 settings->environment) {
2969 strv_free(arg_setenv);
2970 arg_setenv = settings->environment;
2971 settings->environment = NULL;
2972 }
2973
2974 if ((arg_settings_mask & SETTING_USER) == 0 &&
2975 settings->user) {
2976 free(arg_user);
2977 arg_user = settings->user;
2978 settings->user = NULL;
2979 }
2980
2981 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2982 uint64_t plus;
f757855e 2983
0e265674
LP
2984 plus = settings->capability;
2985 if (settings_private_network(settings))
2986 plus |= (1ULL << CAP_NET_ADMIN);
2987
2988 if (!arg_settings_trusted && plus != 0) {
2989 if (settings->capability != 0)
2990 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2991 } else
520e0d54 2992 arg_caps_retain |= plus;
f757855e 2993
520e0d54 2994 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
2995 }
2996
2997 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2998 settings->kill_signal > 0)
2999 arg_kill_signal = settings->kill_signal;
3000
3001 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3002 settings->personality != PERSONALITY_INVALID)
3003 arg_personality = settings->personality;
3004
3005 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3006 !sd_id128_is_null(settings->machine_id)) {
3007
3008 if (!arg_settings_trusted)
3009 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3010 else
3011 arg_uuid = settings->machine_id;
3012 }
3013
3014 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3015 settings->read_only >= 0)
3016 arg_read_only = settings->read_only;
3017
3018 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3019 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3020 arg_volatile_mode = settings->volatile_mode;
3021
3022 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3023 settings->n_custom_mounts > 0) {
3024
3025 if (!arg_settings_trusted)
3026 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3027 else {
3028 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3029 arg_custom_mounts = settings->custom_mounts;
3030 arg_n_custom_mounts = settings->n_custom_mounts;
3031
3032 settings->custom_mounts = NULL;
3033 settings->n_custom_mounts = 0;
3034 }
3035 }
3036
3037 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3038 (settings->private_network >= 0 ||
3039 settings->network_veth >= 0 ||
3040 settings->network_bridge ||
22b28dfd 3041 settings->network_zone ||
f757855e
LP
3042 settings->network_interfaces ||
3043 settings->network_macvlan ||
f6d6bad1
LP
3044 settings->network_ipvlan ||
3045 settings->network_veth_extra)) {
f757855e
LP
3046
3047 if (!arg_settings_trusted)
3048 log_warning("Ignoring network settings, file %s is not trusted.", p);
3049 else {
f6d6bad1 3050 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3051 arg_private_network = settings_private_network(settings);
3052
f757855e
LP
3053 strv_free(arg_network_interfaces);
3054 arg_network_interfaces = settings->network_interfaces;
3055 settings->network_interfaces = NULL;
3056
3057 strv_free(arg_network_macvlan);
3058 arg_network_macvlan = settings->network_macvlan;
3059 settings->network_macvlan = NULL;
3060
3061 strv_free(arg_network_ipvlan);
3062 arg_network_ipvlan = settings->network_ipvlan;
3063 settings->network_ipvlan = NULL;
3064
f6d6bad1
LP
3065 strv_free(arg_network_veth_extra);
3066 arg_network_veth_extra = settings->network_veth_extra;
3067 settings->network_veth_extra = NULL;
3068
f757855e
LP
3069 free(arg_network_bridge);
3070 arg_network_bridge = settings->network_bridge;
3071 settings->network_bridge = NULL;
22b28dfd
LP
3072
3073 free(arg_network_zone);
3074 arg_network_zone = settings->network_zone;
3075 settings->network_zone = NULL;
f757855e
LP
3076 }
3077 }
3078
3079 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3080 settings->expose_ports) {
3081
3082 if (!arg_settings_trusted)
3083 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3084 else {
3085 expose_port_free_all(arg_expose_ports);
3086 arg_expose_ports = settings->expose_ports;
3087 settings->expose_ports = NULL;
3088 }
3089 }
3090
0de7acce
LP
3091 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3092 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3093
3094 if (!arg_settings_trusted)
3095 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3096 else {
3097 arg_userns_mode = settings->userns_mode;
3098 arg_uid_shift = settings->uid_shift;
3099 arg_uid_range = settings->uid_range;
3100 arg_userns_chown = settings->userns_chown;
3101 }
3102 }
3103
9c1e04d0
AP
3104 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3105 arg_notify_ready = settings->notify_ready;
3106
f757855e
LP
3107 return 0;
3108}
3109
b0067625
ZJS
/* Runs one container instance from start-up until it terminates.
 *
 * The function forks the "outer child" with raw_clone(SIGCHLD|CLONE_NEWNS) and talks to it over a set
 * of socket pairs: it receives the UID shift (and possibly sends back a newly picked one), the PID of
 * the inner child (stored in *pid), the machine ID (stored in arg_uuid) and the notification socket.
 * It then does the parent-side setup (UID/GID map, network interfaces, machine registration, cgroups)
 * in lock-step with the child via the barrier steps #1 to #4, and finally runs an event loop with a
 * PTY forwarder until the loop exits, after which the container is killed and waited for.
 *
 * *veth_created is set to true once the veth links have been set up and back to false after they
 * have been removed again at the end of a reboot cycle.
 *
 * Returns a negative value on failure, 0 if we are done (*ret then carries the exit status to
 * report), and 1 if the container rebooted and the caller shall call run() again. */
3110static int run(int master,
3111 const char* console,
2d845785 3112 DissectedImage *dissected_image,
b0067625
ZJS
3113 bool interactive,
3114 bool secondary,
3115 FDSet *fds,
3116 char veth_name[IFNAMSIZ], bool *veth_created,
3117 union in_addr_union *exposed,
3118 pid_t *pid, int *ret) {
3119
3120 static const struct sigaction sa = {
3121 .sa_handler = nop_signal_handler,
e28c7cd0 3122 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3123 };
3124
3125 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3126 _cleanup_close_ int etc_passwd_lock = -1;
3127 _cleanup_close_pair_ int
3128 kmsg_socket_pair[2] = { -1, -1 },
3129 rtnl_socket_pair[2] = { -1, -1 },
3130 pid_socket_pair[2] = { -1, -1 },
3131 uuid_socket_pair[2] = { -1, -1 },
3132 notify_socket_pair[2] = { -1, -1 },
3133 uid_shift_socket_pair[2] = { -1, -1 };
3134 _cleanup_close_ int notify_socket= -1;
3135 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3136 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3137 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3138 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3139 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3140 ContainerStatus container_status = 0;
3141 char last_char = 0;
3142 int ifi = 0, r;
3143 ssize_t l;
3144 sigset_t mask_chld;
3145
3146 assert_se(sigemptyset(&mask_chld) == 0);
3147 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3148
3149 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3150 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3151 * check with getpwuid() if the specific user already exists. Note that /etc might be
3152 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3153 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3154 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3155 * really ours. */
3156
3157 etc_passwd_lock = take_etc_passwd_lock(NULL);
3158 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3159 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3160 }
3161
3162 r = barrier_create(&barrier);
3163 if (r < 0)
3164 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3165
/* One socket pair per kind of data exchanged with the child. Index 0 stays with the parent, index 1
 * is handed to outer_child() (see the safe_close() calls on either side of the clone below). */
3166 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3167 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3168
3169 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3170 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3171
3172 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3173 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3174
3175 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3176 return log_error_errno(errno, "Failed to create id socket pair: %m");
3177
3178 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3179 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3180
/* The UID shift socket pair is only created when user namespacing is in use */
3181 if (arg_userns_mode != USER_NAMESPACE_NO)
3182 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3183 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3184
3185 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3186 * parent's blocking calls and give it a chance to call wait() and terminate. */
3187 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3188 if (r < 0)
3189 return log_error_errno(errno, "Failed to change the signal mask: %m");
3190
3191 r = sigaction(SIGCHLD, &sa, NULL);
3192 if (r < 0)
3193 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3194
3195 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3196 if (*pid < 0)
3197 return log_error_errno(errno, "clone() failed%s: %m",
3198 errno == EINVAL ?
3199 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3200
3201 if (*pid == 0) {
3202 /* The outer child only has a file system namespace. */
3203 barrier_set_role(&barrier, BARRIER_CHILD);
3204
3205 master = safe_close(master);
3206
3207 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3208 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3209 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3210 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3211 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3212 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3213
3214 (void) reset_all_signal_handlers();
3215 (void) reset_signal_mask();
3216
3217 r = outer_child(&barrier,
3218 arg_directory,
3219 console,
2d845785 3220 dissected_image,
b0067625
ZJS
3221 interactive,
3222 secondary,
3223 pid_socket_pair[1],
3224 uuid_socket_pair[1],
3225 notify_socket_pair[1],
3226 kmsg_socket_pair[1],
3227 rtnl_socket_pair[1],
3228 uid_shift_socket_pair[1],
3229 fds);
/* The child never returns from this branch: it exits with the outcome of outer_child() */
3230 if (r < 0)
3231 _exit(EXIT_FAILURE);
3232
3233 _exit(EXIT_SUCCESS);
3234 }
3235
/* Everything below runs in the parent only */
3236 barrier_set_role(&barrier, BARRIER_PARENT);
3237
3238 fds = fdset_free(fds);
3239
3240 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3241 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3242 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3243 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3244 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3245 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3246
3247 if (arg_userns_mode != USER_NAMESPACE_NO) {
3248 /* The child just let us know the UID shift it might have read from the image. */
3249 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3250 if (l < 0)
3251 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3252 if (l != sizeof arg_uid_shift) {
3253 log_error("Short read while reading UID shift.");
3254 return -EIO;
3255 }
3256
3257 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3258 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3259 * image, but if that's already in use, pick a new one, and report back to the child,
3260 * which one we now picked. */
3261
3262 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3263 if (r < 0)
3264 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3265
3266 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3267 if (l < 0)
3268 return log_error_errno(errno, "Failed to send UID shift: %m");
3269 if (l != sizeof arg_uid_shift) {
3270 log_error("Short write while writing UID shift.");
3271 return -EIO;
3272 }
3273 }
3274 }
3275
3276 /* Wait for the outer child. */
3277 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
/* Any non-zero result is a failure here: negative values are passed on, positive ones become -EIO */
3278 if (r != 0)
3279 return r < 0 ? r : -EIO;
3280
3281 /* And now retrieve the PID of the inner child. */
3282 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3283 if (l < 0)
3284 return log_error_errno(errno, "Failed to read inner child PID: %m");
3285 if (l != sizeof *pid) {
3286 log_error("Short read while reading inner child PID.");
3287 return -EIO;
3288 }
3289
3290 /* We also retrieve container UUID in case it was generated by outer child */
3291 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3292 if (l < 0)
3293 return log_error_errno(errno, "Failed to read container machine ID: %m");
3294 if (l != sizeof(arg_uuid)) {
3295 log_error("Short read while reading container machined ID.");
3296 return -EIO;
3297 }
3298
3299 /* We also retrieve the socket used for notifications generated by outer child */
3300 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3301 if (notify_socket < 0)
3302 return log_error_errno(notify_socket,
3303 "Failed to receive notification socket from the outer child: %m");
3304
3305 log_debug("Init process invoked as PID "PID_FMT, *pid);
3306
/* With user namespacing, the UID/GID maps are written between barrier steps #1 and #2 */
3307 if (arg_userns_mode != USER_NAMESPACE_NO) {
3308 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3309 log_error("Child died too early.");
3310 return -ESRCH;
3311 }
3312
3313 r = setup_uid_map(*pid);
3314 if (r < 0)
3315 return r;
3316
3317 (void) barrier_place(&barrier); /* #2 */
3318 }
3319
3320 if (arg_private_network) {
3321
3322 r = move_network_interfaces(*pid, arg_network_interfaces);
3323 if (r < 0)
3324 return r;
3325
3326 if (arg_network_veth) {
3327 r = setup_veth(arg_machine, *pid, veth_name,
3328 arg_network_bridge || arg_network_zone);
3329 if (r < 0)
3330 return r;
3331 else if (r > 0)
3332 ifi = r;
3333
/* A positive result of setup_bridge() replaces the value remembered from setup_veth(). ifi is later
 * handed to register_machine(); presumably it is a network interface index (not visible here). */
3334 if (arg_network_bridge) {
3335 /* Add the interface to a bridge */
3336 r = setup_bridge(veth_name, arg_network_bridge, false);
3337 if (r < 0)
3338 return r;
3339 if (r > 0)
3340 ifi = r;
3341 } else if (arg_network_zone) {
3342 /* Add the interface to a bridge, possibly creating it */
3343 r = setup_bridge(veth_name, arg_network_zone, true);
3344 if (r < 0)
3345 return r;
3346 if (r > 0)
3347 ifi = r;
3348 }
3349 }
3350
3351 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3352 if (r < 0)
3353 return r;
3354
3355 /* We created the primary and extra veth links now; let's remember this, so that we know to
3356 remove them later on. Note that we don't bother with removing veth links that were created
3357 here when their setup failed half-way, because in that case the kernel should be able to
3358 remove them on its own, since they cannot be referenced by anything yet. */
3359 *veth_created = true;
3360
3361 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3362 if (r < 0)
3363 return r;
3364
3365 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3366 if (r < 0)
3367 return r;
3368 }
3369
3370 if (arg_register) {
3371 r = register_machine(
3372 arg_machine,
3373 *pid,
3374 arg_directory,
3375 arg_uuid,
3376 ifi,
3377 arg_slice,
3378 arg_custom_mounts, arg_n_custom_mounts,
3379 arg_kill_signal,
3380 arg_property,
3381 arg_keep_unit,
3382 arg_container_service_name);
3383 if (r < 0)
3384 return r;
3385 }
3386
f0bef277 3387 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3388 if (r < 0)
3389 return r;
3390
3391 if (arg_keep_unit) {
3392 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3393 if (r < 0)
3394 return r;
3395 }
3396
3397 r = chown_cgroup(*pid, arg_uid_shift);
3398 if (r < 0)
3399 return r;
3400
3401 /* Notify the child that the parent is ready with all
3402 * its setup (including cgroup-ification), and that
3403 * the child can now hand over control to the code to
3404 * run inside the container. */
3405 (void) barrier_place(&barrier); /* #3 */
3406
3407 /* Block SIGCHLD here, before notifying child.
3408 * process_pty() will handle it with the other signals. */
3409 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3410
3411 /* Reset signal to default */
3412 r = default_signals(SIGCHLD, -1);
3413 if (r < 0)
3414 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3415
3416 r = sd_event_new(&event);
3417 if (r < 0)
3418 return log_error_errno(r, "Failed to get default event source: %m");
3419
/* The inner child's PID is passed as an encoded pointer value (PID_TO_PTR), not as a pointer to a
 * pid_t variable, even though the parameter of setup_sd_notify_parent() is declared as pid_t*. */
5773024d 3420 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3421 if (r < 0)
3422 return r;
3423
3424 /* Let the child know that we are ready and wait that the child is completely ready now. */
3425 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3426 log_error("Child died too early.");
3427 return -ESRCH;
3428 }
3429
3430 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3431 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3432 etc_passwd_lock = safe_close(etc_passwd_lock);
3433
3434 sd_notifyf(false,
3435 "STATUS=Container running.\n"
3436 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
/* If arg_notify_ready is set, READY=1 is not sent here; nspawn_dispatch_notify_fd() relays a
 * READY=1 it receives on the notification socket instead. */
3437 if (!arg_notify_ready)
3438 sd_notify(false, "READY=1\n");
3439
3440 if (arg_kill_signal > 0) {
3441 /* Try to kill the init system on SIGINT or SIGTERM */
3442 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3443 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3444 } else {
3445 /* Immediately exit */
3446 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3447 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3448 }
3449
6916b164
AU
3450 /* Exit when the child exits */
3451 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3452
3453 if (arg_expose_ports) {
3454 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3455 if (r < 0)
3456 return r;
3457
3458 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3459 }
3460
3461 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3462
3463 r = pty_forward_new(event, master,
3464 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3465 &forward);
3466 if (r < 0)
3467 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3468
3469 r = sd_event_loop(event);
3470 if (r < 0)
3471 return log_error_errno(r, "Failed to run event loop: %m");
3472
/* Make sure our own output ends with a newline, unless we were asked to be quiet */
3473 pty_forward_get_last_char(forward, &last_char);
3474
3475 forward = pty_forward_free(forward);
3476
3477 if (!arg_quiet && last_char != '\n')
3478 putc('\n', stdout);
3479
3480 /* Kill if it is not dead yet anyway */
3481 if (arg_register && !arg_keep_unit)
3482 terminate_machine(*pid);
3483
3484 /* Normally redundant, but better safe than sorry */
c67b0082 3485 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3486
3487 r = wait_for_container(*pid, &container_status);
/* The PID is cleared regardless of the outcome of wait_for_container() */
3488 *pid = 0;
3489
3490 if (r < 0)
3491 /* We failed to wait for the container, or the container exited abnormally. */
3492 return r;
3493 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3494 /* r > 0 → The container exited with a non-zero status.
3495 * As a special case, we need to replace 133 with a different value,
3496 * because 133 is special-cased in the service file to reboot the container.
3497 * otherwise → The container exited with zero status and a reboot was not requested.
3498 */
2a49b612 3499 if (r == EXIT_FORCE_RESTART)
27e29a1e 3500 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3501 *ret = r;
b0067625
ZJS
3502 return 0; /* finito */
3503 }
3504
3505 /* CONTAINER_REBOOTED, loop again */
3506
3507 if (arg_keep_unit) {
3508 /* Special handling if we are running as a service: instead of simply
3509 * restarting the machine we want to restart the entire service, so let's
3510 * inform systemd about this with the special exit code 133. The service
3511 * file uses RestartForceExitStatus=133 so that this results in a full
3512 * nspawn restart. This is necessary since we might have cgroup parameters
3513 * set we want to have flushed out. */
2a49b612
ZJS
3514 *ret = EXIT_FORCE_RESTART;
3515 return 0; /* finito */
b0067625
ZJS
3516 }
3517
/* Undo the per-run network state before the caller starts the next iteration */
3518 expose_port_flush(arg_expose_ports, exposed);
3519
3520 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3521 *veth_created = false;
3522 return 1; /* loop again */
3523}
3524
03cfe0d5
LP
3525int main(int argc, char *argv[]) {
3526
2d845785
LP
3527 _cleanup_free_ char *console = NULL;
3528 _cleanup_close_ int master = -1;
03cfe0d5 3529 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 3530 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 3531 char veth_name[IFNAMSIZ] = "";
17cbb288 3532 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 3533 pid_t pid = 0;
03cfe0d5
LP
3534 union in_addr_union exposed = {};
3535 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
3536 bool interactive, veth_created = false, remove_tmprootdir = false;
3537 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 3538 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
3539 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3540 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
3541
3542 log_parse_environment();
3543 log_open();
3544
7732f92b
LP
3545 /* Make sure rename_process() in the stub init process can work */
3546 saved_argv = argv;
3547 saved_argc = argc;
3548
03cfe0d5
LP
3549 r = parse_argv(argc, argv);
3550 if (r <= 0)
3551 goto finish;
3552
03cfe0d5
LP
3553 if (geteuid() != 0) {
3554 log_error("Need to be root.");
3555 r = -EPERM;
3556 goto finish;
3557 }
f757855e
LP
3558 r = determine_names();
3559 if (r < 0)
3560 goto finish;
3561
3562 r = load_settings();
3563 if (r < 0)
3564 goto finish;
3565
3566 r = verify_arguments();
3567 if (r < 0)
3568 goto finish;
03cfe0d5
LP
3569
3570 n_fd_passed = sd_listen_fds(false);
3571 if (n_fd_passed > 0) {
3572 r = fdset_new_listen_fds(&fds, false);
3573 if (r < 0) {
3574 log_error_errno(r, "Failed to collect file descriptors: %m");
3575 goto finish;
3576 }
3577 }
3578
3579 if (arg_directory) {
3580 assert(!arg_image);
3581
3582 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3583 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3584 r = -EINVAL;
3585 goto finish;
3586 }
3587
3588 if (arg_ephemeral) {
3589 _cleanup_free_ char *np = NULL;
3590
8d4aa2bb 3591 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
3592 if (r < 0)
3593 goto finish;
3594
03cfe0d5
LP
3595 /* If the specified path is a mount point we
3596 * generate the new snapshot immediately
3597 * inside it under a random name. However if
3598 * the specified is not a mount point we
3599 * create the new snapshot in the parent
3600 * directory, just next to it. */
e1873695 3601 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
3602 if (r < 0) {
3603 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3604 goto finish;
3605 }
3606 if (r > 0)
770b5ce4 3607 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3608 else
770b5ce4 3609 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 3610 if (r < 0) {
0f3be6ca 3611 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
3612 goto finish;
3613 }
3614
3615 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3616 if (r < 0) {
3617 log_error_errno(r, "Failed to lock %s: %m", np);
3618 goto finish;
3619 }
3620
17cbb288
LP
3621 r = btrfs_subvol_snapshot(arg_directory, np,
3622 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3623 BTRFS_SNAPSHOT_FALLBACK_COPY |
3624 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3625 BTRFS_SNAPSHOT_RECURSIVE |
3626 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3627 if (r < 0) {
3628 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3629 goto finish;
ec16945e
LP
3630 }
3631
3632 free(arg_directory);
3633 arg_directory = np;
8a16a7b4 3634 np = NULL;
ec16945e 3635
17cbb288 3636 remove_directory = true;
30535c16
LP
3637
3638 } else {
cb638b5e 3639 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
3640 if (r < 0)
3641 goto finish;
3642
30535c16
LP
3643 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3644 if (r == -EBUSY) {
3645 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3646 goto finish;
3647 }
3648 if (r < 0) {
3649 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3650 goto finish;
30535c16
LP
3651 }
3652
3653 if (arg_template) {
8d4aa2bb 3654 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
3655 if (r < 0)
3656 goto finish;
3657
17cbb288
LP
3658 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3659 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3660 BTRFS_SNAPSHOT_FALLBACK_COPY |
3661 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3662 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3663 BTRFS_SNAPSHOT_RECURSIVE |
3664 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3665 if (r == -EEXIST) {
3666 if (!arg_quiet)
3667 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3668 } else if (r < 0) {
83521414 3669 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3670 goto finish;
3671 } else {
3672 if (!arg_quiet)
3673 log_info("Populated %s from template %s.", arg_directory, arg_template);
3674 }
3675 }
ec16945e
LP
3676 }
3677
7732f92b 3678 if (arg_start_mode == START_BOOT) {
1b9e5b12 3679 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3680 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3681 r = -EINVAL;
1b9e5b12
LP
3682 goto finish;
3683 }
3684 } else {
3685 const char *p;
3686
16fb773e
LP
3687 p = strjoina(arg_directory, "/usr/");
3688 if (laccess(p, F_OK) < 0) {
3689 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3690 r = -EINVAL;
1b9e5b12 3691 goto finish;
1b9e5b12
LP
3692 }
3693 }
ec16945e 3694
6b9132a9 3695 } else {
ec16945e
LP
3696 assert(arg_image);
3697 assert(!arg_template);
3698
8d4aa2bb 3699 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
3700 if (r < 0)
3701 goto finish;
3702
0f3be6ca
LP
3703 if (arg_ephemeral) {
3704 _cleanup_free_ char *np = NULL;
3705
3706 r = tempfn_random(arg_image, "machine.", &np);
3707 if (r < 0) {
3708 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3709 goto finish;
3710 }
3711
3712 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3713 if (r < 0) {
3714 r = log_error_errno(r, "Failed to create image lock: %m");
3715 goto finish;
3716 }
3717
1c876927 3718 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
3719 if (r < 0) {
3720 r = log_error_errno(r, "Failed to copy image file: %m");
3721 goto finish;
3722 }
3723
3724 free(arg_image);
3725 arg_image = np;
3726 np = NULL;
3727
3728 remove_image = true;
3729 } else {
3730 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3731 if (r == -EBUSY) {
3732 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3733 goto finish;
3734 }
3735 if (r < 0) {
3736 r = log_error_errno(r, "Failed to create image lock: %m");
3737 goto finish;
3738 }
4623e8e6 3739
78ebe980
LP
3740 if (!arg_root_hash) {
3741 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3742 if (r < 0) {
3743 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3744 goto finish;
3745 }
3746 }
30535c16
LP
3747 }
3748
c67b0082 3749 if (!mkdtemp(tmprootdir)) {
0f3be6ca 3750 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 3751 goto finish;
1b9e5b12 3752 }
6b9132a9 3753
c67b0082
LP
3754 remove_tmprootdir = true;
3755
3756 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
3757 if (!arg_directory) {
3758 r = log_oom();
3759 goto finish;
6b9132a9 3760 }
88213476 3761
2d845785
LP
3762 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3763 if (r < 0) {
3764 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
3765 goto finish;
3766 }
1b9e5b12 3767
e0f9e7bd
LP
3768 r = dissect_image(
3769 loop->fd,
3770 arg_root_hash, arg_root_hash_size,
3771 DISSECT_IMAGE_REQUIRE_ROOT,
3772 &dissected_image);
2d845785
LP
3773 if (r == -ENOPKG) {
3774 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3775
3776 log_notice("Note that the disk image needs to\n"
3777 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3778 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3779 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3780 " d) or contain a file system without a partition table\n"
3781 "in order to be bootable with systemd-nspawn.");
1b9e5b12 3782 goto finish;
2d845785 3783 }
4623e8e6
LP
3784 if (r == -EADDRNOTAVAIL) {
3785 log_error_errno(r, "No root partition for specified root hash found.");
3786 goto finish;
3787 }
2d845785
LP
3788 if (r == -EOPNOTSUPP) {
3789 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3790 goto finish;
3791 }
3792 if (r < 0) {
3793 log_error_errno(r, "Failed to dissect image: %m");
842f3b0f
LP
3794 goto finish;
3795 }
1b9e5b12 3796
4623e8e6
LP
3797 if (!arg_root_hash && dissected_image->can_verity)
3798 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3799
3800 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
3801 if (r < 0)
3802 goto finish;
0f3be6ca
LP
3803
3804 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3805 if (remove_image && unlink(arg_image) >= 0)
3806 remove_image = false;
842f3b0f 3807 }
842f3b0f 3808
86c0dd4a 3809 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
3810 if (r < 0)
3811 goto finish;
3812
03cfe0d5
LP
3813 interactive =
3814 isatty(STDIN_FILENO) > 0 &&
3815 isatty(STDOUT_FILENO) > 0;
9c857b9d 3816
db7feb7e
LP
3817 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3818 if (master < 0) {
ec16945e 3819 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3820 goto finish;
3821 }
3822
611b312b
LP
3823 r = ptsname_malloc(master, &console);
3824 if (r < 0) {
3825 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3826 goto finish;
68b02049
DW
3827 }
3828
3829 if (arg_selinux_apifs_context) {
3830 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3831 if (r < 0)
3832 goto finish;
a258bf26
LP
3833 }
3834
a258bf26 3835 if (unlockpt(master) < 0) {
ec16945e 3836 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3837 goto finish;
3838 }
3839
9c857b9d
LP
3840 if (!arg_quiet)
3841 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3842 arg_machine, arg_image ?: arg_directory);
3843
72c0a2c2 3844 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3845
03cfe0d5
LP
3846 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3847 r = log_error_errno(errno, "Failed to become subreaper: %m");
3848 goto finish;
3849 }
3850
d87be9b0 3851 for (;;) {
b0067625
ZJS
3852 r = run(master,
3853 console,
2d845785 3854 dissected_image,
b0067625
ZJS
3855 interactive, secondary,
3856 fds,
3857 veth_name, &veth_created,
3858 &exposed,
3859 &pid, &ret);
3860 if (r <= 0)
d87be9b0 3861 break;
d87be9b0 3862 }
88213476
LP
3863
3864finish:
af4ec430 3865 sd_notify(false,
2a49b612
ZJS
3866 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3867 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 3868
9444b1f2 3869 if (pid > 0)
c67b0082 3870 (void) kill(pid, SIGKILL);
88213476 3871
503546da 3872 /* Try to flush whatever is still queued in the pty */
6a0f896b 3873 if (master >= 0) {
1c876927 3874 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
3875 master = safe_close(master);
3876 }
3877
3878 if (pid > 0)
3879 (void) wait_for_terminate(pid, NULL);
503546da 3880
17cbb288 3881 if (remove_directory && arg_directory) {
ec16945e
LP
3882 int k;
3883
17cbb288 3884 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 3885 if (k < 0)
17cbb288 3886 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
3887 }
3888
0f3be6ca
LP
3889 if (remove_image && arg_image) {
3890 if (unlink(arg_image) < 0)
3891 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3892 }
3893
c67b0082
LP
3894 if (remove_tmprootdir) {
3895 if (rmdir(tmprootdir) < 0)
3896 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3897 }
3898
785890ac
LP
3899 if (arg_machine) {
3900 const char *p;
3901
63c372cb 3902 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3903 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3904 }
3905
7a8f6325 3906 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
3907
3908 if (veth_created)
3909 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 3910 (void) remove_bridge(arg_network_zone);
f757855e 3911
04d391da 3912 free(arg_directory);
ec16945e
LP
3913 free(arg_template);
3914 free(arg_image);
7027ff61 3915 free(arg_machine);
c74e630d 3916 free(arg_user);
b53ede69
PW
3917 free(arg_pivot_root_new);
3918 free(arg_pivot_root_old);
5f932eb9 3919 free(arg_chdir);
c74e630d 3920 strv_free(arg_setenv);
f757855e 3921 free(arg_network_bridge);
c74e630d
LP
3922 strv_free(arg_network_interfaces);
3923 strv_free(arg_network_macvlan);
4bbfe7ad 3924 strv_free(arg_network_ipvlan);
f6d6bad1 3925 strv_free(arg_network_veth_extra);
f757855e
LP
3926 strv_free(arg_parameters);
3927 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3928 expose_port_free_all(arg_expose_ports);
4623e8e6 3929 free(arg_root_hash);
6d0b55c2 3930
ec16945e 3931 return r < 0 ? EXIT_FAILURE : ret;
88213476 3932}