]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: fix clobbering of selinux context arg
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
6916b164 41#include <sys/wait.h>
8fe0087e 42#include <unistd.h>
1b9e5b12 43
1f0cd86b 44#include "sd-daemon.h"
1f0cd86b 45#include "sd-id128.h"
8fe0087e 46
b5efdb8a 47#include "alloc-util.h"
8fe0087e
LP
48#include "barrier.h"
49#include "base-filesystem.h"
50#include "blkid-util.h"
51#include "btrfs-util.h"
8fe0087e 52#include "cap-list.h"
430f0182 53#include "capability-util.h"
04d391da 54#include "cgroup-util.h"
8fe0087e 55#include "copy.h"
4fc9982c 56#include "dev-setup.h"
2d845785 57#include "dissect-image.h"
8fe0087e 58#include "env-util.h"
3ffd4af2 59#include "fd-util.h"
842f3b0f 60#include "fdset.h"
a5c32cff 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
f4f15635 63#include "fs-util.h"
1b9e5b12 64#include "gpt.h"
4623e8e6 65#include "hexdecoct.h"
8fe0087e 66#include "hostname-util.h"
910fd145 67#include "id128-util.h"
8fe0087e 68#include "log.h"
2d845785 69#include "loop-util.h"
8fe0087e 70#include "loopback-setup.h"
1b9cebf6 71#include "machine-image.h"
8fe0087e
LP
72#include "macro.h"
73#include "missing.h"
74#include "mkdir.h"
4349cd7c 75#include "mount-util.h"
8fe0087e 76#include "netlink-util.h"
07630cea
LP
77#include "nspawn-cgroup.h"
78#include "nspawn-expose-ports.h"
79#include "nspawn-mount.h"
80#include "nspawn-network.h"
7336138e 81#include "nspawn-patch-uid.h"
07630cea 82#include "nspawn-register.h"
910fd145 83#include "nspawn-seccomp.h"
07630cea
LP
84#include "nspawn-settings.h"
85#include "nspawn-setuid.h"
7732f92b 86#include "nspawn-stub-pid1.h"
6bedfcbb 87#include "parse-util.h"
8fe0087e 88#include "path-util.h"
0b452006 89#include "process-util.h"
8fe0087e
LP
90#include "ptyfwd.h"
91#include "random-util.h"
8869a0b4 92#include "raw-clone.h"
8fe0087e 93#include "rm-rf.h"
68b02049 94#include "selinux-util.h"
8fe0087e 95#include "signal-util.h"
2583fbea 96#include "socket-util.h"
8fcde012 97#include "stat-util.h"
15a5e950 98#include "stdio-util.h"
07630cea 99#include "string-util.h"
8fe0087e
LP
100#include "strv.h"
101#include "terminal-util.h"
102#include "udev-util.h"
affb60b1 103#include "umask-util.h"
b1d4f8e1 104#include "user-util.h"
8fe0087e 105#include "util.h"
e9642be2 106
/* devpts parses its gid= parameter as a signed value, so the range from which UID shifts are picked
 * stays out of the upper half of the 32bit UID space. A little room is left at the lower end and a lot
 * of room at the upper end, so that other subsystems may have allocation ranges of their own. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))

/* nspawn listens on the socket at NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container;
 * the container's init process can send messages to nspawn over it, following the sd_notify(3)
 * protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"

#define EXIT_FORCE_RESTART 133
119
113cea80
DH
/* Status values for a container. The numbering matches the implicit one: 0, 1. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
124
57fb9fb5
LP
/* Journal link modes, as selected with --link-journal= (the try-* spellings map onto LINK_GUEST and
 * LINK_HOST plus a separate "try" flag). The numbering matches the implicit one: 0..3. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476
LP
131
132static char *arg_directory = NULL;
ec16945e 133static char *arg_template = NULL;
5f932eb9 134static char *arg_chdir = NULL;
687d0825 135static char *arg_user = NULL;
9444b1f2 136static sd_id128_t arg_uuid = {};
7027ff61 137static char *arg_machine = NULL;
c74e630d
LP
138static const char *arg_selinux_context = NULL;
139static const char *arg_selinux_apifs_context = NULL;
9444b1f2 140static const char *arg_slice = NULL;
ff01d048 141static bool arg_private_network = false;
bc2f673e 142static bool arg_read_only = false;
7732f92b 143static StartMode arg_start_mode = START_PID1;
ec16945e 144static bool arg_ephemeral = false;
57fb9fb5 145static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 146static bool arg_link_journal_try = false;
520e0d54 147static uint64_t arg_caps_retain =
50b52222
LP
148 (1ULL << CAP_AUDIT_CONTROL) |
149 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
150 (1ULL << CAP_CHOWN) |
151 (1ULL << CAP_DAC_OVERRIDE) |
152 (1ULL << CAP_DAC_READ_SEARCH) |
153 (1ULL << CAP_FOWNER) |
154 (1ULL << CAP_FSETID) |
155 (1ULL << CAP_IPC_OWNER) |
156 (1ULL << CAP_KILL) |
157 (1ULL << CAP_LEASE) |
158 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 159 (1ULL << CAP_MKNOD) |
5076f0cc
LP
160 (1ULL << CAP_NET_BIND_SERVICE) |
161 (1ULL << CAP_NET_BROADCAST) |
162 (1ULL << CAP_NET_RAW) |
5076f0cc 163 (1ULL << CAP_SETFCAP) |
50b52222 164 (1ULL << CAP_SETGID) |
5076f0cc
LP
165 (1ULL << CAP_SETPCAP) |
166 (1ULL << CAP_SETUID) |
167 (1ULL << CAP_SYS_ADMIN) |
50b52222 168 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
169 (1ULL << CAP_SYS_CHROOT) |
170 (1ULL << CAP_SYS_NICE) |
171 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 172 (1ULL << CAP_SYS_RESOURCE) |
50b52222 173 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
174static CustomMount *arg_custom_mounts = NULL;
175static unsigned arg_n_custom_mounts = 0;
f4889f65 176static char **arg_setenv = NULL;
284c0b91 177static bool arg_quiet = false;
eb91eb18 178static bool arg_register = true;
89f7c846 179static bool arg_keep_unit = false;
aa28aefe 180static char **arg_network_interfaces = NULL;
c74e630d 181static char **arg_network_macvlan = NULL;
4bbfe7ad 182static char **arg_network_ipvlan = NULL;
69c79d3c 183static bool arg_network_veth = false;
f6d6bad1 184static char **arg_network_veth_extra = NULL;
f757855e 185static char *arg_network_bridge = NULL;
22b28dfd 186static char *arg_network_zone = NULL;
050f7277 187static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 188static char *arg_image = NULL;
f757855e 189static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 190static ExposePort *arg_expose_ports = NULL;
f36933fe 191static char **arg_property = NULL;
0de7acce 192static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 193static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 194static bool arg_userns_chown = false;
c6c8f6e2 195static int arg_kill_signal = 0;
5da38d07 196static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
197static SettingsMask arg_settings_mask = 0;
198static int arg_settings_trusted = -1;
199static char **arg_parameters = NULL;
6aadfa4c 200static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 201static bool arg_notify_ready = false;
5a8ff0e6 202static bool arg_use_cgns = true;
0c582db0 203static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 204static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
205static void *arg_root_hash = NULL;
206static size_t arg_root_hash_size = 0;
88213476 207
601185b4 208static void help(void) {
88213476
LP
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
211 " -h --help Show this help\n"
212 " --version Print version string\n"
69c79d3c 213 " -q --quiet Do not show status information\n"
1b9e5b12 214 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 220 " --root-hash=HASH Specify verity root hash\n"
7732f92b 221 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 222 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 223 " --chdir=PATH Set working directory in the container\n"
a8828ed9 224 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 225 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 226 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 227 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 228 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 229 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 230 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 231 " Similar, but with user configured UID/GID range\n"
24597ee0 232 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
233 " --private-network Disable network in container\n"
234 " --network-interface=INTERFACE\n"
235 " Assign an existing network interface to the\n"
236 " container\n"
c74e630d
LP
237 " --network-macvlan=INTERFACE\n"
238 " Create a macvlan network interface based on an\n"
239 " existing network interface to the container\n"
4bbfe7ad
TG
240 " --network-ipvlan=INTERFACE\n"
241 " Create a ipvlan network interface based on an\n"
242 " existing network interface to the container\n"
a8eaaee7 243 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 244 " and container\n"
f6d6bad1
LP
245 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
246 " Add an additional virtual Ethernet link between\n"
247 " host and container\n"
ab046dde 248 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
249 " Add a virtual Ethernet connection to the container\n"
250 " and attach it to an existing bridge on the host\n"
251 " --network-zone=NAME Similar, but attach the new interface to an\n"
252 " an automatically managed bridge interface\n"
6d0b55c2 253 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 254 " Expose a container IP port on the host\n"
82adf6af
LP
255 " -Z --selinux-context=SECLABEL\n"
256 " Set the SELinux security context to be used by\n"
257 " processes in the container\n"
258 " -L --selinux-apifs-context=SECLABEL\n"
259 " Set the SELinux security context to be used by\n"
260 " API/tmpfs file systems in the container\n"
a8828ed9
DW
261 " --capability=CAP In addition to the default, retain specified\n"
262 " capability\n"
263 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 264 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
265 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
266 " host, try-guest, try-host\n"
574edc90 267 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 268 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
269 " --bind=PATH[:PATH[:OPTIONS]]\n"
270 " Bind mount a file or directory from the host into\n"
a8828ed9 271 " the container\n"
5e5bfa6e
EY
272 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
273 " Similar, but creates a read-only bind mount\n"
06c17c39 274 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
275 " --overlay=PATH[:PATH...]:PATH\n"
276 " Create an overlay mount from the host to \n"
277 " the container\n"
278 " --overlay-ro=PATH[:PATH...]:PATH\n"
279 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 280 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 281 " --register=BOOLEAN Register container as machine\n"
89f7c846 282 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 283 " the service unit nspawn is running in\n"
6d0b55c2 284 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 285 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 286 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 287 , program_invocation_short_name);
88213476
LP
288}
289
86c0dd4a 290static int custom_mount_check_all(void) {
5a8af538 291 unsigned i;
5a8af538 292
5a8af538
LP
293 for (i = 0; i < arg_n_custom_mounts; i++) {
294 CustomMount *m = &arg_custom_mounts[i];
295
0de7acce 296 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
297
298 if (arg_userns_chown) {
299 log_error("--private-users-chown may not be combined with custom root mounts.");
300 return -EINVAL;
301 } else if (arg_uid_shift == UID_INVALID) {
302 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
303 return -EINVAL;
304 }
825d5287 305 }
5a8af538
LP
306 }
307
308 return 0;
309}
310
0fd9563f 311static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 312 const char *e;
5da38d07
TH
313 int r, all_unified, systemd_unified;
314
efdb0237
LP
315 /* Allow the user to control whether the unified hierarchy is used */
316 e = getenv("UNIFIED_CGROUP_HIERARCHY");
317 if (e) {
318 r = parse_boolean(e);
319 if (r < 0)
320 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
321 if (r > 0)
322 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
323 else
324 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 325
efdb0237
LP
326 return 0;
327 }
328
98afd6af
ZJS
329 all_unified = cg_all_unified();
330 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
331
332 if (all_unified < 0 || systemd_unified < 0)
333 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
334 "Failed to determine whether the unified cgroups hierarchy is used: %m");
335
efdb0237 336 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
337 if (all_unified > 0) {
338 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
339 * routine only detects 231, so we'll have a false negative here for 230. */
340 r = systemd_installation_has_version(directory, 230);
341 if (r < 0)
342 return log_error_errno(r, "Failed to determine systemd version in container: %m");
343 if (r > 0)
344 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
345 else
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
347 } else if (systemd_unified > 0) {
348 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
349 r = systemd_installation_has_version(directory, 232);
350 if (r < 0)
351 return log_error_errno(r, "Failed to determine systemd version in container: %m");
352 if (r > 0)
353 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
354 else
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
356 } else
5da38d07 357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 358
efdb0237
LP
359 return 0;
360}
361
0c582db0
LB
362static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
363 int r;
364
365 r = getenv_bool(name);
366 if (r == -ENXIO)
367 return;
368 if (r < 0)
369 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
370 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
371}
372
4f086aab
SU
373static void parse_mount_settings_env(void) {
374 int r;
375 const char *e;
376
377 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
378 if (!e)
379 return;
380
381 if (streq(e, "network")) {
382 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
383 return;
384 }
385
386 r = parse_boolean(e);
387 if (r < 0) {
388 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
389 return;
390 } else if (r > 0)
391 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
392 else
393 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
394
395 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
396}
397
88213476
LP
398static int parse_argv(int argc, char *argv[]) {
399
a41fe3a2 400 enum {
acbeb427
ZJS
401 ARG_VERSION = 0x100,
402 ARG_PRIVATE_NETWORK,
bc2f673e 403 ARG_UUID,
5076f0cc 404 ARG_READ_ONLY,
57fb9fb5 405 ARG_CAPABILITY,
420c7379 406 ARG_DROP_CAPABILITY,
17fe0523
LP
407 ARG_LINK_JOURNAL,
408 ARG_BIND,
f4889f65 409 ARG_BIND_RO,
06c17c39 410 ARG_TMPFS,
5a8af538
LP
411 ARG_OVERLAY,
412 ARG_OVERLAY_RO,
eb91eb18 413 ARG_SHARE_SYSTEM,
89f7c846 414 ARG_REGISTER,
aa28aefe 415 ARG_KEEP_UNIT,
69c79d3c 416 ARG_NETWORK_INTERFACE,
c74e630d 417 ARG_NETWORK_MACVLAN,
4bbfe7ad 418 ARG_NETWORK_IPVLAN,
ab046dde 419 ARG_NETWORK_BRIDGE,
22b28dfd 420 ARG_NETWORK_ZONE,
f6d6bad1 421 ARG_NETWORK_VETH_EXTRA,
6afc95b7 422 ARG_PERSONALITY,
4d9f07b4 423 ARG_VOLATILE,
ec16945e 424 ARG_TEMPLATE,
f36933fe 425 ARG_PROPERTY,
6dac160c 426 ARG_PRIVATE_USERS,
c6c8f6e2 427 ARG_KILL_SIGNAL,
f757855e 428 ARG_SETTINGS,
5f932eb9 429 ARG_CHDIR,
7336138e 430 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 431 ARG_NOTIFY_READY,
4623e8e6 432 ARG_ROOT_HASH,
a41fe3a2
LP
433 };
434
88213476 435 static const struct option options[] = {
27eb8e90
ZJS
436 { "help", no_argument, NULL, 'h' },
437 { "version", no_argument, NULL, ARG_VERSION },
438 { "directory", required_argument, NULL, 'D' },
439 { "template", required_argument, NULL, ARG_TEMPLATE },
440 { "ephemeral", no_argument, NULL, 'x' },
441 { "user", required_argument, NULL, 'u' },
442 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
443 { "as-pid2", no_argument, NULL, 'a' },
444 { "boot", no_argument, NULL, 'b' },
445 { "uuid", required_argument, NULL, ARG_UUID },
446 { "read-only", no_argument, NULL, ARG_READ_ONLY },
447 { "capability", required_argument, NULL, ARG_CAPABILITY },
448 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
449 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
450 { "bind", required_argument, NULL, ARG_BIND },
451 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
452 { "tmpfs", required_argument, NULL, ARG_TMPFS },
453 { "overlay", required_argument, NULL, ARG_OVERLAY },
454 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
455 { "machine", required_argument, NULL, 'M' },
456 { "slice", required_argument, NULL, 'S' },
457 { "setenv", required_argument, NULL, 'E' },
458 { "selinux-context", required_argument, NULL, 'Z' },
459 { "selinux-apifs-context", required_argument, NULL, 'L' },
460 { "quiet", no_argument, NULL, 'q' },
461 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
462 { "register", required_argument, NULL, ARG_REGISTER },
463 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
464 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
465 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
466 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
467 { "network-veth", no_argument, NULL, 'n' },
468 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
469 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
470 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
471 { "personality", required_argument, NULL, ARG_PERSONALITY },
472 { "image", required_argument, NULL, 'i' },
473 { "volatile", optional_argument, NULL, ARG_VOLATILE },
474 { "port", required_argument, NULL, 'p' },
475 { "property", required_argument, NULL, ARG_PROPERTY },
476 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
477 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
478 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
479 { "settings", required_argument, NULL, ARG_SETTINGS },
480 { "chdir", required_argument, NULL, ARG_CHDIR },
481 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
4623e8e6 482 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
eb9da376 483 {}
88213476
LP
484 };
485
9444b1f2 486 int c, r;
6aadfa4c 487 const char *p, *e;
a42c8b54 488 uint64_t plus = 0, minus = 0;
f757855e 489 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
490
491 assert(argc >= 0);
492 assert(argv);
493
2e1f244e 494 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
495
496 switch (c) {
497
498 case 'h':
601185b4
ZJS
499 help();
500 return 0;
88213476 501
acbeb427 502 case ARG_VERSION:
3f6fd1ba 503 return version();
acbeb427 504
88213476 505 case 'D':
0f03c2a4 506 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 507 if (r < 0)
0f03c2a4 508 return r;
ec16945e
LP
509 break;
510
511 case ARG_TEMPLATE:
0f03c2a4 512 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 513 if (r < 0)
0f03c2a4 514 return r;
88213476
LP
515 break;
516
1b9e5b12 517 case 'i':
0f03c2a4 518 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 519 if (r < 0)
0f03c2a4 520 return r;
ec16945e
LP
521 break;
522
523 case 'x':
524 arg_ephemeral = true;
1b9e5b12
LP
525 break;
526
687d0825 527 case 'u':
2fc09a9c
DM
528 r = free_and_strdup(&arg_user, optarg);
529 if (r < 0)
7027ff61 530 return log_oom();
687d0825 531
f757855e 532 arg_settings_mask |= SETTING_USER;
687d0825
MV
533 break;
534
22b28dfd
LP
535 case ARG_NETWORK_ZONE: {
536 char *j;
537
538 j = strappend("vz-", optarg);
539 if (!j)
540 return log_oom();
541
542 if (!ifname_valid(j)) {
543 log_error("Network zone name not valid: %s", j);
544 free(j);
545 return -EINVAL;
546 }
547
548 free(arg_network_zone);
549 arg_network_zone = j;
550
551 arg_network_veth = true;
552 arg_private_network = true;
553 arg_settings_mask |= SETTING_NETWORK;
554 break;
555 }
556
ab046dde 557 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
558
559 if (!ifname_valid(optarg)) {
560 log_error("Bridge interface name not valid: %s", optarg);
561 return -EINVAL;
562 }
563
f757855e
LP
564 r = free_and_strdup(&arg_network_bridge, optarg);
565 if (r < 0)
566 return log_oom();
ab046dde
TG
567
568 /* fall through */
569
0dfaa006 570 case 'n':
69c79d3c
LP
571 arg_network_veth = true;
572 arg_private_network = true;
f757855e 573 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
574 break;
575
f6d6bad1
LP
576 case ARG_NETWORK_VETH_EXTRA:
577 r = veth_extra_parse(&arg_network_veth_extra, optarg);
578 if (r < 0)
579 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
580
581 arg_private_network = true;
582 arg_settings_mask |= SETTING_NETWORK;
583 break;
584
aa28aefe 585 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
586
587 if (!ifname_valid(optarg)) {
588 log_error("Network interface name not valid: %s", optarg);
589 return -EINVAL;
590 }
591
c74e630d
LP
592 if (strv_extend(&arg_network_interfaces, optarg) < 0)
593 return log_oom();
594
595 arg_private_network = true;
f757855e 596 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
597 break;
598
599 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
600
601 if (!ifname_valid(optarg)) {
602 log_error("MACVLAN network interface name not valid: %s", optarg);
603 return -EINVAL;
604 }
605
c74e630d 606 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
607 return log_oom();
608
4bbfe7ad 609 arg_private_network = true;
f757855e 610 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
611 break;
612
613 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
614
615 if (!ifname_valid(optarg)) {
616 log_error("IPVLAN network interface name not valid: %s", optarg);
617 return -EINVAL;
618 }
619
4bbfe7ad
TG
620 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
621 return log_oom();
622
aa28aefe
LP
623 /* fall through */
624
ff01d048
LP
625 case ARG_PRIVATE_NETWORK:
626 arg_private_network = true;
f757855e 627 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
628 break;
629
0f0dbc46 630 case 'b':
7732f92b
LP
631 if (arg_start_mode == START_PID2) {
632 log_error("--boot and --as-pid2 may not be combined.");
633 return -EINVAL;
634 }
635
636 arg_start_mode = START_BOOT;
637 arg_settings_mask |= SETTING_START_MODE;
638 break;
639
640 case 'a':
641 if (arg_start_mode == START_BOOT) {
642 log_error("--boot and --as-pid2 may not be combined.");
643 return -EINVAL;
644 }
645
646 arg_start_mode = START_PID2;
647 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
648 break;
649
144f0fc0 650 case ARG_UUID:
9444b1f2 651 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
652 if (r < 0)
653 return log_error_errno(r, "Invalid UUID: %s", optarg);
654
655 if (sd_id128_is_null(arg_uuid)) {
656 log_error("Machine UUID may not be all zeroes.");
657 return -EINVAL;
aa96c6cb 658 }
f757855e
LP
659
660 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 661 break;
aa96c6cb 662
9444b1f2 663 case 'S':
c74e630d 664 arg_slice = optarg;
144f0fc0
LP
665 break;
666
7027ff61 667 case 'M':
c1521918 668 if (isempty(optarg))
97b11eed 669 arg_machine = mfree(arg_machine);
c1521918 670 else {
0c3c4284 671 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
672 log_error("Invalid machine name: %s", optarg);
673 return -EINVAL;
674 }
7027ff61 675
0c3c4284
LP
676 r = free_and_strdup(&arg_machine, optarg);
677 if (r < 0)
eb91eb18 678 return log_oom();
eb91eb18 679 }
9ce6d1b3 680 break;
7027ff61 681
82adf6af
LP
682 case 'Z':
683 arg_selinux_context = optarg;
a8828ed9
DW
684 break;
685
82adf6af
LP
686 case 'L':
687 arg_selinux_apifs_context = optarg;
a8828ed9
DW
688 break;
689
bc2f673e
LP
690 case ARG_READ_ONLY:
691 arg_read_only = true;
f757855e 692 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
693 break;
694
420c7379
LP
695 case ARG_CAPABILITY:
696 case ARG_DROP_CAPABILITY: {
6cbe4ed1 697 p = optarg;
9ed794a3 698 for (;;) {
6cbe4ed1 699 _cleanup_free_ char *t = NULL;
5076f0cc 700
6cbe4ed1
SS
701 r = extract_first_word(&p, &t, ",", 0);
702 if (r < 0)
703 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 704
6cbe4ed1
SS
705 if (r == 0)
706 break;
5076f0cc 707
39ed67d1
LP
708 if (streq(t, "all")) {
709 if (c == ARG_CAPABILITY)
a42c8b54 710 plus = (uint64_t) -1;
39ed67d1 711 else
a42c8b54 712 minus = (uint64_t) -1;
39ed67d1 713 } else {
2822da4f
LP
714 int cap;
715
716 cap = capability_from_name(t);
717 if (cap < 0) {
39ed67d1
LP
718 log_error("Failed to parse capability %s.", t);
719 return -EINVAL;
720 }
721
722 if (c == ARG_CAPABILITY)
a42c8b54 723 plus |= 1ULL << (uint64_t) cap;
39ed67d1 724 else
a42c8b54 725 minus |= 1ULL << (uint64_t) cap;
5076f0cc 726 }
5076f0cc
LP
727 }
728
f757855e 729 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
730 break;
731 }
732
57fb9fb5
LP
733 case 'j':
734 arg_link_journal = LINK_GUEST;
574edc90 735 arg_link_journal_try = true;
57fb9fb5
LP
736 break;
737
738 case ARG_LINK_JOURNAL:
53e438e3 739 if (streq(optarg, "auto")) {
57fb9fb5 740 arg_link_journal = LINK_AUTO;
53e438e3
LP
741 arg_link_journal_try = false;
742 } else if (streq(optarg, "no")) {
57fb9fb5 743 arg_link_journal = LINK_NO;
53e438e3
LP
744 arg_link_journal_try = false;
745 } else if (streq(optarg, "guest")) {
57fb9fb5 746 arg_link_journal = LINK_GUEST;
53e438e3
LP
747 arg_link_journal_try = false;
748 } else if (streq(optarg, "host")) {
57fb9fb5 749 arg_link_journal = LINK_HOST;
53e438e3
LP
750 arg_link_journal_try = false;
751 } else if (streq(optarg, "try-guest")) {
574edc90
MP
752 arg_link_journal = LINK_GUEST;
753 arg_link_journal_try = true;
754 } else if (streq(optarg, "try-host")) {
755 arg_link_journal = LINK_HOST;
756 arg_link_journal_try = true;
757 } else {
57fb9fb5
LP
758 log_error("Failed to parse link journal mode %s", optarg);
759 return -EINVAL;
760 }
761
762 break;
763
17fe0523 764 case ARG_BIND:
f757855e
LP
765 case ARG_BIND_RO:
766 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
767 if (r < 0)
768 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 769
f757855e 770 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 771 break;
06c17c39 772
f757855e
LP
773 case ARG_TMPFS:
774 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
775 if (r < 0)
776 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 777
f757855e 778 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 779 break;
5a8af538
LP
780
781 case ARG_OVERLAY:
ad85779a
LP
782 case ARG_OVERLAY_RO:
783 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
784 if (r == -EADDRNOTAVAIL)
785 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
786 if (r < 0)
787 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 788
f757855e 789 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 790 break;
06c17c39 791
a5f1cb3b 792 case 'E': {
f4889f65
LP
793 char **n;
794
795 if (!env_assignment_is_valid(optarg)) {
796 log_error("Environment variable assignment '%s' is not valid.", optarg);
797 return -EINVAL;
798 }
799
800 n = strv_env_set(arg_setenv, optarg);
801 if (!n)
802 return log_oom();
803
804 strv_free(arg_setenv);
805 arg_setenv = n;
f757855e
LP
806
807 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
808 break;
809 }
810
284c0b91
LP
811 case 'q':
812 arg_quiet = true;
813 break;
814
8a96d94e 815 case ARG_SHARE_SYSTEM:
a6b5216c 816 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
817 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
818 arg_clone_ns_flags = 0;
8a96d94e
LP
819 break;
820
eb91eb18
LP
821 case ARG_REGISTER:
822 r = parse_boolean(optarg);
823 if (r < 0) {
824 log_error("Failed to parse --register= argument: %s", optarg);
825 return r;
826 }
827
828 arg_register = r;
829 break;
830
89f7c846
LP
831 case ARG_KEEP_UNIT:
832 arg_keep_unit = true;
833 break;
834
6afc95b7
LP
835 case ARG_PERSONALITY:
836
ac45f971 837 arg_personality = personality_from_string(optarg);
050f7277 838 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
839 log_error("Unknown or unsupported personality '%s'.", optarg);
840 return -EINVAL;
841 }
842
f757855e 843 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
844 break;
845
4d9f07b4
LP
846 case ARG_VOLATILE:
847
848 if (!optarg)
f757855e 849 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 850 else {
f757855e 851 VolatileMode m;
4d9f07b4 852
f757855e
LP
853 m = volatile_mode_from_string(optarg);
854 if (m < 0) {
855 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 856 return -EINVAL;
f757855e
LP
857 } else
858 arg_volatile_mode = m;
6d0b55c2
LP
859 }
860
f757855e
LP
861 arg_settings_mask |= SETTING_VOLATILE_MODE;
862 break;
6d0b55c2 863
f757855e
LP
864 case 'p':
865 r = expose_port_parse(&arg_expose_ports, optarg);
866 if (r == -EEXIST)
867 return log_error_errno(r, "Duplicate port specification: %s", optarg);
868 if (r < 0)
869 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 870
f757855e 871 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 872 break;
6d0b55c2 873
f36933fe
LP
874 case ARG_PROPERTY:
875 if (strv_extend(&arg_property, optarg) < 0)
876 return log_oom();
877
878 break;
879
ae209204
ZJS
880 case ARG_PRIVATE_USERS: {
881 int boolean = -1;
0de7acce 882
ae209204
ZJS
883 if (!optarg)
884 boolean = true;
885 else if (!in_charset(optarg, DIGITS))
886 /* do *not* parse numbers as booleans */
887 boolean = parse_boolean(optarg);
888
889 if (boolean == false) {
0de7acce
LP
890 /* no: User namespacing off */
891 arg_userns_mode = USER_NAMESPACE_NO;
892 arg_uid_shift = UID_INVALID;
893 arg_uid_range = UINT32_C(0x10000);
ae209204 894 } else if (boolean == true) {
0de7acce
LP
895 /* yes: User namespacing on, UID range is read from root dir */
896 arg_userns_mode = USER_NAMESPACE_FIXED;
897 arg_uid_shift = UID_INVALID;
898 arg_uid_range = UINT32_C(0x10000);
899 } else if (streq(optarg, "pick")) {
900 /* pick: User namespacing on, UID range is picked randomly */
901 arg_userns_mode = USER_NAMESPACE_PICK;
902 arg_uid_shift = UID_INVALID;
903 arg_uid_range = UINT32_C(0x10000);
904 } else {
6c2058b3 905 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
906 const char *range, *shift;
907
0de7acce
LP
908 /* anything else: User namespacing on, UID range is explicitly configured */
909
6dac160c
LP
910 range = strchr(optarg, ':');
911 if (range) {
6c2058b3
ZJS
912 buffer = strndup(optarg, range - optarg);
913 if (!buffer)
914 return log_oom();
915 shift = buffer;
6dac160c
LP
916
917 range++;
bfd292ec
ZJS
918 r = safe_atou32(range, &arg_uid_range);
919 if (r < 0)
be715731 920 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
921 } else
922 shift = optarg;
923
be715731
ZJS
924 r = parse_uid(shift, &arg_uid_shift);
925 if (r < 0)
926 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
927
928 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
929 }
930
be715731
ZJS
931 if (arg_uid_range <= 0) {
932 log_error("UID range cannot be 0.");
933 return -EINVAL;
934 }
935
0de7acce 936 arg_settings_mask |= SETTING_USERNS;
6dac160c 937 break;
ae209204 938 }
6dac160c 939
0de7acce 940 case 'U':
ccabee0d
LP
941 if (userns_supported()) {
942 arg_userns_mode = USER_NAMESPACE_PICK;
943 arg_uid_shift = UID_INVALID;
944 arg_uid_range = UINT32_C(0x10000);
945
946 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
947 }
948
7336138e
LP
949 break;
950
0de7acce 951 case ARG_PRIVATE_USERS_CHOWN:
19aac838 952 arg_userns_chown = true;
0de7acce
LP
953
954 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
955 break;
956
c6c8f6e2
LP
957 case ARG_KILL_SIGNAL:
958 arg_kill_signal = signal_from_string_try_harder(optarg);
959 if (arg_kill_signal < 0) {
960 log_error("Cannot parse signal: %s", optarg);
961 return -EINVAL;
962 }
963
f757855e
LP
964 arg_settings_mask |= SETTING_KILL_SIGNAL;
965 break;
966
967 case ARG_SETTINGS:
968
969 /* no → do not read files
970 * yes → read files, do not override cmdline, trust only subset
971 * override → read files, override cmdline, trust only subset
972 * trusted → read files, do not override cmdline, trust all
973 */
974
975 r = parse_boolean(optarg);
976 if (r < 0) {
977 if (streq(optarg, "trusted")) {
978 mask_all_settings = false;
979 mask_no_settings = false;
980 arg_settings_trusted = true;
981
982 } else if (streq(optarg, "override")) {
983 mask_all_settings = false;
984 mask_no_settings = true;
985 arg_settings_trusted = -1;
986 } else
987 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
988 } else if (r > 0) {
989 /* yes */
990 mask_all_settings = false;
991 mask_no_settings = false;
992 arg_settings_trusted = -1;
993 } else {
994 /* no */
995 mask_all_settings = true;
996 mask_no_settings = false;
997 arg_settings_trusted = false;
998 }
999
c6c8f6e2
LP
1000 break;
1001
5f932eb9
LP
1002 case ARG_CHDIR:
1003 if (!path_is_absolute(optarg)) {
1004 log_error("Working directory %s is not an absolute path.", optarg);
1005 return -EINVAL;
1006 }
1007
1008 r = free_and_strdup(&arg_chdir, optarg);
1009 if (r < 0)
1010 return log_oom();
1011
1012 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1013 break;
1014
9c1e04d0
AP
1015 case ARG_NOTIFY_READY:
1016 r = parse_boolean(optarg);
1017 if (r < 0) {
1018 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1019 return -EINVAL;
1020 }
1021 arg_notify_ready = r;
1022 arg_settings_mask |= SETTING_NOTIFY_READY;
1023 break;
1024
4623e8e6
LP
1025 case ARG_ROOT_HASH: {
1026 void *k;
1027 size_t l;
1028
1029 r = unhexmem(optarg, strlen(optarg), &k, &l);
1030 if (r < 0)
1031 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1032 if (l < sizeof(sd_id128_t)) {
1033 log_error("Root hash must be at least 128bit long: %s", optarg);
1034 free(k);
1035 return -EINVAL;
1036 }
1037
1038 free(arg_root_hash);
1039 arg_root_hash = k;
1040 arg_root_hash_size = l;
1041 break;
1042 }
1043
88213476
LP
1044 case '?':
1045 return -EINVAL;
1046
1047 default:
eb9da376 1048 assert_not_reached("Unhandled option");
88213476 1049 }
88213476 1050
0c582db0
LB
1051 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1052 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1053 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1054 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1055
4f086aab
SU
1056 if (arg_userns_mode != USER_NAMESPACE_NO)
1057 arg_mount_settings |= MOUNT_USE_USERNS;
1058
1059 if (arg_private_network)
1060 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1061
1062 parse_mount_settings_env();
1063
48a8d337
LB
1064 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1065 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1066 arg_register = false;
0c582db0
LB
1067 if (arg_start_mode != START_PID1) {
1068 log_error("--boot cannot be used without namespacing.");
1069 return -EINVAL;
1070 }
1071 }
eb91eb18 1072
0de7acce 1073 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1074 arg_userns_chown = true;
1075
89f7c846
LP
1076 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1077 log_error("--keep-unit may not be used when invoked from a user session.");
1078 return -EINVAL;
1079 }
1080
1b9e5b12
LP
1081 if (arg_directory && arg_image) {
1082 log_error("--directory= and --image= may not be combined.");
1083 return -EINVAL;
1084 }
1085
ec16945e
LP
1086 if (arg_template && arg_image) {
1087 log_error("--template= and --image= may not be combined.");
1088 return -EINVAL;
1089 }
1090
8cd328d8
LP
1091 if (arg_ephemeral && arg_template && !arg_directory) {
1092 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1093 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1094 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1095 * --directory=". */
1096
1097 arg_directory = arg_template;
1098 arg_template = NULL;
1099 }
1100
ec16945e
LP
1101 if (arg_template && !(arg_directory || arg_machine)) {
1102 log_error("--template= needs --directory= or --machine=.");
1103 return -EINVAL;
1104 }
1105
1106 if (arg_ephemeral && arg_template) {
1107 log_error("--ephemeral and --template= may not be combined.");
1108 return -EINVAL;
1109 }
1110
df9a75e4
LP
1111 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1112 log_error("--ephemeral and --link-journal= may not be combined.");
1113 return -EINVAL;
1114 }
1115
ccabee0d 1116 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1117 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1118 return -EOPNOTSUPP;
1119 }
1120
1121 if (arg_userns_chown && arg_read_only) {
1122 log_error("--read-only and --private-users-chown may not be combined.");
1123 return -EINVAL;
1124 }
f757855e 1125
22b28dfd
LP
1126 if (arg_network_bridge && arg_network_zone) {
1127 log_error("--network-bridge= and --network-zone= may not be combined.");
1128 return -EINVAL;
1129 }
1130
f757855e
LP
1131 if (argc > optind) {
1132 arg_parameters = strv_copy(argv + optind);
1133 if (!arg_parameters)
1134 return log_oom();
1135
7732f92b 1136 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1137 }
1138
1139 /* Load all settings from .nspawn files */
1140 if (mask_no_settings)
1141 arg_settings_mask = 0;
1142
1143 /* Don't load any settings from .nspawn files */
1144 if (mask_all_settings)
1145 arg_settings_mask = _SETTINGS_MASK_ALL;
1146
520e0d54 1147 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1148
6aadfa4c
ILG
1149 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1150 if (e)
1151 arg_container_service_name = e;
1152
5a8ff0e6
CB
1153 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1154 if (r < 0)
1155 arg_use_cgns = cg_ns_supported();
1156 else
1157 arg_use_cgns = r;
1158
86c0dd4a
LP
1159 r = custom_mount_check_all();
1160 if (r < 0)
1161 return r;
1162
f757855e
LP
1163 return 1;
1164}
1165
1166static int verify_arguments(void) {
4f086aab
SU
1167 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1168 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1169 return -EINVAL;
1170 }
1171
1172 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1173 log_error("Cannot combine --private-users with read-write mounts.");
1174 return -EINVAL;
1175 }
f757855e
LP
1176
1177 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1178 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1179 return -EINVAL;
1180 }
1181
6d0b55c2
LP
1182 if (arg_expose_ports && !arg_private_network) {
1183 log_error("Cannot use --port= without private networking.");
1184 return -EINVAL;
1185 }
1186
1c1ea217
EV
1187#ifndef HAVE_LIBIPTC
1188 if (arg_expose_ports) {
1189 log_error("--port= is not supported, compiled without libiptc support.");
1190 return -EOPNOTSUPP;
1191 }
1192#endif
1193
7732f92b 1194 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1195 arg_kill_signal = SIGRTMIN+3;
1196
f757855e 1197 return 0;
88213476
LP
1198}
1199
03cfe0d5
LP
1200static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1201 assert(p);
1202
0de7acce 1203 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1204 return 0;
1205
1206 if (uid == UID_INVALID && gid == GID_INVALID)
1207 return 0;
1208
1209 if (uid != UID_INVALID) {
1210 uid += arg_uid_shift;
1211
1212 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1213 return -EOVERFLOW;
1214 }
1215
1216 if (gid != GID_INVALID) {
1217 gid += (gid_t) arg_uid_shift;
1218
1219 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1220 return -EOVERFLOW;
1221 }
1222
1223 if (lchown(p, uid, gid) < 0)
1224 return -errno;
b12afc8c
LP
1225
1226 return 0;
1227}
1228
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it was newly created,
 * hands it to the (shifted) uid/gid via userns_lchown().
 *
 * An already existing directory is not an error and is left untouched (no chown is attempted).
 * Returns 0 on success or if the path exists, -errno if mkdir() fails for another reason, or
 * whatever userns_lchown() returns. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1241
e58a1277 1242static int setup_timezone(const char *dest) {
03cfe0d5
LP
1243 _cleanup_free_ char *p = NULL, *q = NULL;
1244 const char *where, *check, *what;
d4036145
LP
1245 char *z, *y;
1246 int r;
f8440af5 1247
e58a1277
LP
1248 assert(dest);
1249
1250 /* Fix the timezone, if possible */
d4036145
LP
1251 r = readlink_malloc("/etc/localtime", &p);
1252 if (r < 0) {
0b493a02
MP
1253 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1254 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1255 * with a symbolic link to a time zone data file.
0b493a02
MP
1256 *
1257 * Example:
21dc0227 1258 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1259 */
d4036145
LP
1260 return 0;
1261 }
1262
1263 z = path_startswith(p, "../usr/share/zoneinfo/");
1264 if (!z)
1265 z = path_startswith(p, "/usr/share/zoneinfo/");
1266 if (!z) {
1267 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1268 return 0;
1269 }
1270
03cfe0d5 1271 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1272 r = readlink_malloc(where, &q);
1273 if (r >= 0) {
1274 y = path_startswith(q, "../usr/share/zoneinfo/");
1275 if (!y)
1276 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1277
d4036145
LP
1278 /* Already pointing to the right place? Then do nothing .. */
1279 if (y && streq(y, z))
1280 return 0;
1281 }
1282
03cfe0d5 1283 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1284 check = prefix_roota(dest, check);
03cfe0d5 1285 if (laccess(check, F_OK) < 0) {
d4036145
LP
1286 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1287 return 0;
1288 }
68fb0892 1289
8ccf7e9e
LP
1290 if (unlink(where) < 0 && errno != ENOENT) {
1291 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1292 errno,
1293 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1294 return 0;
1295 }
4d9f07b4 1296
03cfe0d5 1297 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1298 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1299 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1300 errno,
1301 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1302 return 0;
1303 }
e58a1277 1304
03cfe0d5
LP
1305 r = userns_lchown(where, 0, 0);
1306 if (r < 0)
1307 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1308
e58a1277 1309 return 0;
88213476
LP
1310}
1311
2547bb41 1312static int setup_resolv_conf(const char *dest) {
87447ae4
LP
1313 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1314 const char *where;
1315 int r, found;
2547bb41
LP
1316
1317 assert(dest);
1318
1319 if (arg_private_network)
1320 return 0;
1321
87447ae4
LP
1322 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1323 if (r < 0) {
1324 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1325 return 0;
1326 }
1327
1328 where = strjoina(etc, "/resolv.conf");
1329 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1330 if (found < 0) {
1331 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1332 return 0;
1333 }
79d80fc1 1334
7debb05d 1335 if (access("/run/systemd/resolve/resolv.conf", F_OK) >= 0 &&
87447ae4
LP
1336 access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
1337
3539724c
LP
1338 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1339 * container, so that the container can use the host's resolver. Given that network namespacing is
1340 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1341 * advantage that the container will be able to follow the host's DNS server configuration changes
1342 * transparently. */
1343
87447ae4
LP
1344 if (found == 0) /* missing? */
1345 (void) touch(resolved);
5367354d 1346
87447ae4 1347 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
60e76d48 1348 if (r >= 0)
87447ae4 1349 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1350 }
1351
1352 /* If that didn't work, let's copy the file */
f2068bcc 1353 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1354 if (r < 0) {
3539724c
LP
1355 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1356 * resolved or something similar runs inside and the symlink points there.
68a313c5 1357 *
3539724c 1358 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1359 */
87447ae4 1360 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1361 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1362 return 0;
1363 }
2547bb41 1364
03cfe0d5
LP
1365 r = userns_lchown(where, 0, 0);
1366 if (r < 0)
3539724c 1367 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1368
2547bb41
LP
1369 return 0;
1370}
1371
04bc4a3f 1372static int setup_boot_id(const char *dest) {
3bbaff3e 1373 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1374 const char *from, *to;
04bc4a3f
LP
1375 int r;
1376
04bc4a3f
LP
1377 /* Generate a new randomized boot ID, so that each boot-up of
1378 * the container gets a new one */
1379
03cfe0d5
LP
1380 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1381 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1382
1383 r = sd_id128_randomize(&rnd);
f647962d
MS
1384 if (r < 0)
1385 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1386
15b1248a 1387 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1388 if (r < 0)
1389 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1390
60e76d48
ZJS
1391 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1392 if (r >= 0)
1393 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1394 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1395
3bbaff3e 1396 (void) unlink(from);
04bc4a3f
LP
1397 return r;
1398}
1399
e58a1277 1400static int copy_devnodes(const char *dest) {
88213476
LP
1401
1402 static const char devnodes[] =
1403 "null\0"
1404 "zero\0"
1405 "full\0"
1406 "random\0"
1407 "urandom\0"
85614d66
TG
1408 "tty\0"
1409 "net/tun\0";
88213476
LP
1410
1411 const char *d;
e58a1277 1412 int r = 0;
7fd1b19b 1413 _cleanup_umask_ mode_t u;
a258bf26
LP
1414
1415 assert(dest);
124640f1
LP
1416
1417 u = umask(0000);
88213476 1418
03cfe0d5
LP
1419 /* Create /dev/net, so that we can create /dev/net/tun in it */
1420 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1421 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1422
88213476 1423 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1424 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1425 struct stat st;
88213476 1426
7f112f50 1427 from = strappend("/dev/", d);
03cfe0d5 1428 to = prefix_root(dest, from);
88213476
LP
1429
1430 if (stat(from, &st) < 0) {
1431
4a62c710
MS
1432 if (errno != ENOENT)
1433 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1434
a258bf26 1435 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1436
03cfe0d5 1437 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1438 return -EIO;
a258bf26 1439
85614d66 1440 } else {
81f5049b 1441 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1442 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1443 if (errno == EEXIST)
8dbf71ec 1444 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1445 if (errno != EPERM)
1446 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1447
1448 /* Some systems abusively restrict mknod but
1449 * allow bind mounts. */
1450 r = touch(to);
1451 if (r < 0)
1452 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1453 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1454 if (r < 0)
1455 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1456 }
6278cf60 1457
03cfe0d5
LP
1458 r = userns_lchown(to, 0, 0);
1459 if (r < 0)
1460 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1461 }
88213476
LP
1462 }
1463
e58a1277
LP
1464 return r;
1465}
88213476 1466
03cfe0d5
LP
1467static int setup_pts(const char *dest) {
1468 _cleanup_free_ char *options = NULL;
1469 const char *p;
709f6e46 1470 int r;
03cfe0d5
LP
1471
1472#ifdef HAVE_SELINUX
1473 if (arg_selinux_apifs_context)
1474 (void) asprintf(&options,
3dce8915 1475 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1476 arg_uid_shift + TTY_GID,
1477 arg_selinux_apifs_context);
1478 else
1479#endif
1480 (void) asprintf(&options,
3dce8915 1481 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1482 arg_uid_shift + TTY_GID);
f2d88580 1483
03cfe0d5 1484 if (!options)
f2d88580
LP
1485 return log_oom();
1486
03cfe0d5 1487 /* Mount /dev/pts itself */
cc9fce65 1488 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1489 if (mkdir(p, 0755) < 0)
1490 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1491 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1492 if (r < 0)
1493 return r;
709f6e46
MS
1494 r = userns_lchown(p, 0, 0);
1495 if (r < 0)
1496 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1497
1498 /* Create /dev/ptmx symlink */
1499 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1500 if (symlink("pts/ptmx", p) < 0)
1501 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1502 r = userns_lchown(p, 0, 0);
1503 if (r < 0)
1504 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1505
03cfe0d5
LP
1506 /* And fix /dev/pts/ptmx ownership */
1507 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1508 r = userns_lchown(p, 0, 0);
1509 if (r < 0)
1510 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1511
f2d88580
LP
1512 return 0;
1513}
1514
e58a1277 1515static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1516 _cleanup_umask_ mode_t u;
1517 const char *to;
e58a1277 1518 int r;
e58a1277
LP
1519
1520 assert(dest);
1521 assert(console);
1522
1523 u = umask(0000);
1524
03cfe0d5 1525 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1526 if (r < 0)
1527 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1528
a258bf26
LP
1529 /* We need to bind mount the right tty to /dev/console since
1530 * ptys can only exist on pts file systems. To have something
81f5049b 1531 * to bind mount things on we create a empty regular file. */
a258bf26 1532
03cfe0d5 1533 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1534 r = touch(to);
1535 if (r < 0)
1536 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1537
60e76d48 1538 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1539}
1540
1541static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1542 const char *from, *to;
7fd1b19b 1543 _cleanup_umask_ mode_t u;
d9603714 1544 int fd, r;
e58a1277 1545
e58a1277 1546 assert(kmsg_socket >= 0);
a258bf26 1547
e58a1277 1548 u = umask(0000);
a258bf26 1549
03cfe0d5 1550 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1551 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1552 * on the reading side behave very similar to /proc/kmsg,
1553 * their writing side behaves differently from /dev/kmsg in
1554 * that writing blocks when nothing is reading. In order to
1555 * avoid any problems with containers deadlocking due to this
1556 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1557 from = prefix_roota(dest, "/run/kmsg");
1558 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1559
4a62c710 1560 if (mkfifo(from, 0600) < 0)
03cfe0d5 1561 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1562 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1563 if (r < 0)
1564 return r;
e58a1277
LP
1565
1566 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1567 if (fd < 0)
1568 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1569
e58a1277
LP
1570 /* Store away the fd in the socket, so that it stays open as
1571 * long as we run the child */
3ee897d6 1572 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1573 safe_close(fd);
e58a1277 1574
d9603714
DH
1575 if (r < 0)
1576 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1577
03cfe0d5
LP
1578 /* And now make the FIFO unavailable as /run/kmsg... */
1579 (void) unlink(from);
1580
25ea79fe 1581 return 0;
88213476
LP
1582}
1583
1c4baffc 1584static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1585 union in_addr_union *exposed = userdata;
1586
1587 assert(rtnl);
1588 assert(m);
1589 assert(exposed);
1590
7a8f6325 1591 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1592 return 0;
1593}
1594
3a74cea5 1595static int setup_hostname(void) {
3a74cea5 1596
0c582db0 1597 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1598 return 0;
1599
605f81a8 1600 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1601 return -errno;
3a74cea5 1602
7027ff61 1603 return 0;
3a74cea5
LP
1604}
1605
57fb9fb5 1606static int setup_journal(const char *directory) {
e01ff70a 1607 sd_id128_t this_id;
0f5e1382 1608 _cleanup_free_ char *d = NULL;
e01ff70a 1609 const char *p, *q;
8054d749 1610 bool try;
e01ff70a 1611 char id[33];
57fb9fb5
LP
1612 int r;
1613
df9a75e4
LP
1614 /* Don't link journals in ephemeral mode */
1615 if (arg_ephemeral)
1616 return 0;
1617
8054d749
LP
1618 if (arg_link_journal == LINK_NO)
1619 return 0;
1620
1621 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1622
4d680aee 1623 r = sd_id128_get_machine(&this_id);
f647962d
MS
1624 if (r < 0)
1625 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1626
e01ff70a 1627 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1628 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1629 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1630 if (try)
4d680aee 1631 return 0;
df9a75e4 1632 return -EEXIST;
4d680aee
ZJS
1633 }
1634
03cfe0d5
LP
1635 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1636 if (r < 0)
1637 return log_error_errno(r, "Failed to create /var: %m");
1638
1639 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1640 if (r < 0)
1641 return log_error_errno(r, "Failed to create /var/log: %m");
1642
1643 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1644 if (r < 0)
1645 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1646
e01ff70a
MS
1647 (void) sd_id128_to_string(arg_uuid, id);
1648
03cfe0d5
LP
1649 p = strjoina("/var/log/journal/", id);
1650 q = prefix_roota(directory, p);
27407a01 1651
e1873695 1652 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1653 if (try)
1654 return 0;
27407a01 1655
8054d749
LP
1656 log_error("%s: already a mount point, refusing to use for journal", p);
1657 return -EEXIST;
57fb9fb5
LP
1658 }
1659
e1873695 1660 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1661 if (try)
1662 return 0;
57fb9fb5 1663
8054d749
LP
1664 log_error("%s: already a mount point, refusing to use for journal", q);
1665 return -EEXIST;
57fb9fb5
LP
1666 }
1667
1668 r = readlink_and_make_absolute(p, &d);
1669 if (r >= 0) {
1670 if ((arg_link_journal == LINK_GUEST ||
1671 arg_link_journal == LINK_AUTO) &&
1672 path_equal(d, q)) {
1673
03cfe0d5 1674 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1675 if (r < 0)
709f6e46 1676 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1677 return 0;
57fb9fb5
LP
1678 }
1679
4a62c710
MS
1680 if (unlink(p) < 0)
1681 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1682 } else if (r == -EINVAL) {
1683
1684 if (arg_link_journal == LINK_GUEST &&
1685 rmdir(p) < 0) {
1686
27407a01
ZJS
1687 if (errno == ENOTDIR) {
1688 log_error("%s already exists and is neither a symlink nor a directory", p);
1689 return r;
4314d33f
MS
1690 } else
1691 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1692 }
4314d33f
MS
1693 } else if (r != -ENOENT)
1694 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1695
1696 if (arg_link_journal == LINK_GUEST) {
1697
1698 if (symlink(q, p) < 0) {
8054d749 1699 if (try) {
56f64d95 1700 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1701 return 0;
4314d33f
MS
1702 } else
1703 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1704 }
1705
03cfe0d5 1706 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1707 if (r < 0)
709f6e46 1708 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1709 return 0;
57fb9fb5
LP
1710 }
1711
1712 if (arg_link_journal == LINK_HOST) {
ccddd104 1713 /* don't create parents here — if the host doesn't have
574edc90 1714 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1715
1716 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1717 if (try) {
56f64d95 1718 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1719 return 0;
4314d33f
MS
1720 } else
1721 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1722 }
1723
27407a01
ZJS
1724 } else if (access(p, F_OK) < 0)
1725 return 0;
57fb9fb5 1726
cdb2b9d0
LP
1727 if (dir_is_empty(q) == 0)
1728 log_warning("%s is not empty, proceeding anyway.", q);
1729
03cfe0d5 1730 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1733
60e76d48
ZJS
1734 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1735 if (r < 0)
4a62c710 1736 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1737
27407a01 1738 return 0;
57fb9fb5
LP
1739}
1740
88213476 1741static int drop_capabilities(void) {
520e0d54 1742 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1743}
1744
db999e0f
LP
1745static int reset_audit_loginuid(void) {
1746 _cleanup_free_ char *p = NULL;
1747 int r;
1748
0c582db0 1749 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1750 return 0;
1751
1752 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1753 if (r == -ENOENT)
db999e0f 1754 return 0;
f647962d
MS
1755 if (r < 0)
1756 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1757
1758 /* Already reset? */
1759 if (streq(p, "4294967295"))
1760 return 0;
1761
ad118bda 1762 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1763 if (r < 0) {
10a87006
LP
1764 log_error_errno(r,
1765 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1766 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1767 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1768 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1769 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1770
db999e0f 1771 sleep(5);
77b6e194 1772 }
db999e0f
LP
1773
1774 return 0;
77b6e194
LP
1775}
1776
24fb1112 1777
785890ac
LP
1778static int setup_propagate(const char *root) {
1779 const char *p, *q;
709f6e46 1780 int r;
785890ac
LP
1781
1782 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1783 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1784 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1785 (void) mkdir_p(p, 0600);
1786
709f6e46
MS
1787 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1788 if (r < 0)
1789 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1790
709f6e46
MS
1791 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1792 if (r < 0)
1793 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1794
709f6e46
MS
1795 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1796 if (r < 0)
1797 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1798
03cfe0d5 1799 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1800 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1801 if (r < 0)
1802 return r;
785890ac 1803
60e76d48
ZJS
1804 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1805 if (r < 0)
1806 return r;
785890ac 1807
19caffac
AC
1808 /* machined will MS_MOVE into that directory, and that's only
1809 * supported for non-shared mounts. */
60e76d48 1810 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1811}
1812
317feb4d 1813static int setup_machine_id(const char *directory) {
691675ba
LP
1814 const char *etc_machine_id;
1815 sd_id128_t id;
3bbaff3e 1816 int r;
e01ff70a 1817
317feb4d
LP
1818 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1819 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1820 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1821 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1822 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1823 * container behaves nicely). */
1824
e01ff70a
MS
1825 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1826
691675ba 1827 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
1828 if (r < 0) {
1829 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1830 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 1831
317feb4d
LP
1832 if (sd_id128_is_null(arg_uuid)) {
1833 r = sd_id128_randomize(&arg_uuid);
1834 if (r < 0)
1835 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1836 }
1837 } else {
1838 if (sd_id128_is_null(id)) {
1839 log_error("Machine ID in container image is zero, refusing.");
1840 return -EINVAL;
1841 }
e01ff70a 1842
317feb4d
LP
1843 arg_uuid = id;
1844 }
691675ba 1845
e01ff70a
MS
1846 return 0;
1847}
1848
7336138e
LP
1849static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1850 int r;
1851
1852 assert(directory);
1853
0de7acce 1854 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
1855 return 0;
1856
1857 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1858 if (r == -EOPNOTSUPP)
1859 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1860 if (r == -EBADE)
1861 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1862 if (r < 0)
1863 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1864 if (r == 0)
1865 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1866 else
1867 log_debug("Patched directory tree to match UID/GID range.");
1868
1869 return r;
1870}
1871
113cea80 1872/*
6d416b9c
LS
1873 * Return values:
1874 * < 0 : wait_for_terminate() failed to get the state of the
1875 * container, the container was terminated by a signal, or
1876 * failed for an unknown reason. No change is made to the
1877 * container argument.
1878 * > 0 : The program executed in the container terminated with an
1879 * error. The exit code of the program executed in the
919699ec
LP
1880 * container is returned. The container argument has been set
1881 * to CONTAINER_TERMINATED.
6d416b9c
LS
1882 * 0 : The container is being rebooted, has been shut down or exited
1883 * successfully. The container argument has been set to either
1884 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 1885 *
6d416b9c
LS
1886 * That is, success is indicated by a return value of zero, and an
1887 * error is indicated by a non-zero value.
113cea80
DH
1888 */
1889static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 1890 siginfo_t status;
919699ec 1891 int r;
113cea80
DH
1892
1893 r = wait_for_terminate(pid, &status);
f647962d
MS
1894 if (r < 0)
1895 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
1896
1897 switch (status.si_code) {
fddbb89c 1898
113cea80 1899 case CLD_EXITED:
b5a2179b 1900 if (status.si_status == 0)
919699ec 1901 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 1902 else
919699ec 1903 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 1904
919699ec
LP
1905 *container = CONTAINER_TERMINATED;
1906 return status.si_status;
113cea80
DH
1907
1908 case CLD_KILLED:
1909 if (status.si_status == SIGINT) {
919699ec 1910 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 1911 *container = CONTAINER_TERMINATED;
919699ec
LP
1912 return 0;
1913
113cea80 1914 } else if (status.si_status == SIGHUP) {
919699ec 1915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 1916 *container = CONTAINER_REBOOTED;
919699ec 1917 return 0;
113cea80 1918 }
919699ec 1919
113cea80
DH
1920 /* CLD_KILLED fallthrough */
1921
1922 case CLD_DUMPED:
fddbb89c 1923 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 1924 return -EIO;
113cea80
DH
1925
1926 default:
fddbb89c 1927 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 1928 return -EIO;
113cea80 1929 }
113cea80
DH
1930}
1931
023fb90b
LP
1932static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1933 pid_t pid;
1934
4a0b58c4 1935 pid = PTR_TO_PID(userdata);
023fb90b 1936 if (pid > 0) {
c6c8f6e2 1937 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
1938 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1939 sd_event_source_set_userdata(s, NULL);
1940 return 0;
1941 }
1942 }
1943
1944 sd_event_exit(sd_event_source_get_event(s), 0);
1945 return 0;
1946}
1947
6916b164
AU
1948static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
1949 for (;;) {
1950 siginfo_t si = {};
1951 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
1952 return log_error_errno(errno, "Failed to waitid(): %m");
1953 if (si.si_pid == 0) /* No pending children. */
1954 break;
1955 if (si.si_pid == PTR_TO_PID(userdata)) {
1956 /* The main process we care for has exited. Return from
1957 * signal handler but leave the zombie. */
1958 sd_event_exit(sd_event_source_get_event(s), 0);
1959 break;
1960 }
1961 /* Reap all other children. */
1962 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
1963 }
1964
1965 return 0;
1966}
1967
ec16945e 1968static int determine_names(void) {
1b9cebf6 1969 int r;
ec16945e 1970
c1521918
LP
1971 if (arg_template && !arg_directory && arg_machine) {
1972
1973 /* If --template= was specified then we should not
1974 * search for a machine, but instead create a new one
1975 * in /var/lib/machine. */
1976
605405c6 1977 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
1978 if (!arg_directory)
1979 return log_oom();
1980 }
1981
ec16945e 1982 if (!arg_image && !arg_directory) {
1b9cebf6
LP
1983 if (arg_machine) {
1984 _cleanup_(image_unrefp) Image *i = NULL;
1985
1986 r = image_find(arg_machine, &i);
1987 if (r < 0)
1988 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 1989 if (r == 0) {
1b9cebf6
LP
1990 log_error("No image for machine '%s': %m", arg_machine);
1991 return -ENOENT;
1992 }
1993
aceac2f0 1994 if (i->type == IMAGE_RAW)
0f03c2a4 1995 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 1996 else
0f03c2a4 1997 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 1998 if (r < 0)
0f3be6ca 1999 return log_oom();
1b9cebf6 2000
aee327b8
LP
2001 if (!arg_ephemeral)
2002 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2003 } else
ec16945e
LP
2004 arg_directory = get_current_dir_name();
2005
0f3be6ca 2006 if (!arg_directory && !arg_image) {
1b9cebf6 2007 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2008 return -EINVAL;
2009 }
2010 }
2011
2012 if (!arg_machine) {
4827ab48 2013
b9ba4dab
LP
2014 if (arg_directory && path_equal(arg_directory, "/"))
2015 arg_machine = gethostname_malloc();
4827ab48
LP
2016 else {
2017 if (arg_image) {
2018 char *e;
2019
2020 arg_machine = strdup(basename(arg_image));
2021
2022 /* Truncate suffix if there is one */
2023 e = endswith(arg_machine, ".raw");
2024 if (e)
2025 *e = 0;
2026 } else
2027 arg_machine = strdup(basename(arg_directory));
2028 }
ec16945e
LP
2029 if (!arg_machine)
2030 return log_oom();
2031
ae691c1d 2032 hostname_cleanup(arg_machine);
ec16945e
LP
2033 if (!machine_name_is_valid(arg_machine)) {
2034 log_error("Failed to determine machine name automatically, please use -M.");
2035 return -EINVAL;
2036 }
b9ba4dab
LP
2037
2038 if (arg_ephemeral) {
2039 char *b;
2040
2041 /* Add a random suffix when this is an
2042 * ephemeral machine, so that we can run many
2043 * instances at once without manually having
2044 * to specify -M each time. */
2045
2046 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2047 return log_oom();
2048
2049 free(arg_machine);
2050 arg_machine = b;
2051 }
ec16945e
LP
2052 }
2053
2054 return 0;
2055}
2056
/* Resolves the symlinks in the path stored in *p and replaces *p with the resolved path.
 * A NULL *p is left alone. "flags" is passed through to chase_symlinks().
 * Returns 0 on success, negative errno-style error if resolving fails (in which case *p is
 * left unmodified). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                /* Swap in the resolved path, dropping the old string. */
                free(*p);
                *p = resolved;
        }

        return 0;
}
2075
03cfe0d5 2076static int determine_uid_shift(const char *directory) {
6dac160c
LP
2077 int r;
2078
0de7acce 2079 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2080 arg_uid_shift = 0;
6dac160c 2081 return 0;
03cfe0d5 2082 }
6dac160c
LP
2083
2084 if (arg_uid_shift == UID_INVALID) {
2085 struct stat st;
2086
03cfe0d5 2087 r = stat(directory, &st);
6dac160c 2088 if (r < 0)
03cfe0d5 2089 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2090
2091 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2092
2093 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2094 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2095 return -EINVAL;
2096 }
2097
2098 arg_uid_range = UINT32_C(0x10000);
2099 }
2100
2101 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2102 log_error("UID base too high for UID range.");
2103 return -EINVAL;
2104 }
2105
6dac160c
LP
2106 return 0;
2107}
2108
03cfe0d5
LP
2109static int inner_child(
2110 Barrier *barrier,
2111 const char *directory,
2112 bool secondary,
2113 int kmsg_socket,
2114 int rtnl_socket,
f757855e 2115 FDSet *fds) {
69c79d3c 2116
03cfe0d5 2117 _cleanup_free_ char *home = NULL;
e01ff70a 2118 char as_uuid[37];
6aadfa4c 2119 unsigned n_env = 1;
03cfe0d5
LP
2120 const char *envp[] = {
2121 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2122 NULL, /* container */
03cfe0d5
LP
2123 NULL, /* TERM */
2124 NULL, /* HOME */
2125 NULL, /* USER */
2126 NULL, /* LOGNAME */
2127 NULL, /* container_uuid */
2128 NULL, /* LISTEN_FDS */
2129 NULL, /* LISTEN_PID */
9c1e04d0 2130 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2131 NULL
2132 };
88213476 2133
2371271c 2134 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2135 int r;
88213476 2136
03cfe0d5
LP
2137 assert(barrier);
2138 assert(directory);
2139 assert(kmsg_socket >= 0);
88213476 2140
efdb0237
LP
2141 cg_unified_flush();
2142
0de7acce 2143 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2144 /* Tell the parent, that it now can write the UID map. */
2145 (void) barrier_place(barrier); /* #1 */
7027ff61 2146
03cfe0d5
LP
2147 /* Wait until the parent wrote the UID map */
2148 if (!barrier_place_and_sync(barrier)) { /* #2 */
2149 log_error("Parent died too early");
2150 return -ESRCH;
2151 }
88213476
LP
2152 }
2153
6d66bd3b
EV
2154 r = reset_uid_gid();
2155 if (r < 0)
2156 return log_error_errno(r, "Couldn't become new root: %m");
2157
0de7acce 2158 r = mount_all(NULL,
4f086aab 2159 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2160 arg_uid_shift,
2161 arg_uid_range,
2162 arg_selinux_apifs_context);
2163
03cfe0d5
LP
2164 if (r < 0)
2165 return r;
2166
4f086aab 2167 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2168 if (r < 0)
2169 return r;
2170
03cfe0d5
LP
2171 /* Wait until we are cgroup-ified, so that we
2172 * can mount the right cgroup path writable */
2173 if (!barrier_place_and_sync(barrier)) { /* #3 */
2174 log_error("Parent died too early");
2175 return -ESRCH;
88213476
LP
2176 }
2177
5a8ff0e6 2178 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2179 r = unshare(CLONE_NEWCGROUP);
2180 if (r < 0)
2181 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2182 r = mount_cgroups(
2183 "",
2184 arg_unified_cgroup_hierarchy,
2185 arg_userns_mode != USER_NAMESPACE_NO,
2186 arg_uid_shift,
2187 arg_uid_range,
5a8ff0e6 2188 arg_selinux_apifs_context,
ada54120 2189 true);
0996ef00
CB
2190 if (r < 0)
2191 return r;
2192 } else {
2193 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2194 if (r < 0)
2195 return r;
2196 }
ec16945e 2197
03cfe0d5
LP
2198 r = setup_boot_id(NULL);
2199 if (r < 0)
2200 return r;
ec16945e 2201
03cfe0d5
LP
2202 r = setup_kmsg(NULL, kmsg_socket);
2203 if (r < 0)
2204 return r;
2205 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2206
03cfe0d5 2207 umask(0022);
30535c16 2208
03cfe0d5
LP
2209 if (setsid() < 0)
2210 return log_error_errno(errno, "setsid() failed: %m");
2211
2212 if (arg_private_network)
2213 loopback_setup();
2214
7a8f6325
LP
2215 if (arg_expose_ports) {
2216 r = expose_port_send_rtnl(rtnl_socket);
2217 if (r < 0)
2218 return r;
2219 rtnl_socket = safe_close(rtnl_socket);
2220 }
03cfe0d5 2221
709f6e46
MS
2222 r = drop_capabilities();
2223 if (r < 0)
2224 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2225
2226 setup_hostname();
2227
050f7277 2228 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2229 if (personality(arg_personality) < 0)
2230 return log_error_errno(errno, "personality() failed: %m");
2231 } else if (secondary) {
2232 if (personality(PER_LINUX32) < 0)
2233 return log_error_errno(errno, "personality() failed: %m");
2234 }
2235
2236#ifdef HAVE_SELINUX
2237 if (arg_selinux_context)
2ed96880 2238 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2239 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2240#endif
2241
ee645080 2242 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2243 if (r < 0)
2244 return r;
2245
6aadfa4c
ILG
2246 /* LXC sets container=lxc, so follow the scheme here */
2247 envp[n_env++] = strjoina("container=", arg_container_service_name);
2248
03cfe0d5
LP
2249 envp[n_env] = strv_find_prefix(environ, "TERM=");
2250 if (envp[n_env])
313cefa1 2251 n_env++;
03cfe0d5
LP
2252
2253 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2254 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2255 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2256 return log_oom();
2257
3bbaff3e 2258 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2259
691675ba 2260 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2261 return log_oom();
03cfe0d5
LP
2262
2263 if (fdset_size(fds) > 0) {
2264 r = fdset_cloexec(fds, false);
2265 if (r < 0)
2266 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2267
2268 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2269 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2270 return log_oom();
2271 }
9c1e04d0
AP
2272 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2273 return log_oom();
03cfe0d5 2274
2371271c
TG
2275 env_use = strv_env_merge(2, envp, arg_setenv);
2276 if (!env_use)
2277 return log_oom();
03cfe0d5
LP
2278
2279 /* Let the parent know that we are ready and
2280 * wait until the parent is ready with the
2281 * setup, too... */
2282 if (!barrier_place_and_sync(barrier)) { /* #4 */
2283 log_error("Parent died too early");
2284 return -ESRCH;
2285 }
2286
5f932eb9
LP
2287 if (arg_chdir)
2288 if (chdir(arg_chdir) < 0)
2289 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2290
7732f92b 2291 if (arg_start_mode == START_PID2) {
75bf701f 2292 r = stub_pid1(arg_uuid);
7732f92b
LP
2293 if (r < 0)
2294 return r;
2295 }
2296
03cfe0d5
LP
2297 /* Now, explicitly close the log, so that we
2298 * then can close all remaining fds. Closing
2299 * the log explicitly first has the benefit
2300 * that the logging subsystem knows about it,
2301 * and is thus ready to be reopened should we
2302 * need it again. Note that the other fds
2303 * closed here are at least the locking and
2304 * barrier fds. */
2305 log_close();
2306 (void) fdset_close_others(fds);
2307
7732f92b 2308 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2309 char **a;
2310 size_t m;
2311
2312 /* Automatically search for the init system */
2313
75f32f04
ZJS
2314 m = strv_length(arg_parameters);
2315 a = newa(char*, m + 2);
2316 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2317 a[1 + m] = NULL;
03cfe0d5
LP
2318
2319 a[0] = (char*) "/usr/lib/systemd/systemd";
2320 execve(a[0], a, env_use);
2321
2322 a[0] = (char*) "/lib/systemd/systemd";
2323 execve(a[0], a, env_use);
2324
2325 a[0] = (char*) "/sbin/init";
2326 execve(a[0], a, env_use);
f757855e
LP
2327 } else if (!strv_isempty(arg_parameters))
2328 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2329 else {
5f932eb9 2330 if (!arg_chdir)
d929b0f9
ZJS
2331 /* If we cannot change the directory, we'll end up in /, that is expected. */
2332 (void) chdir(home ?: "/root");
5f932eb9 2333
03cfe0d5
LP
2334 execle("/bin/bash", "-bash", NULL, env_use);
2335 execle("/bin/sh", "-sh", NULL, env_use);
2336 }
2337
35607a8d 2338 r = -errno;
03cfe0d5 2339 (void) log_open();
35607a8d 2340 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2341}
2342
9c1e04d0
AP
2343static int setup_sd_notify_child(void) {
2344 static const int one = 1;
2345 int fd = -1;
2346 union sockaddr_union sa = {
2347 .sa.sa_family = AF_UNIX,
2348 };
2349 int r;
2350
2351 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2352 if (fd < 0)
2353 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2354
2355 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2356 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2357
2358 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2359 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2360 if (r < 0) {
2361 safe_close(fd);
2362 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2363 }
2364
adc7d9f0
EV
2365 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2366 if (r < 0) {
2367 safe_close(fd);
2368 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2369 }
2370
9c1e04d0
AP
2371 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2372 if (r < 0) {
2373 safe_close(fd);
2374 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2375 }
2376
2377 return fd;
2378}
2379
03cfe0d5
LP
2380static int outer_child(
2381 Barrier *barrier,
2382 const char *directory,
2383 const char *console,
2d845785 2384 DissectedImage *dissected_image,
03cfe0d5
LP
2385 bool interactive,
2386 bool secondary,
2387 int pid_socket,
e01ff70a 2388 int uuid_socket,
9c1e04d0 2389 int notify_socket,
03cfe0d5
LP
2390 int kmsg_socket,
2391 int rtnl_socket,
825d5287 2392 int uid_shift_socket,
f757855e 2393 FDSet *fds) {
03cfe0d5
LP
2394
2395 pid_t pid;
2396 ssize_t l;
2397 int r;
9c1e04d0 2398 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2399
2400 assert(barrier);
2401 assert(directory);
2402 assert(console);
2403 assert(pid_socket >= 0);
e01ff70a 2404 assert(uuid_socket >= 0);
9c1e04d0 2405 assert(notify_socket >= 0);
03cfe0d5
LP
2406 assert(kmsg_socket >= 0);
2407
efdb0237
LP
2408 cg_unified_flush();
2409
03cfe0d5
LP
2410 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2411 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2412
2413 if (interactive) {
2414 close_nointr(STDIN_FILENO);
2415 close_nointr(STDOUT_FILENO);
2416 close_nointr(STDERR_FILENO);
2417
2418 r = open_terminal(console, O_RDWR);
2419 if (r != STDIN_FILENO) {
2420 if (r >= 0) {
2421 safe_close(r);
2422 r = -EINVAL;
2423 }
2424
2425 return log_error_errno(r, "Failed to open console: %m");
2426 }
2427
2428 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2429 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2430 return log_error_errno(errno, "Failed to duplicate console: %m");
2431 }
2432
2433 r = reset_audit_loginuid();
2434 if (r < 0)
2435 return r;
2436
2437 /* Mark everything as slave, so that we still
2438 * receive mounts from the real root, but don't
2439 * propagate mounts to the real root. */
60e76d48
ZJS
2440 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2441 if (r < 0)
2442 return r;
03cfe0d5 2443
2d845785 2444 if (dissected_image) {
18b5886e 2445 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2446 if (r < 0)
2447 return r;
2448 }
03cfe0d5 2449
391567f4
LP
2450 r = determine_uid_shift(directory);
2451 if (r < 0)
2452 return r;
2453
0fd9563f
ZJS
2454 r = detect_unified_cgroup_hierarchy(directory);
2455 if (r < 0)
2456 return r;
2457
0de7acce 2458 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2459 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2460 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2461 if (l < 0)
2462 return log_error_errno(errno, "Failed to send UID shift: %m");
2463 if (l != sizeof(arg_uid_shift)) {
2464 log_error("Short write while sending UID shift.");
2465 return -EIO;
2466 }
0e7ac751 2467
0de7acce 2468 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2469 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2470 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2471 * not it will pick a different one, and send it back to us. */
2472
2473 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2474 if (l < 0)
2475 return log_error_errno(errno, "Failed to recv UID shift: %m");
2476 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2477 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2478 return -EIO;
2479 }
2480 }
2481
2482 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2483 }
2484
03cfe0d5 2485 /* Turn directory into bind mount */
60e76d48
ZJS
2486 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2487 if (r < 0)
2488 return r;
03cfe0d5 2489
0de7acce
LP
2490 r = setup_volatile(
2491 directory,
2492 arg_volatile_mode,
2493 arg_userns_mode != USER_NAMESPACE_NO,
2494 arg_uid_shift,
2495 arg_uid_range,
2496 arg_selinux_context);
03cfe0d5
LP
2497 if (r < 0)
2498 return r;
2499
0de7acce
LP
2500 r = setup_volatile_state(
2501 directory,
2502 arg_volatile_mode,
2503 arg_userns_mode != USER_NAMESPACE_NO,
2504 arg_uid_shift,
2505 arg_uid_range,
2506 arg_selinux_context);
03cfe0d5
LP
2507 if (r < 0)
2508 return r;
2509
4ad14eff
LP
2510 /* Mark everything as shared so our mounts get propagated down. This is
2511 * required to make new bind mounts available in systemd services
2512 * inside the containter that create a new mount namespace.
2513 * See https://github.com/systemd/systemd/issues/3860
2514 * Further submounts (such as /dev) done after this will inherit the
2515 * shared propagation mode.*/
2516 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2517 if (r < 0)
2518 return r;
2519
2520 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2521 if (r < 0)
2522 return r;
2523
03cfe0d5
LP
2524 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2525 if (r < 0)
2526 return r;
2527
03cfe0d5 2528 if (arg_read_only) {
6b7c9f8b 2529 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2530 if (r < 0)
2531 return log_error_errno(r, "Failed to make tree read-only: %m");
2532 }
2533
0de7acce 2534 r = mount_all(directory,
4f086aab 2535 arg_mount_settings,
0de7acce
LP
2536 arg_uid_shift,
2537 arg_uid_range,
2538 arg_selinux_apifs_context);
03cfe0d5
LP
2539 if (r < 0)
2540 return r;
2541
07fa00f9
LP
2542 r = copy_devnodes(directory);
2543 if (r < 0)
03cfe0d5
LP
2544 return r;
2545
2546 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2547
07fa00f9
LP
2548 r = setup_pts(directory);
2549 if (r < 0)
03cfe0d5
LP
2550 return r;
2551
2552 r = setup_propagate(directory);
2553 if (r < 0)
2554 return r;
2555
2556 r = setup_dev_console(directory, console);
2557 if (r < 0)
2558 return r;
2559
520e0d54 2560 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
2561 if (r < 0)
2562 return r;
2563
2564 r = setup_timezone(directory);
2565 if (r < 0)
2566 return r;
2567
2568 r = setup_resolv_conf(directory);
2569 if (r < 0)
2570 return r;
2571
e01ff70a
MS
2572 r = setup_machine_id(directory);
2573 if (r < 0)
2574 return r;
2575
03cfe0d5
LP
2576 r = setup_journal(directory);
2577 if (r < 0)
2578 return r;
2579
0de7acce
LP
2580 r = mount_custom(
2581 directory,
2582 arg_custom_mounts,
2583 arg_n_custom_mounts,
2584 arg_userns_mode != USER_NAMESPACE_NO,
2585 arg_uid_shift,
2586 arg_uid_range,
2587 arg_selinux_apifs_context);
03cfe0d5
LP
2588 if (r < 0)
2589 return r;
2590
5a8ff0e6 2591 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2592 r = mount_cgroups(
2593 directory,
2594 arg_unified_cgroup_hierarchy,
2595 arg_userns_mode != USER_NAMESPACE_NO,
2596 arg_uid_shift,
2597 arg_uid_range,
5a8ff0e6 2598 arg_selinux_apifs_context,
ada54120 2599 false);
0996ef00
CB
2600 if (r < 0)
2601 return r;
2602 }
03cfe0d5
LP
2603
2604 r = mount_move_root(directory);
2605 if (r < 0)
2606 return log_error_errno(r, "Failed to move root directory: %m");
2607
9c1e04d0
AP
2608 fd = setup_sd_notify_child();
2609 if (fd < 0)
2610 return fd;
2611
03cfe0d5 2612 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2613 arg_clone_ns_flags |
03cfe0d5 2614 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2615 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2616 if (pid < 0)
2617 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2618 if (pid == 0) {
2619 pid_socket = safe_close(pid_socket);
e01ff70a 2620 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2621 notify_socket = safe_close(notify_socket);
825d5287 2622 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2623
2624 /* The inner child has all namespaces that are
2625 * requested, so that we all are owned by the user if
2626 * user namespaces are turned on. */
2627
f757855e 2628 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2629 if (r < 0)
2630 _exit(EXIT_FAILURE);
2631
2632 _exit(EXIT_SUCCESS);
2633 }
2634
2635 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2636 if (l < 0)
2637 return log_error_errno(errno, "Failed to send PID: %m");
2638 if (l != sizeof(pid)) {
2639 log_error("Short write while sending PID.");
2640 return -EIO;
2641 }
2642
e01ff70a
MS
2643 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2644 if (l < 0)
2645 return log_error_errno(errno, "Failed to send machine ID: %m");
2646 if (l != sizeof(arg_uuid)) {
2647 log_error("Short write while sending machine ID.");
2648 return -EIO;
2649 }
2650
9c1e04d0
AP
2651 l = send_one_fd(notify_socket, fd, 0);
2652 if (l < 0)
2653 return log_error_errno(errno, "Failed to send notify fd: %m");
2654
03cfe0d5 2655 pid_socket = safe_close(pid_socket);
e01ff70a 2656 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2657 notify_socket = safe_close(notify_socket);
327e26d6
KN
2658 kmsg_socket = safe_close(kmsg_socket);
2659 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2660
2661 return 0;
2662}
2663
0e7ac751
LP
2664static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2665 unsigned n_tries = 100;
2666 uid_t candidate;
2667 int r;
2668
2669 assert(shift);
2670 assert(ret_lock_file);
0de7acce 2671 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2672 assert(arg_uid_range == 0x10000U);
2673
2674 candidate = *shift;
2675
2676 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2677
2678 for (;;) {
2679 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2680 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2681
2682 if (--n_tries <= 0)
2683 return -EBUSY;
2684
2685 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2686 goto next;
2687 if ((candidate & UINT32_C(0xFFFF)) != 0)
2688 goto next;
2689
2690 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2691 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2692 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2693 goto next;
2694 if (r < 0)
2695 return r;
2696
2697 /* Make some superficial checks whether the range is currently known in the user database */
2698 if (getpwuid(candidate))
2699 goto next;
2700 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2701 goto next;
2702 if (getgrgid(candidate))
2703 goto next;
2704 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2705 goto next;
2706
2707 *ret_lock_file = lf;
2708 lf = (struct LockFile) LOCK_FILE_INIT;
2709 *shift = candidate;
2710 return 0;
2711
2712 next:
2713 random_bytes(&candidate, sizeof(candidate));
2714 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2715 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2716 }
2717}
2718
03cfe0d5
LP
2719static int setup_uid_map(pid_t pid) {
2720 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2721 int r;
2722
2723 assert(pid > 1);
2724
2725 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2726 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2727 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2728 if (r < 0)
2729 return log_error_errno(r, "Failed to write UID map: %m");
2730
2731 /* We always assign the same UID and GID ranges */
2732 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2733 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2734 if (r < 0)
2735 return log_error_errno(r, "Failed to write GID map: %m");
2736
2737 return 0;
2738}
2739
9c1e04d0 2740static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
2741 char buf[NOTIFY_BUFFER_MAX+1];
2742 char *p = NULL;
2743 struct iovec iovec = {
2744 .iov_base = buf,
2745 .iov_len = sizeof(buf)-1,
2746 };
2747 union {
2748 struct cmsghdr cmsghdr;
2749 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2750 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2751 } control = {};
2752 struct msghdr msghdr = {
2753 .msg_iov = &iovec,
2754 .msg_iovlen = 1,
2755 .msg_control = &control,
2756 .msg_controllen = sizeof(control),
2757 };
2758 struct cmsghdr *cmsg;
2759 struct ucred *ucred = NULL;
2760 ssize_t n;
2761 pid_t inner_child_pid;
2762 _cleanup_strv_free_ char **tags = NULL;
2763
2764 assert(userdata);
2765
2766 inner_child_pid = PTR_TO_PID(userdata);
2767
2768 if (revents != EPOLLIN) {
2769 log_warning("Got unexpected poll event for notify fd.");
2770 return 0;
2771 }
2772
2773 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2774 if (n < 0) {
2775 if (errno == EAGAIN || errno == EINTR)
2776 return 0;
2777
2778 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2779 }
2780 cmsg_close_all(&msghdr);
2781
2782 CMSG_FOREACH(cmsg, &msghdr) {
2783 if (cmsg->cmsg_level == SOL_SOCKET &&
2784 cmsg->cmsg_type == SCM_CREDENTIALS &&
2785 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2786
2787 ucred = (struct ucred*) CMSG_DATA(cmsg);
2788 }
2789 }
2790
2791 if (!ucred || ucred->pid != inner_child_pid) {
2792 log_warning("Received notify message without valid credentials. Ignoring.");
2793 return 0;
2794 }
2795
2796 if ((size_t) n >= sizeof(buf)) {
2797 log_warning("Received notify message exceeded maximum size. Ignoring.");
2798 return 0;
2799 }
2800
2801 buf[n] = 0;
2802 tags = strv_split(buf, "\n\r");
2803 if (!tags)
2804 return log_oom();
2805
2806 if (strv_find(tags, "READY=1"))
2807 sd_notifyf(false, "READY=1\n");
2808
2809 p = strv_find_startswith(tags, "STATUS=");
2810 if (p)
2811 sd_notifyf(false, "STATUS=Container running: %s", p);
2812
2813 return 0;
2814}
2815
5773024d 2816static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 2817 int r;
9c1e04d0 2818
5773024d 2819 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
2820 if (r < 0)
2821 return log_error_errno(r, "Failed to allocate notify event source: %m");
2822
5773024d 2823 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
2824
2825 return 0;
2826}
2827
f757855e
LP
2828static int load_settings(void) {
2829 _cleanup_(settings_freep) Settings *settings = NULL;
2830 _cleanup_fclose_ FILE *f = NULL;
2831 _cleanup_free_ char *p = NULL;
2832 const char *fn, *i;
2833 int r;
2834
2835 /* If all settings are masked, there's no point in looking for
2836 * the settings file */
2837 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2838 return 0;
2839
2840 fn = strjoina(arg_machine, ".nspawn");
2841
2842 /* We first look in the admin's directories in /etc and /run */
2843 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2844 _cleanup_free_ char *j = NULL;
2845
605405c6 2846 j = strjoin(i, "/", fn);
f757855e
LP
2847 if (!j)
2848 return log_oom();
2849
2850 f = fopen(j, "re");
2851 if (f) {
2852 p = j;
2853 j = NULL;
2854
b938cb90 2855 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2856 if (arg_settings_trusted < 0)
2857 arg_settings_trusted = true;
2858
2859 break;
2860 }
2861
2862 if (errno != ENOENT)
2863 return log_error_errno(errno, "Failed to open %s: %m", j);
2864 }
2865
2866 if (!f) {
2867 /* After that, let's look for a file next to the
2868 * actual image we shall boot. */
2869
2870 if (arg_image) {
2871 p = file_in_same_dir(arg_image, fn);
2872 if (!p)
2873 return log_oom();
2874 } else if (arg_directory) {
2875 p = file_in_same_dir(arg_directory, fn);
2876 if (!p)
2877 return log_oom();
2878 }
2879
2880 if (p) {
2881 f = fopen(p, "re");
2882 if (!f && errno != ENOENT)
2883 return log_error_errno(errno, "Failed to open %s: %m", p);
2884
b938cb90 2885 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2886 if (arg_settings_trusted < 0)
2887 arg_settings_trusted = false;
2888 }
2889 }
2890
2891 if (!f)
2892 return 0;
2893
2894 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2895
2896 r = settings_load(f, p, &settings);
2897 if (r < 0)
2898 return r;
2899
2900 /* Copy over bits from the settings, unless they have been
2901 * explicitly masked by command line switches. */
2902
7732f92b
LP
2903 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2904 settings->start_mode >= 0) {
2905 arg_start_mode = settings->start_mode;
f757855e
LP
2906
2907 strv_free(arg_parameters);
2908 arg_parameters = settings->parameters;
2909 settings->parameters = NULL;
2910 }
2911
5f932eb9
LP
2912 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2913 settings->working_directory) {
2914 free(arg_chdir);
2915 arg_chdir = settings->working_directory;
2916 settings->working_directory = NULL;
2917 }
2918
f757855e
LP
2919 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2920 settings->environment) {
2921 strv_free(arg_setenv);
2922 arg_setenv = settings->environment;
2923 settings->environment = NULL;
2924 }
2925
2926 if ((arg_settings_mask & SETTING_USER) == 0 &&
2927 settings->user) {
2928 free(arg_user);
2929 arg_user = settings->user;
2930 settings->user = NULL;
2931 }
2932
2933 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2934 uint64_t plus;
f757855e 2935
0e265674
LP
2936 plus = settings->capability;
2937 if (settings_private_network(settings))
2938 plus |= (1ULL << CAP_NET_ADMIN);
2939
2940 if (!arg_settings_trusted && plus != 0) {
2941 if (settings->capability != 0)
2942 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2943 } else
520e0d54 2944 arg_caps_retain |= plus;
f757855e 2945
520e0d54 2946 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
2947 }
2948
2949 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2950 settings->kill_signal > 0)
2951 arg_kill_signal = settings->kill_signal;
2952
2953 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2954 settings->personality != PERSONALITY_INVALID)
2955 arg_personality = settings->personality;
2956
2957 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2958 !sd_id128_is_null(settings->machine_id)) {
2959
2960 if (!arg_settings_trusted)
2961 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2962 else
2963 arg_uuid = settings->machine_id;
2964 }
2965
2966 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2967 settings->read_only >= 0)
2968 arg_read_only = settings->read_only;
2969
2970 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2971 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2972 arg_volatile_mode = settings->volatile_mode;
2973
2974 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2975 settings->n_custom_mounts > 0) {
2976
2977 if (!arg_settings_trusted)
2978 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2979 else {
2980 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2981 arg_custom_mounts = settings->custom_mounts;
2982 arg_n_custom_mounts = settings->n_custom_mounts;
2983
2984 settings->custom_mounts = NULL;
2985 settings->n_custom_mounts = 0;
2986 }
2987 }
2988
2989 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2990 (settings->private_network >= 0 ||
2991 settings->network_veth >= 0 ||
2992 settings->network_bridge ||
22b28dfd 2993 settings->network_zone ||
f757855e
LP
2994 settings->network_interfaces ||
2995 settings->network_macvlan ||
f6d6bad1
LP
2996 settings->network_ipvlan ||
2997 settings->network_veth_extra)) {
f757855e
LP
2998
2999 if (!arg_settings_trusted)
3000 log_warning("Ignoring network settings, file %s is not trusted.", p);
3001 else {
f6d6bad1 3002 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3003 arg_private_network = settings_private_network(settings);
3004
f757855e
LP
3005 strv_free(arg_network_interfaces);
3006 arg_network_interfaces = settings->network_interfaces;
3007 settings->network_interfaces = NULL;
3008
3009 strv_free(arg_network_macvlan);
3010 arg_network_macvlan = settings->network_macvlan;
3011 settings->network_macvlan = NULL;
3012
3013 strv_free(arg_network_ipvlan);
3014 arg_network_ipvlan = settings->network_ipvlan;
3015 settings->network_ipvlan = NULL;
3016
f6d6bad1
LP
3017 strv_free(arg_network_veth_extra);
3018 arg_network_veth_extra = settings->network_veth_extra;
3019 settings->network_veth_extra = NULL;
3020
f757855e
LP
3021 free(arg_network_bridge);
3022 arg_network_bridge = settings->network_bridge;
3023 settings->network_bridge = NULL;
22b28dfd
LP
3024
3025 free(arg_network_zone);
3026 arg_network_zone = settings->network_zone;
3027 settings->network_zone = NULL;
f757855e
LP
3028 }
3029 }
3030
3031 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3032 settings->expose_ports) {
3033
3034 if (!arg_settings_trusted)
3035 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3036 else {
3037 expose_port_free_all(arg_expose_ports);
3038 arg_expose_ports = settings->expose_ports;
3039 settings->expose_ports = NULL;
3040 }
3041 }
3042
0de7acce
LP
3043 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3044 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3045
3046 if (!arg_settings_trusted)
3047 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3048 else {
3049 arg_userns_mode = settings->userns_mode;
3050 arg_uid_shift = settings->uid_shift;
3051 arg_uid_range = settings->uid_range;
3052 arg_userns_chown = settings->userns_chown;
3053 }
3054 }
3055
9c1e04d0
AP
3056 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3057 arg_notify_ready = settings->notify_ready;
3058
f757855e
LP
3059 return 0;
3060}
3061
b0067625
ZJS
/* Runs one boot cycle of the container.
 *
 * Clones the "outer child" (raw_clone() with CLONE_NEWNS), which executes outer_child(), and
 * then acts as the parent: exchanges the UID shift, waits for the outer child, receives the
 * inner child's PID, the machine ID and the notification socket, sets up the UID map,
 * networking, machine registration and cgroups, and finally runs an event loop that forwards
 * the PTY until the container exits.
 *
 * Parameters, as used below:
 *   master          - PTY master fd; closed in the child, handed to pty_forward_new() in the parent
 *   console, dissected_image, interactive, secondary - passed on to outer_child()
 *   fds             - passed on to outer_child(); freed in the parent
 *   veth_name       - passed to setup_veth(), setup_bridge() and remove_veth_links()
 *   veth_created    - set to true once the veth links were set up, false after their removal
 *   exposed         - passed to the expose_port_*() helpers
 *   pid             - receives the outer child's PID, later the inner child's PID; reset to 0
 *                     after wait_for_container()
 *   ret             - receives the exit status to report when 0 is returned
 *
 * Returns a negative value on error, 0 when we are done (*ret is set), and 1 when the
 * container rebooted and the caller should call us again. */
3062static int run(int master,
3063 const char* console,
2d845785 3064 DissectedImage *dissected_image,
b0067625
ZJS
3065 bool interactive,
3066 bool secondary,
3067 FDSet *fds,
3068 char veth_name[IFNAMSIZ], bool *veth_created,
3069 union in_addr_union *exposed,
3070 pid_t *pid, int *ret) {
3071
3072 static const struct sigaction sa = {
3073 .sa_handler = nop_signal_handler,
e28c7cd0 3074 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3075 };
3076
3077 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3078 _cleanup_close_ int etc_passwd_lock = -1;
3079 _cleanup_close_pair_ int
3080 kmsg_socket_pair[2] = { -1, -1 },
3081 rtnl_socket_pair[2] = { -1, -1 },
3082 pid_socket_pair[2] = { -1, -1 },
3083 uuid_socket_pair[2] = { -1, -1 },
3084 notify_socket_pair[2] = { -1, -1 },
3085 uid_shift_socket_pair[2] = { -1, -1 };
3086 _cleanup_close_ int notify_socket= -1;
3087 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3088 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3089 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3090 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3091 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3092 ContainerStatus container_status = 0;
3093 char last_char = 0;
3094 int ifi = 0, r;
3095 ssize_t l;
3096 sigset_t mask_chld;
3097
3098 assert_se(sigemptyset(&mask_chld) == 0);
3099 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3100
3101 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3102 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3103 * check with getpwuid() if the specific user already exists. Note that /etc might be
3104 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3105 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3106 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3107 * really ours. */
3108
3109 etc_passwd_lock = take_etc_passwd_lock(NULL);
3110 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3111 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3112 }
3113
3114 r = barrier_create(&barrier);
3115 if (r < 0)
3116 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3117
/* One socket pair per kind of data exchanged with the child: index 0 is kept by the parent,
 * index 1 is handed to outer_child() (see the safe_close() calls on either side below). */
3118 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3119 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3120
3121 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3122 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3123
3124 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3125 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3126
3127 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3128 return log_error_errno(errno, "Failed to create id socket pair: %m");
3129
3130 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3131 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3132
3133 if (arg_userns_mode != USER_NAMESPACE_NO)
3134 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3135 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3136
3137 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3138 * parent's blocking calls and give it a chance to call wait() and terminate. */
3139 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3140 if (r < 0)
3141 return log_error_errno(errno, "Failed to change the signal mask: %m");
3142
3143 r = sigaction(SIGCHLD, &sa, NULL);
3144 if (r < 0)
3145 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3146
3147 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3148 if (*pid < 0)
3149 return log_error_errno(errno, "clone() failed%s: %m",
3150 errno == EINVAL ?
3151 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3152
3153 if (*pid == 0) {
3154 /* The outer child only has a file system namespace. */
3155 barrier_set_role(&barrier, BARRIER_CHILD);
3156
3157 master = safe_close(master);
3158
3159 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3160 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3161 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3162 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3163 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3164 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3165
3166 (void) reset_all_signal_handlers();
3167 (void) reset_signal_mask();
3168
3169 r = outer_child(&barrier,
3170 arg_directory,
3171 console,
2d845785 3172 dissected_image,
b0067625
ZJS
3173 interactive,
3174 secondary,
3175 pid_socket_pair[1],
3176 uuid_socket_pair[1],
3177 notify_socket_pair[1],
3178 kmsg_socket_pair[1],
3179 rtnl_socket_pair[1],
3180 uid_shift_socket_pair[1],
3181 fds);
3182 if (r < 0)
3183 _exit(EXIT_FAILURE);
3184
3185 _exit(EXIT_SUCCESS);
3186 }
3187
/* Parent only from here on: the child branch above always ends in _exit(). */
3188 barrier_set_role(&barrier, BARRIER_PARENT);
3189
3190 fds = fdset_free(fds);
3191
3192 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3193 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3194 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3195 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3196 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3197 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3198
3199 if (arg_userns_mode != USER_NAMESPACE_NO) {
3200 /* The child just let us know the UID shift it might have read from the image. */
3201 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3202 if (l < 0)
3203 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3204 if (l != sizeof arg_uid_shift) {
3205 log_error("Short read while reading UID shift.");
3206 return -EIO;
3207 }
3208
3209 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3210 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3211 * image, but if that's already in use, pick a new one, and report back to the child,
3212 * which one we now picked. */
3213
3214 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3215 if (r < 0)
3216 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3217
3218 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3219 if (l < 0)
3220 return log_error_errno(errno, "Failed to send UID shift: %m");
3221 if (l != sizeof arg_uid_shift) {
3222 log_error("Short write while writing UID shift.");
3223 return -EIO;
3224 }
3225 }
3226 }
3227
3228 /* Wait for the outer child. */
3229 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3230 if (r != 0)
3231 return r < 0 ? r : -EIO;
3232
3233 /* And now retrieve the PID of the inner child. */
/* Note: this overwrites *pid, which held the outer child's PID until now. */
3234 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3235 if (l < 0)
3236 return log_error_errno(errno, "Failed to read inner child PID: %m");
3237 if (l != sizeof *pid) {
3238 log_error("Short read while reading inner child PID.");
3239 return -EIO;
3240 }
3241
3242 /* We also retrieve container UUID in case it was generated by outer child */
3243 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3244 if (l < 0)
3245 return log_error_errno(errno, "Failed to read container machine ID: %m");
3246 if (l != sizeof(arg_uuid)) {
3247 log_error("Short read while reading container machined ID.");
3248 return -EIO;
3249 }
3250
3251 /* We also retrieve the socket used for notifications generated by outer child */
3252 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3253 if (notify_socket < 0)
3254 return log_error_errno(notify_socket,
3255 "Failed to receive notification socket from the outer child: %m");
3256
3257 log_debug("Init process invoked as PID "PID_FMT, *pid);
3258
3259 if (arg_userns_mode != USER_NAMESPACE_NO) {
3260 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3261 log_error("Child died too early.");
3262 return -ESRCH;
3263 }
3264
3265 r = setup_uid_map(*pid);
3266 if (r < 0)
3267 return r;
3268
3269 (void) barrier_place(&barrier); /* #2 */
3270 }
3271
3272 if (arg_private_network) {
3273
3274 r = move_network_interfaces(*pid, arg_network_interfaces);
3275 if (r < 0)
3276 return r;
3277
3278 if (arg_network_veth) {
3279 r = setup_veth(arg_machine, *pid, veth_name,
3280 arg_network_bridge || arg_network_zone);
3281 if (r < 0)
3282 return r;
3283 else if (r > 0)
3284 ifi = r;
3285
3286 if (arg_network_bridge) {
3287 /* Add the interface to a bridge */
3288 r = setup_bridge(veth_name, arg_network_bridge, false);
3289 if (r < 0)
3290 return r;
3291 if (r > 0)
3292 ifi = r;
3293 } else if (arg_network_zone) {
3294 /* Add the interface to a bridge, possibly creating it */
3295 r = setup_bridge(veth_name, arg_network_zone, true);
3296 if (r < 0)
3297 return r;
3298 if (r > 0)
3299 ifi = r;
3300 }
3301 }
3302
3303 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3304 if (r < 0)
3305 return r;
3306
3307 /* We created the primary and extra veth links now; let's remember this, so that we know to
3308 remove them later on. Note that we don't bother with removing veth links that were created
3309 here when their setup failed half-way, because in that case the kernel should be able to
3310 remove them on its own, since they cannot be referenced by anything yet. */
3311 *veth_created = true;
3312
3313 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3314 if (r < 0)
3315 return r;
3316
3317 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3318 if (r < 0)
3319 return r;
3320 }
3321
3322 if (arg_register) {
3323 r = register_machine(
3324 arg_machine,
3325 *pid,
3326 arg_directory,
3327 arg_uuid,
3328 ifi,
3329 arg_slice,
3330 arg_custom_mounts, arg_n_custom_mounts,
3331 arg_kill_signal,
3332 arg_property,
3333 arg_keep_unit,
3334 arg_container_service_name);
3335 if (r < 0)
3336 return r;
3337 }
3338
f0bef277 3339 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3340 if (r < 0)
3341 return r;
3342
3343 if (arg_keep_unit) {
3344 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3345 if (r < 0)
3346 return r;
3347 }
3348
3349 r = chown_cgroup(*pid, arg_uid_shift);
3350 if (r < 0)
3351 return r;
3352
3353 /* Notify the child that the parent is ready with all
3354 * its setup (including cgroup-ification), and that
3355 * the child can now hand over control to the code to
3356 * run inside the container. */
3357 (void) barrier_place(&barrier); /* #3 */
3358
3359 /* Block SIGCHLD here, before notifying child.
3360 * process_pty() will handle it with the other signals. */
3361 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3362
3363 /* Reset signal to default */
3364 r = default_signals(SIGCHLD, -1);
3365 if (r < 0)
3366 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3367
3368 r = sd_event_new(&event);
3369 if (r < 0)
3370 return log_error_errno(r, "Failed to get default event source: %m");
3371
/* The inner child's PID travels as the callback's userdata, encoded as a pointer;
 * nspawn_dispatch_notify_fd() decodes it again with PTR_TO_PID(). */
5773024d 3372 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3373 if (r < 0)
3374 return r;
3375
3376 /* Let the child know that we are ready and wait that the child is completely ready now. */
3377 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3378 log_error("Child died too early.");
3379 return -ESRCH;
3380 }
3381
3382 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3383 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3384 etc_passwd_lock = safe_close(etc_passwd_lock);
3385
3386 sd_notifyf(false,
3387 "STATUS=Container running.\n"
3388 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
/* Without arg_notify_ready we report readiness ourselves right away; otherwise READY=1 is
 * only relayed by nspawn_dispatch_notify_fd() once it arrives on the notification socket. */
3389 if (!arg_notify_ready)
3390 sd_notify(false, "READY=1\n");
3391
3392 if (arg_kill_signal > 0) {
3393 /* Try to kill the init system on SIGINT or SIGTERM */
3394 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3395 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3396 } else {
3397 /* Immediately exit */
3398 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3399 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3400 }
3401
6916b164
AU
3402 /* Exit when the child exits */
3403 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3404
3405 if (arg_expose_ports) {
3406 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3407 if (r < 0)
3408 return r;
3409
3410 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3411 }
3412
3413 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3414
3415 r = pty_forward_new(event, master,
3416 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3417 &forward);
3418 if (r < 0)
3419 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3420
3421 r = sd_event_loop(event);
3422 if (r < 0)
3423 return log_error_errno(r, "Failed to run event loop: %m");
3424
3425 pty_forward_get_last_char(forward, &last_char);
3426
3427 forward = pty_forward_free(forward);
3428
/* Terminate the forwarded output with a newline if it did not already end in one. */
3429 if (!arg_quiet && last_char != '\n')
3430 putc('\n', stdout);
3431
3432 /* Kill if it is not dead yet anyway */
3433 if (arg_register && !arg_keep_unit)
3434 terminate_machine(*pid);
3435
3436 /* Normally redundant, but better safe than sorry */
c67b0082 3437 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3438
3439 r = wait_for_container(*pid, &container_status);
3440 *pid = 0;
3441
3442 if (r < 0)
3443 /* We failed to wait for the container, or the container exited abnormally. */
3444 return r;
3445 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3446 /* r > 0 → The container exited with a non-zero status.
3447 * As a special case, we need to replace 133 with a different value,
3448 * because 133 is special-cased in the service file to reboot the container.
3449 * otherwise → The container exited with zero status and a reboot was not requested.
3450 */
2a49b612 3451 if (r == EXIT_FORCE_RESTART)
27e29a1e 3452 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3453 *ret = r;
b0067625
ZJS
3454 return 0; /* finito */
3455 }
3456
3457 /* CONTAINER_REBOOTED, loop again */
3458
3459 if (arg_keep_unit) {
3460 /* Special handling if we are running as a service: instead of simply
3461 * restarting the machine we want to restart the entire service, so let's
3462 * inform systemd about this with the special exit code 133. The service
3463 * file uses RestartForceExitStatus=133 so that this results in a full
3464 * nspawn restart. This is necessary since we might have cgroup parameters
3465 * set we want to have flushed out. */
2a49b612
ZJS
3466 *ret = EXIT_FORCE_RESTART;
3467 return 0; /* finito */
b0067625
ZJS
3468 }
3469
3470 expose_port_flush(arg_expose_ports, exposed);
3471
3472 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3473 *veth_created = false;
3474 return 1; /* loop again */
3475}
3476
4623e8e6 3477static int load_root_hash(const char *image) {
e0489532
ZJS
3478 _cleanup_free_ char *text = NULL, *fn = NULL;
3479 char *n, *e;
4623e8e6
LP
3480 void *k;
3481 size_t l;
3482 int r;
3483
3484 assert_se(image);
3485
3486 /* Try to load the root hash from a file next to the image file if it exists. */
3487
3488 if (arg_root_hash)
3489 return 0;
3490
3491 fn = new(char, strlen(image) + strlen(".roothash") + 1);
3492 if (!fn)
3493 return log_oom();
3494
3495 n = stpcpy(fn, image);
3496 e = endswith(fn, ".raw");
3497 if (e)
3498 n = e;
3499
3500 strcpy(n, ".roothash");
3501
3502 r = read_one_line_file(fn, &text);
3503 if (r == -ENOENT)
3504 return 0;
3505 if (r < 0) {
3506 log_warning_errno(r, "Failed to read %s, ignoring: %m", fn);
3507 return 0;
3508 }
3509
3510 r = unhexmem(text, strlen(text), &k, &l);
3511 if (r < 0)
3512 return log_error_errno(r, "Invalid root hash: %s", text);
3513 if (l < sizeof(sd_id128_t)) {
3514 free(k);
3515 return log_error_errno(r, "Root hash too short: %s", text);
3516 }
3517
3518 arg_root_hash = k;
3519 arg_root_hash_size = l;
3520
3521 return 0;
3522}
3523
03cfe0d5
LP
3524int main(int argc, char *argv[]) {
3525
2d845785
LP
3526 _cleanup_free_ char *console = NULL;
3527 _cleanup_close_ int master = -1;
03cfe0d5 3528 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 3529 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 3530 char veth_name[IFNAMSIZ] = "";
17cbb288 3531 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 3532 pid_t pid = 0;
03cfe0d5
LP
3533 union in_addr_union exposed = {};
3534 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
3535 bool interactive, veth_created = false, remove_tmprootdir = false;
3536 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 3537 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
3538 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3539 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
3540
3541 log_parse_environment();
3542 log_open();
3543
7732f92b
LP
3544 /* Make sure rename_process() in the stub init process can work */
3545 saved_argv = argv;
3546 saved_argc = argc;
3547
03cfe0d5
LP
3548 r = parse_argv(argc, argv);
3549 if (r <= 0)
3550 goto finish;
3551
03cfe0d5
LP
3552 if (geteuid() != 0) {
3553 log_error("Need to be root.");
3554 r = -EPERM;
3555 goto finish;
3556 }
f757855e
LP
3557 r = determine_names();
3558 if (r < 0)
3559 goto finish;
3560
3561 r = load_settings();
3562 if (r < 0)
3563 goto finish;
3564
3565 r = verify_arguments();
3566 if (r < 0)
3567 goto finish;
03cfe0d5
LP
3568
3569 n_fd_passed = sd_listen_fds(false);
3570 if (n_fd_passed > 0) {
3571 r = fdset_new_listen_fds(&fds, false);
3572 if (r < 0) {
3573 log_error_errno(r, "Failed to collect file descriptors: %m");
3574 goto finish;
3575 }
3576 }
3577
3578 if (arg_directory) {
3579 assert(!arg_image);
3580
3581 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3582 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3583 r = -EINVAL;
3584 goto finish;
3585 }
3586
3587 if (arg_ephemeral) {
3588 _cleanup_free_ char *np = NULL;
3589
8d4aa2bb 3590 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
3591 if (r < 0)
3592 goto finish;
3593
03cfe0d5
LP
3594 /* If the specified path is a mount point we
3595 * generate the new snapshot immediately
3596 * inside it under a random name. However if
3597 * the specified is not a mount point we
3598 * create the new snapshot in the parent
3599 * directory, just next to it. */
e1873695 3600 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
3601 if (r < 0) {
3602 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3603 goto finish;
3604 }
3605 if (r > 0)
770b5ce4 3606 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3607 else
770b5ce4 3608 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 3609 if (r < 0) {
0f3be6ca 3610 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
3611 goto finish;
3612 }
3613
3614 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3615 if (r < 0) {
3616 log_error_errno(r, "Failed to lock %s: %m", np);
3617 goto finish;
3618 }
3619
17cbb288
LP
3620 r = btrfs_subvol_snapshot(arg_directory, np,
3621 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3622 BTRFS_SNAPSHOT_FALLBACK_COPY |
3623 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3624 BTRFS_SNAPSHOT_RECURSIVE |
3625 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3626 if (r < 0) {
3627 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3628 goto finish;
ec16945e
LP
3629 }
3630
3631 free(arg_directory);
3632 arg_directory = np;
8a16a7b4 3633 np = NULL;
ec16945e 3634
17cbb288 3635 remove_directory = true;
30535c16
LP
3636
3637 } else {
cb638b5e 3638 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
3639 if (r < 0)
3640 goto finish;
3641
30535c16
LP
3642 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3643 if (r == -EBUSY) {
3644 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3645 goto finish;
3646 }
3647 if (r < 0) {
3648 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3649 goto finish;
30535c16
LP
3650 }
3651
3652 if (arg_template) {
8d4aa2bb 3653 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
3654 if (r < 0)
3655 goto finish;
3656
17cbb288
LP
3657 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3658 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3659 BTRFS_SNAPSHOT_FALLBACK_COPY |
3660 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3661 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3662 BTRFS_SNAPSHOT_RECURSIVE |
3663 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3664 if (r == -EEXIST) {
3665 if (!arg_quiet)
3666 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3667 } else if (r < 0) {
83521414 3668 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3669 goto finish;
3670 } else {
3671 if (!arg_quiet)
3672 log_info("Populated %s from template %s.", arg_directory, arg_template);
3673 }
3674 }
ec16945e
LP
3675 }
3676
7732f92b 3677 if (arg_start_mode == START_BOOT) {
1b9e5b12 3678 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3679 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3680 r = -EINVAL;
1b9e5b12
LP
3681 goto finish;
3682 }
3683 } else {
3684 const char *p;
3685
16fb773e
LP
3686 p = strjoina(arg_directory, "/usr/");
3687 if (laccess(p, F_OK) < 0) {
3688 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3689 r = -EINVAL;
1b9e5b12 3690 goto finish;
1b9e5b12
LP
3691 }
3692 }
ec16945e 3693
6b9132a9 3694 } else {
ec16945e
LP
3695 assert(arg_image);
3696 assert(!arg_template);
3697
8d4aa2bb 3698 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
3699 if (r < 0)
3700 goto finish;
3701
0f3be6ca
LP
3702 if (arg_ephemeral) {
3703 _cleanup_free_ char *np = NULL;
3704
3705 r = tempfn_random(arg_image, "machine.", &np);
3706 if (r < 0) {
3707 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3708 goto finish;
3709 }
3710
3711 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3712 if (r < 0) {
3713 r = log_error_errno(r, "Failed to create image lock: %m");
3714 goto finish;
3715 }
3716
3717 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL);
3718 if (r < 0) {
3719 r = log_error_errno(r, "Failed to copy image file: %m");
3720 goto finish;
3721 }
3722
3723 free(arg_image);
3724 arg_image = np;
3725 np = NULL;
3726
3727 remove_image = true;
3728 } else {
3729 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3730 if (r == -EBUSY) {
3731 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3732 goto finish;
3733 }
3734 if (r < 0) {
3735 r = log_error_errno(r, "Failed to create image lock: %m");
3736 goto finish;
3737 }
4623e8e6
LP
3738
3739 r = load_root_hash(arg_image);
3740 if (r < 0)
3741 goto finish;
30535c16
LP
3742 }
3743
c67b0082 3744 if (!mkdtemp(tmprootdir)) {
0f3be6ca 3745 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 3746 goto finish;
1b9e5b12 3747 }
6b9132a9 3748
c67b0082
LP
3749 remove_tmprootdir = true;
3750
3751 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
3752 if (!arg_directory) {
3753 r = log_oom();
3754 goto finish;
6b9132a9 3755 }
88213476 3756
2d845785
LP
3757 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3758 if (r < 0) {
3759 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
3760 goto finish;
3761 }
1b9e5b12 3762
e0f9e7bd
LP
3763 r = dissect_image(
3764 loop->fd,
3765 arg_root_hash, arg_root_hash_size,
3766 DISSECT_IMAGE_REQUIRE_ROOT,
3767 &dissected_image);
2d845785
LP
3768 if (r == -ENOPKG) {
3769 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3770
3771 log_notice("Note that the disk image needs to\n"
3772 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3773 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3774 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3775 " d) or contain a file system without a partition table\n"
3776 "in order to be bootable with systemd-nspawn.");
1b9e5b12 3777 goto finish;
2d845785 3778 }
4623e8e6
LP
3779 if (r == -EADDRNOTAVAIL) {
3780 log_error_errno(r, "No root partition for specified root hash found.");
3781 goto finish;
3782 }
2d845785
LP
3783 if (r == -EOPNOTSUPP) {
3784 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3785 goto finish;
3786 }
3787 if (r < 0) {
3788 log_error_errno(r, "Failed to dissect image: %m");
842f3b0f
LP
3789 goto finish;
3790 }
1b9e5b12 3791
4623e8e6
LP
3792 if (!arg_root_hash && dissected_image->can_verity)
3793 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3794
3795 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
3796 if (r < 0)
3797 goto finish;
0f3be6ca
LP
3798
3799 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3800 if (remove_image && unlink(arg_image) >= 0)
3801 remove_image = false;
842f3b0f 3802 }
842f3b0f 3803
86c0dd4a 3804 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
3805 if (r < 0)
3806 goto finish;
3807
03cfe0d5
LP
3808 interactive =
3809 isatty(STDIN_FILENO) > 0 &&
3810 isatty(STDOUT_FILENO) > 0;
9c857b9d 3811
db7feb7e
LP
3812 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3813 if (master < 0) {
ec16945e 3814 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3815 goto finish;
3816 }
3817
611b312b
LP
3818 r = ptsname_malloc(master, &console);
3819 if (r < 0) {
3820 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3821 goto finish;
68b02049
DW
3822 }
3823
3824 if (arg_selinux_apifs_context) {
3825 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3826 if (r < 0)
3827 goto finish;
a258bf26
LP
3828 }
3829
a258bf26 3830 if (unlockpt(master) < 0) {
ec16945e 3831 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3832 goto finish;
3833 }
3834
9c857b9d
LP
3835 if (!arg_quiet)
3836 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3837 arg_machine, arg_image ?: arg_directory);
3838
72c0a2c2 3839 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3840
03cfe0d5
LP
3841 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3842 r = log_error_errno(errno, "Failed to become subreaper: %m");
3843 goto finish;
3844 }
3845
d87be9b0 3846 for (;;) {
b0067625
ZJS
3847 r = run(master,
3848 console,
2d845785 3849 dissected_image,
b0067625
ZJS
3850 interactive, secondary,
3851 fds,
3852 veth_name, &veth_created,
3853 &exposed,
3854 &pid, &ret);
3855 if (r <= 0)
d87be9b0 3856 break;
d87be9b0 3857 }
88213476
LP
3858
3859finish:
af4ec430 3860 sd_notify(false,
2a49b612
ZJS
3861 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3862 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 3863
9444b1f2 3864 if (pid > 0)
c67b0082 3865 (void) kill(pid, SIGKILL);
88213476 3866
503546da 3867 /* Try to flush whatever is still queued in the pty */
6a0f896b 3868 if (master >= 0) {
59f448cf 3869 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
6a0f896b
LP
3870 master = safe_close(master);
3871 }
3872
3873 if (pid > 0)
3874 (void) wait_for_terminate(pid, NULL);
503546da 3875
17cbb288 3876 if (remove_directory && arg_directory) {
ec16945e
LP
3877 int k;
3878
17cbb288 3879 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 3880 if (k < 0)
17cbb288 3881 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
3882 }
3883
0f3be6ca
LP
3884 if (remove_image && arg_image) {
3885 if (unlink(arg_image) < 0)
3886 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3887 }
3888
c67b0082
LP
3889 if (remove_tmprootdir) {
3890 if (rmdir(tmprootdir) < 0)
3891 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3892 }
3893
785890ac
LP
3894 if (arg_machine) {
3895 const char *p;
3896
63c372cb 3897 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3898 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3899 }
3900
7a8f6325 3901 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
3902
3903 if (veth_created)
3904 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 3905 (void) remove_bridge(arg_network_zone);
f757855e 3906
04d391da 3907 free(arg_directory);
ec16945e
LP
3908 free(arg_template);
3909 free(arg_image);
7027ff61 3910 free(arg_machine);
c74e630d 3911 free(arg_user);
5f932eb9 3912 free(arg_chdir);
c74e630d 3913 strv_free(arg_setenv);
f757855e 3914 free(arg_network_bridge);
c74e630d
LP
3915 strv_free(arg_network_interfaces);
3916 strv_free(arg_network_macvlan);
4bbfe7ad 3917 strv_free(arg_network_ipvlan);
f6d6bad1 3918 strv_free(arg_network_veth_extra);
f757855e
LP
3919 strv_free(arg_parameters);
3920 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3921 expose_port_free_all(arg_expose_ports);
4623e8e6 3922 free(arg_root_hash);
6d0b55c2 3923
ec16945e 3924 return r < 0 ? EXIT_FAILURE : ret;
88213476 3925}