]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: unref the notify event source (#4941)
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
6916b164 41#include <sys/wait.h>
8fe0087e 42#include <unistd.h>
1b9e5b12 43
1f0cd86b 44#include "sd-daemon.h"
1f0cd86b 45#include "sd-id128.h"
8fe0087e 46
b5efdb8a 47#include "alloc-util.h"
8fe0087e
LP
48#include "barrier.h"
49#include "base-filesystem.h"
50#include "blkid-util.h"
51#include "btrfs-util.h"
8fe0087e 52#include "cap-list.h"
430f0182 53#include "capability-util.h"
04d391da 54#include "cgroup-util.h"
8fe0087e 55#include "copy.h"
4fc9982c 56#include "dev-setup.h"
2d845785 57#include "dissect-image.h"
8fe0087e 58#include "env-util.h"
3ffd4af2 59#include "fd-util.h"
842f3b0f 60#include "fdset.h"
a5c32cff 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
f4f15635 63#include "fs-util.h"
1b9e5b12 64#include "gpt.h"
4623e8e6 65#include "hexdecoct.h"
8fe0087e 66#include "hostname-util.h"
910fd145 67#include "id128-util.h"
8fe0087e 68#include "log.h"
2d845785 69#include "loop-util.h"
8fe0087e 70#include "loopback-setup.h"
1b9cebf6 71#include "machine-image.h"
8fe0087e
LP
72#include "macro.h"
73#include "missing.h"
74#include "mkdir.h"
4349cd7c 75#include "mount-util.h"
8fe0087e 76#include "netlink-util.h"
07630cea
LP
77#include "nspawn-cgroup.h"
78#include "nspawn-expose-ports.h"
79#include "nspawn-mount.h"
80#include "nspawn-network.h"
7336138e 81#include "nspawn-patch-uid.h"
07630cea 82#include "nspawn-register.h"
910fd145 83#include "nspawn-seccomp.h"
07630cea
LP
84#include "nspawn-settings.h"
85#include "nspawn-setuid.h"
7732f92b 86#include "nspawn-stub-pid1.h"
6bedfcbb 87#include "parse-util.h"
8fe0087e 88#include "path-util.h"
0b452006 89#include "process-util.h"
8fe0087e
LP
90#include "ptyfwd.h"
91#include "random-util.h"
8869a0b4 92#include "raw-clone.h"
8fe0087e 93#include "rm-rf.h"
68b02049 94#include "selinux-util.h"
8fe0087e 95#include "signal-util.h"
2583fbea 96#include "socket-util.h"
8fcde012 97#include "stat-util.h"
15a5e950 98#include "stdio-util.h"
07630cea 99#include "string-util.h"
8fe0087e
LP
100#include "strv.h"
101#include "terminal-util.h"
102#include "udev-util.h"
affb60b1 103#include "umask-util.h"
b1d4f8e1 104#include "user-util.h"
8fe0087e 105#include "util.h"
e9642be2 106
0e7ac751 107/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
108 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
109 * may have their own allocation ranges too. */
0e7ac751
LP
110#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
111#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 112
9c1e04d0
AP
/* nspawn listens on a socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the
 * container. The init process in the container can send messages to nspawn over this socket, following the
 * sd_notify(3) protocol. */
116#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 117
2a49b612
ZJS
118#define EXIT_FORCE_RESTART 133
119
113cea80
DH
/* Outcome states of a container run. NOTE(review): the consumers of these values are outside this view;
 * judging by the names they distinguish a final exit from a reboot request — confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
124
57fb9fb5
LP
/* Journal linking modes selectable with --link-journal= ("no", "auto", "host", "guest"). The "try-" variants
 * of the option map onto LINK_HOST/LINK_GUEST combined with the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host or try-host */
        LINK_GUEST      /* --link-journal=guest or try-guest, also -j */
} LinkJournal;
88213476
LP
131
132static char *arg_directory = NULL;
ec16945e 133static char *arg_template = NULL;
5f932eb9 134static char *arg_chdir = NULL;
687d0825 135static char *arg_user = NULL;
9444b1f2 136static sd_id128_t arg_uuid = {};
7027ff61 137static char *arg_machine = NULL;
c74e630d
LP
138static const char *arg_selinux_context = NULL;
139static const char *arg_selinux_apifs_context = NULL;
9444b1f2 140static const char *arg_slice = NULL;
ff01d048 141static bool arg_private_network = false;
bc2f673e 142static bool arg_read_only = false;
7732f92b 143static StartMode arg_start_mode = START_PID1;
ec16945e 144static bool arg_ephemeral = false;
57fb9fb5 145static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 146static bool arg_link_journal_try = false;
520e0d54 147static uint64_t arg_caps_retain =
50b52222
LP
148 (1ULL << CAP_AUDIT_CONTROL) |
149 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
150 (1ULL << CAP_CHOWN) |
151 (1ULL << CAP_DAC_OVERRIDE) |
152 (1ULL << CAP_DAC_READ_SEARCH) |
153 (1ULL << CAP_FOWNER) |
154 (1ULL << CAP_FSETID) |
155 (1ULL << CAP_IPC_OWNER) |
156 (1ULL << CAP_KILL) |
157 (1ULL << CAP_LEASE) |
158 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 159 (1ULL << CAP_MKNOD) |
5076f0cc
LP
160 (1ULL << CAP_NET_BIND_SERVICE) |
161 (1ULL << CAP_NET_BROADCAST) |
162 (1ULL << CAP_NET_RAW) |
5076f0cc 163 (1ULL << CAP_SETFCAP) |
50b52222 164 (1ULL << CAP_SETGID) |
5076f0cc
LP
165 (1ULL << CAP_SETPCAP) |
166 (1ULL << CAP_SETUID) |
167 (1ULL << CAP_SYS_ADMIN) |
50b52222 168 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
169 (1ULL << CAP_SYS_CHROOT) |
170 (1ULL << CAP_SYS_NICE) |
171 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 172 (1ULL << CAP_SYS_RESOURCE) |
50b52222 173 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
174static CustomMount *arg_custom_mounts = NULL;
175static unsigned arg_n_custom_mounts = 0;
f4889f65 176static char **arg_setenv = NULL;
284c0b91 177static bool arg_quiet = false;
eb91eb18 178static bool arg_register = true;
89f7c846 179static bool arg_keep_unit = false;
aa28aefe 180static char **arg_network_interfaces = NULL;
c74e630d 181static char **arg_network_macvlan = NULL;
4bbfe7ad 182static char **arg_network_ipvlan = NULL;
69c79d3c 183static bool arg_network_veth = false;
f6d6bad1 184static char **arg_network_veth_extra = NULL;
f757855e 185static char *arg_network_bridge = NULL;
22b28dfd 186static char *arg_network_zone = NULL;
050f7277 187static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 188static char *arg_image = NULL;
f757855e 189static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 190static ExposePort *arg_expose_ports = NULL;
f36933fe 191static char **arg_property = NULL;
0de7acce 192static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 193static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 194static bool arg_userns_chown = false;
c6c8f6e2 195static int arg_kill_signal = 0;
5da38d07 196static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
197static SettingsMask arg_settings_mask = 0;
198static int arg_settings_trusted = -1;
199static char **arg_parameters = NULL;
6aadfa4c 200static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 201static bool arg_notify_ready = false;
5a8ff0e6 202static bool arg_use_cgns = true;
0c582db0 203static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 204static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
205static void *arg_root_hash = NULL;
206static size_t arg_root_hash_size = 0;
88213476 207
601185b4 208static void help(void) {
88213476
LP
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
211 " -h --help Show this help\n"
212 " --version Print version string\n"
69c79d3c 213 " -q --quiet Do not show status information\n"
1b9e5b12 214 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 220 " --root-hash=HASH Specify verity root hash\n"
7732f92b 221 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 222 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 223 " --chdir=PATH Set working directory in the container\n"
a8828ed9 224 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 225 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 226 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 227 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 228 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 229 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 230 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 231 " Similar, but with user configured UID/GID range\n"
24597ee0 232 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
233 " --private-network Disable network in container\n"
234 " --network-interface=INTERFACE\n"
235 " Assign an existing network interface to the\n"
236 " container\n"
c74e630d
LP
237 " --network-macvlan=INTERFACE\n"
238 " Create a macvlan network interface based on an\n"
239 " existing network interface to the container\n"
4bbfe7ad
TG
240 " --network-ipvlan=INTERFACE\n"
241 " Create a ipvlan network interface based on an\n"
242 " existing network interface to the container\n"
a8eaaee7 243 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 244 " and container\n"
f6d6bad1
LP
245 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
246 " Add an additional virtual Ethernet link between\n"
247 " host and container\n"
ab046dde 248 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
249 " Add a virtual Ethernet connection to the container\n"
250 " and attach it to an existing bridge on the host\n"
251 " --network-zone=NAME Similar, but attach the new interface to an\n"
252 " an automatically managed bridge interface\n"
6d0b55c2 253 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 254 " Expose a container IP port on the host\n"
82adf6af
LP
255 " -Z --selinux-context=SECLABEL\n"
256 " Set the SELinux security context to be used by\n"
257 " processes in the container\n"
258 " -L --selinux-apifs-context=SECLABEL\n"
259 " Set the SELinux security context to be used by\n"
260 " API/tmpfs file systems in the container\n"
a8828ed9
DW
261 " --capability=CAP In addition to the default, retain specified\n"
262 " capability\n"
263 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 264 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
265 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
266 " host, try-guest, try-host\n"
574edc90 267 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 268 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
269 " --bind=PATH[:PATH[:OPTIONS]]\n"
270 " Bind mount a file or directory from the host into\n"
a8828ed9 271 " the container\n"
5e5bfa6e
EY
272 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
273 " Similar, but creates a read-only bind mount\n"
06c17c39 274 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
275 " --overlay=PATH[:PATH...]:PATH\n"
276 " Create an overlay mount from the host to \n"
277 " the container\n"
278 " --overlay-ro=PATH[:PATH...]:PATH\n"
279 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 280 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 281 " --register=BOOLEAN Register container as machine\n"
89f7c846 282 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 283 " the service unit nspawn is running in\n"
6d0b55c2 284 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 285 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 286 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 287 , program_invocation_short_name);
88213476
LP
288}
289
86c0dd4a 290static int custom_mount_check_all(void) {
5a8af538 291 unsigned i;
5a8af538 292
5a8af538
LP
293 for (i = 0; i < arg_n_custom_mounts; i++) {
294 CustomMount *m = &arg_custom_mounts[i];
295
0de7acce 296 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
297
298 if (arg_userns_chown) {
299 log_error("--private-users-chown may not be combined with custom root mounts.");
300 return -EINVAL;
301 } else if (arg_uid_shift == UID_INVALID) {
302 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
303 return -EINVAL;
304 }
825d5287 305 }
5a8af538
LP
306 }
307
308 return 0;
309}
310
0fd9563f 311static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 312 const char *e;
5da38d07
TH
313 int r, all_unified, systemd_unified;
314
efdb0237
LP
315 /* Allow the user to control whether the unified hierarchy is used */
316 e = getenv("UNIFIED_CGROUP_HIERARCHY");
317 if (e) {
318 r = parse_boolean(e);
319 if (r < 0)
320 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
321 if (r > 0)
322 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
323 else
324 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 325
efdb0237
LP
326 return 0;
327 }
328
98afd6af
ZJS
329 all_unified = cg_all_unified();
330 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
331
332 if (all_unified < 0 || systemd_unified < 0)
333 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
334 "Failed to determine whether the unified cgroups hierarchy is used: %m");
335
efdb0237 336 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
337 if (all_unified > 0) {
338 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
339 * routine only detects 231, so we'll have a false negative here for 230. */
340 r = systemd_installation_has_version(directory, 230);
341 if (r < 0)
342 return log_error_errno(r, "Failed to determine systemd version in container: %m");
343 if (r > 0)
344 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
345 else
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
347 } else if (systemd_unified > 0) {
348 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
349 r = systemd_installation_has_version(directory, 232);
350 if (r < 0)
351 return log_error_errno(r, "Failed to determine systemd version in container: %m");
352 if (r > 0)
353 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
354 else
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
356 } else
5da38d07 357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 358
efdb0237
LP
359 return 0;
360}
361
0c582db0
LB
362static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
363 int r;
364
365 r = getenv_bool(name);
366 if (r == -ENXIO)
367 return;
368 if (r < 0)
369 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
370 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
371}
372
4f086aab
SU
373static void parse_mount_settings_env(void) {
374 int r;
375 const char *e;
376
377 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
378 if (!e)
379 return;
380
381 if (streq(e, "network")) {
382 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
383 return;
384 }
385
386 r = parse_boolean(e);
387 if (r < 0) {
388 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
389 return;
390 } else if (r > 0)
391 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
392 else
393 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
394
395 arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
396}
397
88213476
LP
398static int parse_argv(int argc, char *argv[]) {
399
a41fe3a2 400 enum {
acbeb427
ZJS
401 ARG_VERSION = 0x100,
402 ARG_PRIVATE_NETWORK,
bc2f673e 403 ARG_UUID,
5076f0cc 404 ARG_READ_ONLY,
57fb9fb5 405 ARG_CAPABILITY,
420c7379 406 ARG_DROP_CAPABILITY,
17fe0523
LP
407 ARG_LINK_JOURNAL,
408 ARG_BIND,
f4889f65 409 ARG_BIND_RO,
06c17c39 410 ARG_TMPFS,
5a8af538
LP
411 ARG_OVERLAY,
412 ARG_OVERLAY_RO,
eb91eb18 413 ARG_SHARE_SYSTEM,
89f7c846 414 ARG_REGISTER,
aa28aefe 415 ARG_KEEP_UNIT,
69c79d3c 416 ARG_NETWORK_INTERFACE,
c74e630d 417 ARG_NETWORK_MACVLAN,
4bbfe7ad 418 ARG_NETWORK_IPVLAN,
ab046dde 419 ARG_NETWORK_BRIDGE,
22b28dfd 420 ARG_NETWORK_ZONE,
f6d6bad1 421 ARG_NETWORK_VETH_EXTRA,
6afc95b7 422 ARG_PERSONALITY,
4d9f07b4 423 ARG_VOLATILE,
ec16945e 424 ARG_TEMPLATE,
f36933fe 425 ARG_PROPERTY,
6dac160c 426 ARG_PRIVATE_USERS,
c6c8f6e2 427 ARG_KILL_SIGNAL,
f757855e 428 ARG_SETTINGS,
5f932eb9 429 ARG_CHDIR,
7336138e 430 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 431 ARG_NOTIFY_READY,
4623e8e6 432 ARG_ROOT_HASH,
a41fe3a2
LP
433 };
434
88213476 435 static const struct option options[] = {
27eb8e90
ZJS
436 { "help", no_argument, NULL, 'h' },
437 { "version", no_argument, NULL, ARG_VERSION },
438 { "directory", required_argument, NULL, 'D' },
439 { "template", required_argument, NULL, ARG_TEMPLATE },
440 { "ephemeral", no_argument, NULL, 'x' },
441 { "user", required_argument, NULL, 'u' },
442 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
443 { "as-pid2", no_argument, NULL, 'a' },
444 { "boot", no_argument, NULL, 'b' },
445 { "uuid", required_argument, NULL, ARG_UUID },
446 { "read-only", no_argument, NULL, ARG_READ_ONLY },
447 { "capability", required_argument, NULL, ARG_CAPABILITY },
448 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
449 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
450 { "bind", required_argument, NULL, ARG_BIND },
451 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
452 { "tmpfs", required_argument, NULL, ARG_TMPFS },
453 { "overlay", required_argument, NULL, ARG_OVERLAY },
454 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
455 { "machine", required_argument, NULL, 'M' },
456 { "slice", required_argument, NULL, 'S' },
457 { "setenv", required_argument, NULL, 'E' },
458 { "selinux-context", required_argument, NULL, 'Z' },
459 { "selinux-apifs-context", required_argument, NULL, 'L' },
460 { "quiet", no_argument, NULL, 'q' },
461 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
462 { "register", required_argument, NULL, ARG_REGISTER },
463 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
464 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
465 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
466 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
467 { "network-veth", no_argument, NULL, 'n' },
468 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
469 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
470 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
471 { "personality", required_argument, NULL, ARG_PERSONALITY },
472 { "image", required_argument, NULL, 'i' },
473 { "volatile", optional_argument, NULL, ARG_VOLATILE },
474 { "port", required_argument, NULL, 'p' },
475 { "property", required_argument, NULL, ARG_PROPERTY },
476 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
477 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
478 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
479 { "settings", required_argument, NULL, ARG_SETTINGS },
480 { "chdir", required_argument, NULL, ARG_CHDIR },
481 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
4623e8e6 482 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
eb9da376 483 {}
88213476
LP
484 };
485
9444b1f2 486 int c, r;
6aadfa4c 487 const char *p, *e;
a42c8b54 488 uint64_t plus = 0, minus = 0;
f757855e 489 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
490
491 assert(argc >= 0);
492 assert(argv);
493
2e1f244e 494 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
495
496 switch (c) {
497
498 case 'h':
601185b4
ZJS
499 help();
500 return 0;
88213476 501
acbeb427 502 case ARG_VERSION:
3f6fd1ba 503 return version();
acbeb427 504
88213476 505 case 'D':
0f03c2a4 506 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 507 if (r < 0)
0f03c2a4 508 return r;
ec16945e
LP
509 break;
510
511 case ARG_TEMPLATE:
0f03c2a4 512 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 513 if (r < 0)
0f03c2a4 514 return r;
88213476
LP
515 break;
516
1b9e5b12 517 case 'i':
0f03c2a4 518 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 519 if (r < 0)
0f03c2a4 520 return r;
ec16945e
LP
521 break;
522
523 case 'x':
524 arg_ephemeral = true;
1b9e5b12
LP
525 break;
526
687d0825 527 case 'u':
2fc09a9c
DM
528 r = free_and_strdup(&arg_user, optarg);
529 if (r < 0)
7027ff61 530 return log_oom();
687d0825 531
f757855e 532 arg_settings_mask |= SETTING_USER;
687d0825
MV
533 break;
534
22b28dfd
LP
535 case ARG_NETWORK_ZONE: {
536 char *j;
537
538 j = strappend("vz-", optarg);
539 if (!j)
540 return log_oom();
541
542 if (!ifname_valid(j)) {
543 log_error("Network zone name not valid: %s", j);
544 free(j);
545 return -EINVAL;
546 }
547
548 free(arg_network_zone);
549 arg_network_zone = j;
550
551 arg_network_veth = true;
552 arg_private_network = true;
553 arg_settings_mask |= SETTING_NETWORK;
554 break;
555 }
556
ab046dde 557 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
558
559 if (!ifname_valid(optarg)) {
560 log_error("Bridge interface name not valid: %s", optarg);
561 return -EINVAL;
562 }
563
f757855e
LP
564 r = free_and_strdup(&arg_network_bridge, optarg);
565 if (r < 0)
566 return log_oom();
ab046dde
TG
567
568 /* fall through */
569
0dfaa006 570 case 'n':
69c79d3c
LP
571 arg_network_veth = true;
572 arg_private_network = true;
f757855e 573 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
574 break;
575
f6d6bad1
LP
576 case ARG_NETWORK_VETH_EXTRA:
577 r = veth_extra_parse(&arg_network_veth_extra, optarg);
578 if (r < 0)
579 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
580
581 arg_private_network = true;
582 arg_settings_mask |= SETTING_NETWORK;
583 break;
584
aa28aefe 585 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
586
587 if (!ifname_valid(optarg)) {
588 log_error("Network interface name not valid: %s", optarg);
589 return -EINVAL;
590 }
591
c74e630d
LP
592 if (strv_extend(&arg_network_interfaces, optarg) < 0)
593 return log_oom();
594
595 arg_private_network = true;
f757855e 596 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
597 break;
598
599 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
600
601 if (!ifname_valid(optarg)) {
602 log_error("MACVLAN network interface name not valid: %s", optarg);
603 return -EINVAL;
604 }
605
c74e630d 606 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
607 return log_oom();
608
4bbfe7ad 609 arg_private_network = true;
f757855e 610 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
611 break;
612
613 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
614
615 if (!ifname_valid(optarg)) {
616 log_error("IPVLAN network interface name not valid: %s", optarg);
617 return -EINVAL;
618 }
619
4bbfe7ad
TG
620 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
621 return log_oom();
622
aa28aefe
LP
623 /* fall through */
624
ff01d048
LP
625 case ARG_PRIVATE_NETWORK:
626 arg_private_network = true;
f757855e 627 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
628 break;
629
0f0dbc46 630 case 'b':
7732f92b
LP
631 if (arg_start_mode == START_PID2) {
632 log_error("--boot and --as-pid2 may not be combined.");
633 return -EINVAL;
634 }
635
636 arg_start_mode = START_BOOT;
637 arg_settings_mask |= SETTING_START_MODE;
638 break;
639
640 case 'a':
641 if (arg_start_mode == START_BOOT) {
642 log_error("--boot and --as-pid2 may not be combined.");
643 return -EINVAL;
644 }
645
646 arg_start_mode = START_PID2;
647 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
648 break;
649
144f0fc0 650 case ARG_UUID:
9444b1f2 651 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
652 if (r < 0)
653 return log_error_errno(r, "Invalid UUID: %s", optarg);
654
655 if (sd_id128_is_null(arg_uuid)) {
656 log_error("Machine UUID may not be all zeroes.");
657 return -EINVAL;
aa96c6cb 658 }
f757855e
LP
659
660 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 661 break;
aa96c6cb 662
9444b1f2 663 case 'S':
c74e630d 664 arg_slice = optarg;
144f0fc0
LP
665 break;
666
7027ff61 667 case 'M':
c1521918 668 if (isempty(optarg))
97b11eed 669 arg_machine = mfree(arg_machine);
c1521918 670 else {
0c3c4284 671 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
672 log_error("Invalid machine name: %s", optarg);
673 return -EINVAL;
674 }
7027ff61 675
0c3c4284
LP
676 r = free_and_strdup(&arg_machine, optarg);
677 if (r < 0)
eb91eb18
LP
678 return log_oom();
679
680 break;
681 }
7027ff61 682
82adf6af
LP
683 case 'Z':
684 arg_selinux_context = optarg;
a8828ed9
DW
685 break;
686
82adf6af
LP
687 case 'L':
688 arg_selinux_apifs_context = optarg;
a8828ed9
DW
689 break;
690
bc2f673e
LP
691 case ARG_READ_ONLY:
692 arg_read_only = true;
f757855e 693 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
694 break;
695
420c7379
LP
696 case ARG_CAPABILITY:
697 case ARG_DROP_CAPABILITY: {
6cbe4ed1 698 p = optarg;
9ed794a3 699 for (;;) {
6cbe4ed1 700 _cleanup_free_ char *t = NULL;
5076f0cc 701
6cbe4ed1
SS
702 r = extract_first_word(&p, &t, ",", 0);
703 if (r < 0)
704 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 705
6cbe4ed1
SS
706 if (r == 0)
707 break;
5076f0cc 708
39ed67d1
LP
709 if (streq(t, "all")) {
710 if (c == ARG_CAPABILITY)
a42c8b54 711 plus = (uint64_t) -1;
39ed67d1 712 else
a42c8b54 713 minus = (uint64_t) -1;
39ed67d1 714 } else {
2822da4f
LP
715 int cap;
716
717 cap = capability_from_name(t);
718 if (cap < 0) {
39ed67d1
LP
719 log_error("Failed to parse capability %s.", t);
720 return -EINVAL;
721 }
722
723 if (c == ARG_CAPABILITY)
a42c8b54 724 plus |= 1ULL << (uint64_t) cap;
39ed67d1 725 else
a42c8b54 726 minus |= 1ULL << (uint64_t) cap;
5076f0cc 727 }
5076f0cc
LP
728 }
729
f757855e 730 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
731 break;
732 }
733
57fb9fb5
LP
734 case 'j':
735 arg_link_journal = LINK_GUEST;
574edc90 736 arg_link_journal_try = true;
57fb9fb5
LP
737 break;
738
739 case ARG_LINK_JOURNAL:
53e438e3 740 if (streq(optarg, "auto")) {
57fb9fb5 741 arg_link_journal = LINK_AUTO;
53e438e3
LP
742 arg_link_journal_try = false;
743 } else if (streq(optarg, "no")) {
57fb9fb5 744 arg_link_journal = LINK_NO;
53e438e3
LP
745 arg_link_journal_try = false;
746 } else if (streq(optarg, "guest")) {
57fb9fb5 747 arg_link_journal = LINK_GUEST;
53e438e3
LP
748 arg_link_journal_try = false;
749 } else if (streq(optarg, "host")) {
57fb9fb5 750 arg_link_journal = LINK_HOST;
53e438e3
LP
751 arg_link_journal_try = false;
752 } else if (streq(optarg, "try-guest")) {
574edc90
MP
753 arg_link_journal = LINK_GUEST;
754 arg_link_journal_try = true;
755 } else if (streq(optarg, "try-host")) {
756 arg_link_journal = LINK_HOST;
757 arg_link_journal_try = true;
758 } else {
57fb9fb5
LP
759 log_error("Failed to parse link journal mode %s", optarg);
760 return -EINVAL;
761 }
762
763 break;
764
17fe0523 765 case ARG_BIND:
f757855e
LP
766 case ARG_BIND_RO:
767 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
768 if (r < 0)
769 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 770
f757855e 771 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 772 break;
06c17c39 773
f757855e
LP
774 case ARG_TMPFS:
775 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
776 if (r < 0)
777 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 778
f757855e 779 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 780 break;
5a8af538
LP
781
782 case ARG_OVERLAY:
ad85779a
LP
783 case ARG_OVERLAY_RO:
784 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
785 if (r == -EADDRNOTAVAIL)
786 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
787 if (r < 0)
788 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 789
f757855e 790 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 791 break;
06c17c39 792
a5f1cb3b 793 case 'E': {
f4889f65
LP
794 char **n;
795
796 if (!env_assignment_is_valid(optarg)) {
797 log_error("Environment variable assignment '%s' is not valid.", optarg);
798 return -EINVAL;
799 }
800
801 n = strv_env_set(arg_setenv, optarg);
802 if (!n)
803 return log_oom();
804
805 strv_free(arg_setenv);
806 arg_setenv = n;
f757855e
LP
807
808 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
809 break;
810 }
811
284c0b91
LP
812 case 'q':
813 arg_quiet = true;
814 break;
815
8a96d94e 816 case ARG_SHARE_SYSTEM:
a6b5216c 817 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
818 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
819 arg_clone_ns_flags = 0;
8a96d94e
LP
820 break;
821
eb91eb18
LP
822 case ARG_REGISTER:
823 r = parse_boolean(optarg);
824 if (r < 0) {
825 log_error("Failed to parse --register= argument: %s", optarg);
826 return r;
827 }
828
829 arg_register = r;
830 break;
831
89f7c846
LP
832 case ARG_KEEP_UNIT:
833 arg_keep_unit = true;
834 break;
835
6afc95b7
LP
836 case ARG_PERSONALITY:
837
ac45f971 838 arg_personality = personality_from_string(optarg);
050f7277 839 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
840 log_error("Unknown or unsupported personality '%s'.", optarg);
841 return -EINVAL;
842 }
843
f757855e 844 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
845 break;
846
4d9f07b4
LP
847 case ARG_VOLATILE:
848
849 if (!optarg)
f757855e 850 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 851 else {
f757855e 852 VolatileMode m;
4d9f07b4 853
f757855e
LP
854 m = volatile_mode_from_string(optarg);
855 if (m < 0) {
856 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 857 return -EINVAL;
f757855e
LP
858 } else
859 arg_volatile_mode = m;
6d0b55c2
LP
860 }
861
f757855e
LP
862 arg_settings_mask |= SETTING_VOLATILE_MODE;
863 break;
6d0b55c2 864
f757855e
LP
865 case 'p':
866 r = expose_port_parse(&arg_expose_ports, optarg);
867 if (r == -EEXIST)
868 return log_error_errno(r, "Duplicate port specification: %s", optarg);
869 if (r < 0)
870 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 871
f757855e 872 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 873 break;
6d0b55c2 874
f36933fe
LP
875 case ARG_PROPERTY:
876 if (strv_extend(&arg_property, optarg) < 0)
877 return log_oom();
878
879 break;
880
ae209204
ZJS
881 case ARG_PRIVATE_USERS: {
882 int boolean = -1;
0de7acce 883
ae209204
ZJS
884 if (!optarg)
885 boolean = true;
886 else if (!in_charset(optarg, DIGITS))
887 /* do *not* parse numbers as booleans */
888 boolean = parse_boolean(optarg);
889
890 if (boolean == false) {
0de7acce
LP
891 /* no: User namespacing off */
892 arg_userns_mode = USER_NAMESPACE_NO;
893 arg_uid_shift = UID_INVALID;
894 arg_uid_range = UINT32_C(0x10000);
ae209204 895 } else if (boolean == true) {
0de7acce
LP
896 /* yes: User namespacing on, UID range is read from root dir */
897 arg_userns_mode = USER_NAMESPACE_FIXED;
898 arg_uid_shift = UID_INVALID;
899 arg_uid_range = UINT32_C(0x10000);
900 } else if (streq(optarg, "pick")) {
901 /* pick: User namespacing on, UID range is picked randomly */
902 arg_userns_mode = USER_NAMESPACE_PICK;
903 arg_uid_shift = UID_INVALID;
904 arg_uid_range = UINT32_C(0x10000);
905 } else {
6c2058b3 906 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
907 const char *range, *shift;
908
0de7acce
LP
909 /* anything else: User namespacing on, UID range is explicitly configured */
910
6dac160c
LP
911 range = strchr(optarg, ':');
912 if (range) {
6c2058b3
ZJS
913 buffer = strndup(optarg, range - optarg);
914 if (!buffer)
915 return log_oom();
916 shift = buffer;
6dac160c
LP
917
918 range++;
bfd292ec
ZJS
919 r = safe_atou32(range, &arg_uid_range);
920 if (r < 0)
be715731 921 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
922 } else
923 shift = optarg;
924
be715731
ZJS
925 r = parse_uid(shift, &arg_uid_shift);
926 if (r < 0)
927 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
928
929 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
930 }
931
be715731
ZJS
932 if (arg_uid_range <= 0) {
933 log_error("UID range cannot be 0.");
934 return -EINVAL;
935 }
936
0de7acce 937 arg_settings_mask |= SETTING_USERNS;
6dac160c 938 break;
ae209204 939 }
6dac160c 940
0de7acce 941 case 'U':
ccabee0d
LP
942 if (userns_supported()) {
943 arg_userns_mode = USER_NAMESPACE_PICK;
944 arg_uid_shift = UID_INVALID;
945 arg_uid_range = UINT32_C(0x10000);
946
947 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
948 }
949
7336138e
LP
950 break;
951
0de7acce 952 case ARG_PRIVATE_USERS_CHOWN:
19aac838 953 arg_userns_chown = true;
0de7acce
LP
954
955 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
956 break;
957
c6c8f6e2
LP
958 case ARG_KILL_SIGNAL:
959 arg_kill_signal = signal_from_string_try_harder(optarg);
960 if (arg_kill_signal < 0) {
961 log_error("Cannot parse signal: %s", optarg);
962 return -EINVAL;
963 }
964
f757855e
LP
965 arg_settings_mask |= SETTING_KILL_SIGNAL;
966 break;
967
968 case ARG_SETTINGS:
969
970 /* no → do not read files
971 * yes → read files, do not override cmdline, trust only subset
972 * override → read files, override cmdline, trust only subset
973 * trusted → read files, do not override cmdline, trust all
974 */
975
976 r = parse_boolean(optarg);
977 if (r < 0) {
978 if (streq(optarg, "trusted")) {
979 mask_all_settings = false;
980 mask_no_settings = false;
981 arg_settings_trusted = true;
982
983 } else if (streq(optarg, "override")) {
984 mask_all_settings = false;
985 mask_no_settings = true;
986 arg_settings_trusted = -1;
987 } else
988 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
989 } else if (r > 0) {
990 /* yes */
991 mask_all_settings = false;
992 mask_no_settings = false;
993 arg_settings_trusted = -1;
994 } else {
995 /* no */
996 mask_all_settings = true;
997 mask_no_settings = false;
998 arg_settings_trusted = false;
999 }
1000
c6c8f6e2
LP
1001 break;
1002
5f932eb9
LP
1003 case ARG_CHDIR:
1004 if (!path_is_absolute(optarg)) {
1005 log_error("Working directory %s is not an absolute path.", optarg);
1006 return -EINVAL;
1007 }
1008
1009 r = free_and_strdup(&arg_chdir, optarg);
1010 if (r < 0)
1011 return log_oom();
1012
1013 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1014 break;
1015
9c1e04d0
AP
1016 case ARG_NOTIFY_READY:
1017 r = parse_boolean(optarg);
1018 if (r < 0) {
1019 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1020 return -EINVAL;
1021 }
1022 arg_notify_ready = r;
1023 arg_settings_mask |= SETTING_NOTIFY_READY;
1024 break;
1025
4623e8e6
LP
1026 case ARG_ROOT_HASH: {
1027 void *k;
1028 size_t l;
1029
1030 r = unhexmem(optarg, strlen(optarg), &k, &l);
1031 if (r < 0)
1032 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1033 if (l < sizeof(sd_id128_t)) {
1034 log_error("Root hash must be at least 128bit long: %s", optarg);
1035 free(k);
1036 return -EINVAL;
1037 }
1038
1039 free(arg_root_hash);
1040 arg_root_hash = k;
1041 arg_root_hash_size = l;
1042 break;
1043 }
1044
88213476
LP
1045 case '?':
1046 return -EINVAL;
1047
1048 default:
eb9da376 1049 assert_not_reached("Unhandled option");
88213476 1050 }
88213476 1051
0c582db0
LB
1052 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1053 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1054 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1055 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1056
4f086aab
SU
1057 if (arg_userns_mode != USER_NAMESPACE_NO)
1058 arg_mount_settings |= MOUNT_USE_USERNS;
1059
1060 if (arg_private_network)
1061 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1062
1063 parse_mount_settings_env();
1064
48a8d337
LB
1065 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1066 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1067 arg_register = false;
0c582db0
LB
1068 if (arg_start_mode != START_PID1) {
1069 log_error("--boot cannot be used without namespacing.");
1070 return -EINVAL;
1071 }
1072 }
eb91eb18 1073
0de7acce 1074 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1075 arg_userns_chown = true;
1076
89f7c846
LP
1077 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1078 log_error("--keep-unit may not be used when invoked from a user session.");
1079 return -EINVAL;
1080 }
1081
1b9e5b12
LP
1082 if (arg_directory && arg_image) {
1083 log_error("--directory= and --image= may not be combined.");
1084 return -EINVAL;
1085 }
1086
ec16945e
LP
1087 if (arg_template && arg_image) {
1088 log_error("--template= and --image= may not be combined.");
1089 return -EINVAL;
1090 }
1091
8cd328d8
LP
1092 if (arg_ephemeral && arg_template && !arg_directory) {
1093 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1094 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1095 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1096 * --directory=". */
1097
1098 arg_directory = arg_template;
1099 arg_template = NULL;
1100 }
1101
ec16945e
LP
1102 if (arg_template && !(arg_directory || arg_machine)) {
1103 log_error("--template= needs --directory= or --machine=.");
1104 return -EINVAL;
1105 }
1106
1107 if (arg_ephemeral && arg_template) {
1108 log_error("--ephemeral and --template= may not be combined.");
1109 return -EINVAL;
1110 }
1111
df9a75e4
LP
1112 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1113 log_error("--ephemeral and --link-journal= may not be combined.");
1114 return -EINVAL;
1115 }
1116
ccabee0d 1117 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1118 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1119 return -EOPNOTSUPP;
1120 }
1121
1122 if (arg_userns_chown && arg_read_only) {
1123 log_error("--read-only and --private-users-chown may not be combined.");
1124 return -EINVAL;
1125 }
f757855e 1126
22b28dfd
LP
1127 if (arg_network_bridge && arg_network_zone) {
1128 log_error("--network-bridge= and --network-zone= may not be combined.");
1129 return -EINVAL;
1130 }
1131
f757855e
LP
1132 if (argc > optind) {
1133 arg_parameters = strv_copy(argv + optind);
1134 if (!arg_parameters)
1135 return log_oom();
1136
7732f92b 1137 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1138 }
1139
1140 /* Load all settings from .nspawn files */
1141 if (mask_no_settings)
1142 arg_settings_mask = 0;
1143
1144 /* Don't load any settings from .nspawn files */
1145 if (mask_all_settings)
1146 arg_settings_mask = _SETTINGS_MASK_ALL;
1147
520e0d54 1148 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1149
6aadfa4c
ILG
1150 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1151 if (e)
1152 arg_container_service_name = e;
1153
5a8ff0e6
CB
1154 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1155 if (r < 0)
1156 arg_use_cgns = cg_ns_supported();
1157 else
1158 arg_use_cgns = r;
1159
86c0dd4a
LP
1160 r = custom_mount_check_all();
1161 if (r < 0)
1162 return r;
1163
f757855e
LP
1164 return 1;
1165}
1166
1167static int verify_arguments(void) {
4f086aab
SU
1168 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1169 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1170 return -EINVAL;
1171 }
1172
1173 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1174 log_error("Cannot combine --private-users with read-write mounts.");
1175 return -EINVAL;
1176 }
f757855e
LP
1177
1178 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1179 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1180 return -EINVAL;
1181 }
1182
6d0b55c2
LP
1183 if (arg_expose_ports && !arg_private_network) {
1184 log_error("Cannot use --port= without private networking.");
1185 return -EINVAL;
1186 }
1187
1c1ea217
EV
1188#ifndef HAVE_LIBIPTC
1189 if (arg_expose_ports) {
1190 log_error("--port= is not supported, compiled without libiptc support.");
1191 return -EOPNOTSUPP;
1192 }
1193#endif
1194
7732f92b 1195 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1196 arg_kill_signal = SIGRTMIN+3;
1197
f757855e 1198 return 0;
88213476
LP
1199}
1200
03cfe0d5
LP
1201static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1202 assert(p);
1203
0de7acce 1204 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1205 return 0;
1206
1207 if (uid == UID_INVALID && gid == GID_INVALID)
1208 return 0;
1209
1210 if (uid != UID_INVALID) {
1211 uid += arg_uid_shift;
1212
1213 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1214 return -EOVERFLOW;
1215 }
1216
1217 if (gid != GID_INVALID) {
1218 gid += (gid_t) arg_uid_shift;
1219
1220 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1221 return -EOVERFLOW;
1222 }
1223
1224 if (lchown(p, uid, gid) < 0)
1225 return -errno;
b12afc8c
LP
1226
1227 return 0;
1228}
1229
03cfe0d5
LP
/* Creates directory "path" below "root" with the given mode and, if it was
 * newly created, hands it to userns_lchown() with uid/gid.
 *
 * A directory entry that already exists is left untouched and reported as
 * success (0). Other mkdir() failures are returned as -errno. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1242
e58a1277 1243static int setup_timezone(const char *dest) {
03cfe0d5
LP
1244 _cleanup_free_ char *p = NULL, *q = NULL;
1245 const char *where, *check, *what;
d4036145
LP
1246 char *z, *y;
1247 int r;
f8440af5 1248
e58a1277
LP
1249 assert(dest);
1250
1251 /* Fix the timezone, if possible */
d4036145
LP
1252 r = readlink_malloc("/etc/localtime", &p);
1253 if (r < 0) {
0b493a02
MP
1254 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1255 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1256 * with a symbolic link to a time zone data file.
0b493a02
MP
1257 *
1258 * Example:
21dc0227 1259 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1260 */
d4036145
LP
1261 return 0;
1262 }
1263
1264 z = path_startswith(p, "../usr/share/zoneinfo/");
1265 if (!z)
1266 z = path_startswith(p, "/usr/share/zoneinfo/");
1267 if (!z) {
1268 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1269 return 0;
1270 }
1271
03cfe0d5 1272 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1273 r = readlink_malloc(where, &q);
1274 if (r >= 0) {
1275 y = path_startswith(q, "../usr/share/zoneinfo/");
1276 if (!y)
1277 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1278
d4036145
LP
1279 /* Already pointing to the right place? Then do nothing .. */
1280 if (y && streq(y, z))
1281 return 0;
1282 }
1283
03cfe0d5 1284 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1285 check = prefix_roota(dest, check);
03cfe0d5 1286 if (laccess(check, F_OK) < 0) {
d4036145
LP
1287 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1288 return 0;
1289 }
68fb0892 1290
79d80fc1
TG
1291 r = unlink(where);
1292 if (r < 0 && errno != ENOENT) {
56f64d95 1293 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1294 return 0;
1295 }
4d9f07b4 1296
03cfe0d5 1297 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1298 if (symlink(what, where) < 0) {
56f64d95 1299 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1300 return 0;
1301 }
e58a1277 1302
03cfe0d5
LP
1303 r = userns_lchown(where, 0, 0);
1304 if (r < 0)
1305 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1306
e58a1277 1307 return 0;
88213476
LP
1308}
1309
2547bb41 1310static int setup_resolv_conf(const char *dest) {
03cfe0d5 1311 const char *where = NULL;
79d80fc1 1312 int r;
2547bb41
LP
1313
1314 assert(dest);
1315
1316 if (arg_private_network)
1317 return 0;
1318
1319 /* Fix resolv.conf, if possible */
03cfe0d5 1320 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1321
7debb05d
CH
1322 if (access("/run/systemd/resolve/resolv.conf", F_OK) >= 0 &&
1323 access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
3539724c
LP
1324 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1325 * container, so that the container can use the host's resolver. Given that network namespacing is
1326 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1327 * advantage that the container will be able to follow the host's DNS server configuration changes
1328 * transparently. */
1329
5367354d
FB
1330 (void) touch(where);
1331
60e76d48
ZJS
1332 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1333 if (r >= 0)
1334 return mount_verbose(LOG_ERR, NULL, where, NULL,
1335 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1336 }
1337
1338 /* If that didn't work, let's copy the file */
f2068bcc 1339 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1340 if (r < 0) {
3539724c
LP
1341 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1342 * resolved or something similar runs inside and the symlink points there.
68a313c5 1343 *
3539724c 1344 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1345 */
1346 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1347 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1348 return 0;
1349 }
2547bb41 1350
03cfe0d5
LP
1351 r = userns_lchown(where, 0, 0);
1352 if (r < 0)
3539724c 1353 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1354
2547bb41
LP
1355 return 0;
1356}
1357
04bc4a3f 1358static int setup_boot_id(const char *dest) {
3bbaff3e 1359 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1360 const char *from, *to;
04bc4a3f
LP
1361 int r;
1362
04bc4a3f
LP
1363 /* Generate a new randomized boot ID, so that each boot-up of
1364 * the container gets a new one */
1365
03cfe0d5
LP
1366 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1367 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1368
1369 r = sd_id128_randomize(&rnd);
f647962d
MS
1370 if (r < 0)
1371 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1372
15b1248a 1373 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1374 if (r < 0)
1375 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1376
60e76d48
ZJS
1377 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1378 if (r >= 0)
1379 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1380 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1381
3bbaff3e 1382 (void) unlink(from);
04bc4a3f
LP
1383 return r;
1384}
1385
e58a1277 1386static int copy_devnodes(const char *dest) {
88213476
LP
1387
1388 static const char devnodes[] =
1389 "null\0"
1390 "zero\0"
1391 "full\0"
1392 "random\0"
1393 "urandom\0"
85614d66
TG
1394 "tty\0"
1395 "net/tun\0";
88213476
LP
1396
1397 const char *d;
e58a1277 1398 int r = 0;
7fd1b19b 1399 _cleanup_umask_ mode_t u;
a258bf26
LP
1400
1401 assert(dest);
124640f1
LP
1402
1403 u = umask(0000);
88213476 1404
03cfe0d5
LP
1405 /* Create /dev/net, so that we can create /dev/net/tun in it */
1406 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1407 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1408
88213476 1409 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1410 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1411 struct stat st;
88213476 1412
7f112f50 1413 from = strappend("/dev/", d);
03cfe0d5 1414 to = prefix_root(dest, from);
88213476
LP
1415
1416 if (stat(from, &st) < 0) {
1417
4a62c710
MS
1418 if (errno != ENOENT)
1419 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1420
a258bf26 1421 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1422
03cfe0d5 1423 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1424 return -EIO;
a258bf26 1425
85614d66 1426 } else {
81f5049b 1427 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1428 /*
1429 * This is some sort of protection too against
1430 * recursive userns chown on shared /dev/
1431 */
1432 if (errno == EEXIST)
1433 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1434 if (errno != EPERM)
1435 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1436
1437 /* Some systems abusively restrict mknod but
1438 * allow bind mounts. */
1439 r = touch(to);
1440 if (r < 0)
1441 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1442 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1443 if (r < 0)
1444 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1445 }
6278cf60 1446
03cfe0d5
LP
1447 r = userns_lchown(to, 0, 0);
1448 if (r < 0)
1449 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1450 }
88213476
LP
1451 }
1452
e58a1277
LP
1453 return r;
1454}
88213476 1455
03cfe0d5
LP
1456static int setup_pts(const char *dest) {
1457 _cleanup_free_ char *options = NULL;
1458 const char *p;
709f6e46 1459 int r;
03cfe0d5
LP
1460
1461#ifdef HAVE_SELINUX
1462 if (arg_selinux_apifs_context)
1463 (void) asprintf(&options,
3dce8915 1464 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1465 arg_uid_shift + TTY_GID,
1466 arg_selinux_apifs_context);
1467 else
1468#endif
1469 (void) asprintf(&options,
3dce8915 1470 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1471 arg_uid_shift + TTY_GID);
f2d88580 1472
03cfe0d5 1473 if (!options)
f2d88580
LP
1474 return log_oom();
1475
03cfe0d5 1476 /* Mount /dev/pts itself */
cc9fce65 1477 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1478 if (mkdir(p, 0755) < 0)
1479 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1480 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1481 if (r < 0)
1482 return r;
709f6e46
MS
1483 r = userns_lchown(p, 0, 0);
1484 if (r < 0)
1485 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1486
1487 /* Create /dev/ptmx symlink */
1488 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1489 if (symlink("pts/ptmx", p) < 0)
1490 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1491 r = userns_lchown(p, 0, 0);
1492 if (r < 0)
1493 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1494
03cfe0d5
LP
1495 /* And fix /dev/pts/ptmx ownership */
1496 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1497 r = userns_lchown(p, 0, 0);
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1500
f2d88580
LP
1501 return 0;
1502}
1503
e58a1277 1504static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1505 _cleanup_umask_ mode_t u;
1506 const char *to;
e58a1277 1507 int r;
e58a1277
LP
1508
1509 assert(dest);
1510 assert(console);
1511
1512 u = umask(0000);
1513
03cfe0d5 1514 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1515 if (r < 0)
1516 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1517
a258bf26
LP
1518 /* We need to bind mount the right tty to /dev/console since
1519 * ptys can only exist on pts file systems. To have something
81f5049b 1520 * to bind mount things on we create a empty regular file. */
a258bf26 1521
03cfe0d5 1522 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1523 r = touch(to);
1524 if (r < 0)
1525 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1526
60e76d48 1527 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1528}
1529
1530static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1531 const char *from, *to;
7fd1b19b 1532 _cleanup_umask_ mode_t u;
d9603714 1533 int fd, r;
e58a1277 1534
e58a1277 1535 assert(kmsg_socket >= 0);
a258bf26 1536
e58a1277 1537 u = umask(0000);
a258bf26 1538
03cfe0d5 1539 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1540 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1541 * on the reading side behave very similar to /proc/kmsg,
1542 * their writing side behaves differently from /dev/kmsg in
1543 * that writing blocks when nothing is reading. In order to
1544 * avoid any problems with containers deadlocking due to this
1545 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1546 from = prefix_roota(dest, "/run/kmsg");
1547 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1548
4a62c710 1549 if (mkfifo(from, 0600) < 0)
03cfe0d5 1550 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1551 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1552 if (r < 0)
1553 return r;
e58a1277
LP
1554
1555 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1556 if (fd < 0)
1557 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1558
e58a1277
LP
1559 /* Store away the fd in the socket, so that it stays open as
1560 * long as we run the child */
3ee897d6 1561 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1562 safe_close(fd);
e58a1277 1563
d9603714
DH
1564 if (r < 0)
1565 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1566
03cfe0d5
LP
1567 /* And now make the FIFO unavailable as /run/kmsg... */
1568 (void) unlink(from);
1569
25ea79fe 1570 return 0;
88213476
LP
1571}
1572
1c4baffc 1573static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1574 union in_addr_union *exposed = userdata;
1575
1576 assert(rtnl);
1577 assert(m);
1578 assert(exposed);
1579
7a8f6325 1580 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1581 return 0;
1582}
1583
3a74cea5 1584static int setup_hostname(void) {
3a74cea5 1585
0c582db0 1586 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1587 return 0;
1588
605f81a8 1589 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1590 return -errno;
3a74cea5 1591
7027ff61 1592 return 0;
3a74cea5
LP
1593}
1594
57fb9fb5 1595static int setup_journal(const char *directory) {
e01ff70a 1596 sd_id128_t this_id;
0f5e1382 1597 _cleanup_free_ char *d = NULL;
e01ff70a 1598 const char *p, *q;
8054d749 1599 bool try;
e01ff70a 1600 char id[33];
57fb9fb5
LP
1601 int r;
1602
df9a75e4
LP
1603 /* Don't link journals in ephemeral mode */
1604 if (arg_ephemeral)
1605 return 0;
1606
8054d749
LP
1607 if (arg_link_journal == LINK_NO)
1608 return 0;
1609
1610 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1611
4d680aee 1612 r = sd_id128_get_machine(&this_id);
f647962d
MS
1613 if (r < 0)
1614 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1615
e01ff70a 1616 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1617 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1618 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1619 if (try)
4d680aee 1620 return 0;
df9a75e4 1621 return -EEXIST;
4d680aee
ZJS
1622 }
1623
03cfe0d5
LP
1624 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1625 if (r < 0)
1626 return log_error_errno(r, "Failed to create /var: %m");
1627
1628 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1629 if (r < 0)
1630 return log_error_errno(r, "Failed to create /var/log: %m");
1631
1632 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1633 if (r < 0)
1634 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1635
e01ff70a
MS
1636 (void) sd_id128_to_string(arg_uuid, id);
1637
03cfe0d5
LP
1638 p = strjoina("/var/log/journal/", id);
1639 q = prefix_roota(directory, p);
27407a01 1640
e1873695 1641 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1642 if (try)
1643 return 0;
27407a01 1644
8054d749
LP
1645 log_error("%s: already a mount point, refusing to use for journal", p);
1646 return -EEXIST;
57fb9fb5
LP
1647 }
1648
e1873695 1649 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1650 if (try)
1651 return 0;
57fb9fb5 1652
8054d749
LP
1653 log_error("%s: already a mount point, refusing to use for journal", q);
1654 return -EEXIST;
57fb9fb5
LP
1655 }
1656
1657 r = readlink_and_make_absolute(p, &d);
1658 if (r >= 0) {
1659 if ((arg_link_journal == LINK_GUEST ||
1660 arg_link_journal == LINK_AUTO) &&
1661 path_equal(d, q)) {
1662
03cfe0d5 1663 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1664 if (r < 0)
709f6e46 1665 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1666 return 0;
57fb9fb5
LP
1667 }
1668
4a62c710
MS
1669 if (unlink(p) < 0)
1670 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1671 } else if (r == -EINVAL) {
1672
1673 if (arg_link_journal == LINK_GUEST &&
1674 rmdir(p) < 0) {
1675
27407a01
ZJS
1676 if (errno == ENOTDIR) {
1677 log_error("%s already exists and is neither a symlink nor a directory", p);
1678 return r;
4314d33f
MS
1679 } else
1680 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1681 }
4314d33f
MS
1682 } else if (r != -ENOENT)
1683 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1684
1685 if (arg_link_journal == LINK_GUEST) {
1686
1687 if (symlink(q, p) < 0) {
8054d749 1688 if (try) {
56f64d95 1689 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1690 return 0;
4314d33f
MS
1691 } else
1692 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1693 }
1694
03cfe0d5 1695 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1696 if (r < 0)
709f6e46 1697 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1698 return 0;
57fb9fb5
LP
1699 }
1700
1701 if (arg_link_journal == LINK_HOST) {
ccddd104 1702 /* don't create parents here — if the host doesn't have
574edc90 1703 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1704
1705 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1706 if (try) {
56f64d95 1707 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1708 return 0;
4314d33f
MS
1709 } else
1710 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1711 }
1712
27407a01
ZJS
1713 } else if (access(p, F_OK) < 0)
1714 return 0;
57fb9fb5 1715
cdb2b9d0
LP
1716 if (dir_is_empty(q) == 0)
1717 log_warning("%s is not empty, proceeding anyway.", q);
1718
03cfe0d5 1719 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1720 if (r < 0)
1721 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1722
60e76d48
ZJS
1723 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1724 if (r < 0)
4a62c710 1725 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1726
27407a01 1727 return 0;
57fb9fb5
LP
1728}
1729
88213476 1730static int drop_capabilities(void) {
520e0d54 1731 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1732}
1733
db999e0f
LP
1734static int reset_audit_loginuid(void) {
1735 _cleanup_free_ char *p = NULL;
1736 int r;
1737
0c582db0 1738 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1739 return 0;
1740
1741 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1742 if (r == -ENOENT)
db999e0f 1743 return 0;
f647962d
MS
1744 if (r < 0)
1745 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1746
1747 /* Already reset? */
1748 if (streq(p, "4294967295"))
1749 return 0;
1750
ad118bda 1751 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1752 if (r < 0) {
10a87006
LP
1753 log_error_errno(r,
1754 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1755 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1756 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1757 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1758 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1759
db999e0f 1760 sleep(5);
77b6e194 1761 }
db999e0f
LP
1762
1763 return 0;
77b6e194
LP
1764}
1765
24fb1112 1766
785890ac
LP
1767static int setup_propagate(const char *root) {
1768 const char *p, *q;
709f6e46 1769 int r;
785890ac
LP
1770
1771 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1772 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1773 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1774 (void) mkdir_p(p, 0600);
1775
709f6e46
MS
1776 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1777 if (r < 0)
1778 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1779
709f6e46
MS
1780 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1781 if (r < 0)
1782 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1783
709f6e46
MS
1784 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1785 if (r < 0)
1786 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1787
03cfe0d5 1788 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1789 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1790 if (r < 0)
1791 return r;
785890ac 1792
60e76d48
ZJS
1793 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1794 if (r < 0)
1795 return r;
785890ac 1796
19caffac
AC
1797 /* machined will MS_MOVE into that directory, and that's only
1798 * supported for non-shared mounts. */
60e76d48 1799 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1800}
1801
317feb4d 1802static int setup_machine_id(const char *directory) {
691675ba
LP
1803 const char *etc_machine_id;
1804 sd_id128_t id;
3bbaff3e 1805 int r;
e01ff70a 1806
317feb4d
LP
1807 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1808 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1809 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1810 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1811 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1812 * container behaves nicely). */
1813
e01ff70a
MS
1814 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1815
691675ba 1816 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
1817 if (r < 0) {
1818 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1819 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 1820
317feb4d
LP
1821 if (sd_id128_is_null(arg_uuid)) {
1822 r = sd_id128_randomize(&arg_uuid);
1823 if (r < 0)
1824 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1825 }
1826 } else {
1827 if (sd_id128_is_null(id)) {
1828 log_error("Machine ID in container image is zero, refusing.");
1829 return -EINVAL;
1830 }
e01ff70a 1831
317feb4d
LP
1832 arg_uuid = id;
1833 }
691675ba 1834
e01ff70a
MS
1835 return 0;
1836}
1837
7336138e
LP
1838static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1839 int r;
1840
1841 assert(directory);
1842
0de7acce 1843 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
1844 return 0;
1845
1846 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1847 if (r == -EOPNOTSUPP)
1848 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1849 if (r == -EBADE)
1850 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1851 if (r < 0)
1852 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1853 if (r == 0)
1854 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1855 else
1856 log_debug("Patched directory tree to match UID/GID range.");
1857
1858 return r;
1859}
1860
113cea80 1861/*
6d416b9c
LS
1862 * Return values:
1863 * < 0 : wait_for_terminate() failed to get the state of the
1864 * container, the container was terminated by a signal, or
1865 * failed for an unknown reason. No change is made to the
1866 * container argument.
1867 * > 0 : The program executed in the container terminated with an
1868 * error. The exit code of the program executed in the
919699ec
LP
1869 * container is returned. The container argument has been set
1870 * to CONTAINER_TERMINATED.
6d416b9c
LS
1871 * 0 : The container is being rebooted, has been shut down or exited
1872 * successfully. The container argument has been set to either
1873 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 1874 *
6d416b9c
LS
1875 * That is, success is indicated by a return value of zero, and an
1876 * error is indicated by a non-zero value.
113cea80
DH
1877 */
1878static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 1879 siginfo_t status;
919699ec 1880 int r;
113cea80
DH
1881
1882 r = wait_for_terminate(pid, &status);
f647962d
MS
1883 if (r < 0)
1884 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
1885
1886 switch (status.si_code) {
fddbb89c 1887
113cea80 1888 case CLD_EXITED:
b5a2179b 1889 if (status.si_status == 0)
919699ec 1890 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 1891 else
919699ec 1892 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 1893
919699ec
LP
1894 *container = CONTAINER_TERMINATED;
1895 return status.si_status;
113cea80
DH
1896
1897 case CLD_KILLED:
1898 if (status.si_status == SIGINT) {
919699ec 1899 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 1900 *container = CONTAINER_TERMINATED;
919699ec
LP
1901 return 0;
1902
113cea80 1903 } else if (status.si_status == SIGHUP) {
919699ec 1904 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 1905 *container = CONTAINER_REBOOTED;
919699ec 1906 return 0;
113cea80 1907 }
919699ec 1908
113cea80
DH
1909 /* CLD_KILLED fallthrough */
1910
1911 case CLD_DUMPED:
fddbb89c 1912 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 1913 return -EIO;
113cea80
DH
1914
1915 default:
fddbb89c 1916 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 1917 return -EIO;
113cea80 1918 }
113cea80
DH
1919}
1920
023fb90b
LP
1921static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1922 pid_t pid;
1923
4a0b58c4 1924 pid = PTR_TO_PID(userdata);
023fb90b 1925 if (pid > 0) {
c6c8f6e2 1926 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
1927 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1928 sd_event_source_set_userdata(s, NULL);
1929 return 0;
1930 }
1931 }
1932
1933 sd_event_exit(sd_event_source_get_event(s), 0);
1934 return 0;
1935}
1936
6916b164
AU
1937static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
1938 for (;;) {
1939 siginfo_t si = {};
1940 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
1941 return log_error_errno(errno, "Failed to waitid(): %m");
1942 if (si.si_pid == 0) /* No pending children. */
1943 break;
1944 if (si.si_pid == PTR_TO_PID(userdata)) {
1945 /* The main process we care for has exited. Return from
1946 * signal handler but leave the zombie. */
1947 sd_event_exit(sd_event_source_get_event(s), 0);
1948 break;
1949 }
1950 /* Reap all other children. */
1951 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
1952 }
1953
1954 return 0;
1955}
1956
ec16945e 1957static int determine_names(void) {
1b9cebf6 1958 int r;
ec16945e 1959
c1521918
LP
1960 if (arg_template && !arg_directory && arg_machine) {
1961
1962 /* If --template= was specified then we should not
1963 * search for a machine, but instead create a new one
1964 * in /var/lib/machine. */
1965
605405c6 1966 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
1967 if (!arg_directory)
1968 return log_oom();
1969 }
1970
ec16945e 1971 if (!arg_image && !arg_directory) {
1b9cebf6
LP
1972 if (arg_machine) {
1973 _cleanup_(image_unrefp) Image *i = NULL;
1974
1975 r = image_find(arg_machine, &i);
1976 if (r < 0)
1977 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 1978 if (r == 0) {
1b9cebf6
LP
1979 log_error("No image for machine '%s': %m", arg_machine);
1980 return -ENOENT;
1981 }
1982
aceac2f0 1983 if (i->type == IMAGE_RAW)
0f03c2a4 1984 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 1985 else
0f03c2a4 1986 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 1987 if (r < 0)
0f3be6ca 1988 return log_oom();
1b9cebf6 1989
aee327b8
LP
1990 if (!arg_ephemeral)
1991 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 1992 } else
ec16945e
LP
1993 arg_directory = get_current_dir_name();
1994
0f3be6ca 1995 if (!arg_directory && !arg_image) {
1b9cebf6 1996 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
1997 return -EINVAL;
1998 }
1999 }
2000
2001 if (!arg_machine) {
4827ab48 2002
b9ba4dab
LP
2003 if (arg_directory && path_equal(arg_directory, "/"))
2004 arg_machine = gethostname_malloc();
4827ab48
LP
2005 else {
2006 if (arg_image) {
2007 char *e;
2008
2009 arg_machine = strdup(basename(arg_image));
2010
2011 /* Truncate suffix if there is one */
2012 e = endswith(arg_machine, ".raw");
2013 if (e)
2014 *e = 0;
2015 } else
2016 arg_machine = strdup(basename(arg_directory));
2017 }
ec16945e
LP
2018 if (!arg_machine)
2019 return log_oom();
2020
ae691c1d 2021 hostname_cleanup(arg_machine);
ec16945e
LP
2022 if (!machine_name_is_valid(arg_machine)) {
2023 log_error("Failed to determine machine name automatically, please use -M.");
2024 return -EINVAL;
2025 }
b9ba4dab
LP
2026
2027 if (arg_ephemeral) {
2028 char *b;
2029
2030 /* Add a random suffix when this is an
2031 * ephemeral machine, so that we can run many
2032 * instances at once without manually having
2033 * to specify -M each time. */
2034
2035 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2036 return log_oom();
2037
2038 free(arg_machine);
2039 arg_machine = b;
2040 }
ec16945e
LP
2041 }
2042
2043 return 0;
2044}
2045
/* Resolves the path stored in *p via chase_symlinks() (with the given flags) and replaces *p with
 * the resolved, newly allocated path, freeing the old string. A NULL *p is left alone.
 *
 * Returns 0 on success, a negative errno-style value if resolution fails (*p is then unchanged). */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p) {
                r = chase_symlinks(*p, NULL, flags, &resolved);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve path %s: %m", *p);

                free(*p);
                *p = resolved;
        }

        return 0;
}
2064
03cfe0d5 2065static int determine_uid_shift(const char *directory) {
6dac160c
LP
2066 int r;
2067
0de7acce 2068 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2069 arg_uid_shift = 0;
6dac160c 2070 return 0;
03cfe0d5 2071 }
6dac160c
LP
2072
2073 if (arg_uid_shift == UID_INVALID) {
2074 struct stat st;
2075
03cfe0d5 2076 r = stat(directory, &st);
6dac160c 2077 if (r < 0)
03cfe0d5 2078 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2079
2080 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2081
2082 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2083 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2084 return -EINVAL;
2085 }
2086
2087 arg_uid_range = UINT32_C(0x10000);
2088 }
2089
2090 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2091 log_error("UID base too high for UID range.");
2092 return -EINVAL;
2093 }
2094
6dac160c
LP
2095 return 0;
2096}
2097
03cfe0d5
LP
2098static int inner_child(
2099 Barrier *barrier,
2100 const char *directory,
2101 bool secondary,
2102 int kmsg_socket,
2103 int rtnl_socket,
f757855e 2104 FDSet *fds) {
69c79d3c 2105
03cfe0d5 2106 _cleanup_free_ char *home = NULL;
e01ff70a 2107 char as_uuid[37];
6aadfa4c 2108 unsigned n_env = 1;
03cfe0d5
LP
2109 const char *envp[] = {
2110 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2111 NULL, /* container */
03cfe0d5
LP
2112 NULL, /* TERM */
2113 NULL, /* HOME */
2114 NULL, /* USER */
2115 NULL, /* LOGNAME */
2116 NULL, /* container_uuid */
2117 NULL, /* LISTEN_FDS */
2118 NULL, /* LISTEN_PID */
9c1e04d0 2119 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2120 NULL
2121 };
88213476 2122
2371271c 2123 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2124 int r;
88213476 2125
03cfe0d5
LP
2126 assert(barrier);
2127 assert(directory);
2128 assert(kmsg_socket >= 0);
88213476 2129
efdb0237
LP
2130 cg_unified_flush();
2131
0de7acce 2132 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2133 /* Tell the parent, that it now can write the UID map. */
2134 (void) barrier_place(barrier); /* #1 */
7027ff61 2135
03cfe0d5
LP
2136 /* Wait until the parent wrote the UID map */
2137 if (!barrier_place_and_sync(barrier)) { /* #2 */
2138 log_error("Parent died too early");
2139 return -ESRCH;
2140 }
88213476
LP
2141 }
2142
6d66bd3b
EV
2143 r = reset_uid_gid();
2144 if (r < 0)
2145 return log_error_errno(r, "Couldn't become new root: %m");
2146
0de7acce 2147 r = mount_all(NULL,
4f086aab 2148 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2149 arg_uid_shift,
2150 arg_uid_range,
2151 arg_selinux_apifs_context);
2152
03cfe0d5
LP
2153 if (r < 0)
2154 return r;
2155
4f086aab 2156 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2157 if (r < 0)
2158 return r;
2159
03cfe0d5
LP
2160 /* Wait until we are cgroup-ified, so that we
2161 * can mount the right cgroup path writable */
2162 if (!barrier_place_and_sync(barrier)) { /* #3 */
2163 log_error("Parent died too early");
2164 return -ESRCH;
88213476
LP
2165 }
2166
5a8ff0e6 2167 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2168 r = unshare(CLONE_NEWCGROUP);
2169 if (r < 0)
2170 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2171 r = mount_cgroups(
2172 "",
2173 arg_unified_cgroup_hierarchy,
2174 arg_userns_mode != USER_NAMESPACE_NO,
2175 arg_uid_shift,
2176 arg_uid_range,
5a8ff0e6 2177 arg_selinux_apifs_context,
ada54120 2178 true);
0996ef00
CB
2179 if (r < 0)
2180 return r;
2181 } else {
2182 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2183 if (r < 0)
2184 return r;
2185 }
ec16945e 2186
03cfe0d5
LP
2187 r = setup_boot_id(NULL);
2188 if (r < 0)
2189 return r;
ec16945e 2190
03cfe0d5
LP
2191 r = setup_kmsg(NULL, kmsg_socket);
2192 if (r < 0)
2193 return r;
2194 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2195
03cfe0d5 2196 umask(0022);
30535c16 2197
03cfe0d5
LP
2198 if (setsid() < 0)
2199 return log_error_errno(errno, "setsid() failed: %m");
2200
2201 if (arg_private_network)
2202 loopback_setup();
2203
7a8f6325
LP
2204 if (arg_expose_ports) {
2205 r = expose_port_send_rtnl(rtnl_socket);
2206 if (r < 0)
2207 return r;
2208 rtnl_socket = safe_close(rtnl_socket);
2209 }
03cfe0d5 2210
709f6e46
MS
2211 r = drop_capabilities();
2212 if (r < 0)
2213 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2214
2215 setup_hostname();
2216
050f7277 2217 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2218 if (personality(arg_personality) < 0)
2219 return log_error_errno(errno, "personality() failed: %m");
2220 } else if (secondary) {
2221 if (personality(PER_LINUX32) < 0)
2222 return log_error_errno(errno, "personality() failed: %m");
2223 }
2224
2225#ifdef HAVE_SELINUX
2226 if (arg_selinux_context)
2ed96880 2227 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2228 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2229#endif
2230
ee645080 2231 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2232 if (r < 0)
2233 return r;
2234
6aadfa4c
ILG
2235 /* LXC sets container=lxc, so follow the scheme here */
2236 envp[n_env++] = strjoina("container=", arg_container_service_name);
2237
03cfe0d5
LP
2238 envp[n_env] = strv_find_prefix(environ, "TERM=");
2239 if (envp[n_env])
313cefa1 2240 n_env++;
03cfe0d5
LP
2241
2242 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2243 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2244 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2245 return log_oom();
2246
3bbaff3e 2247 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2248
691675ba 2249 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2250 return log_oom();
03cfe0d5
LP
2251
2252 if (fdset_size(fds) > 0) {
2253 r = fdset_cloexec(fds, false);
2254 if (r < 0)
2255 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2256
2257 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2258 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2259 return log_oom();
2260 }
9c1e04d0
AP
2261 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2262 return log_oom();
03cfe0d5 2263
2371271c
TG
2264 env_use = strv_env_merge(2, envp, arg_setenv);
2265 if (!env_use)
2266 return log_oom();
03cfe0d5
LP
2267
2268 /* Let the parent know that we are ready and
2269 * wait until the parent is ready with the
2270 * setup, too... */
2271 if (!barrier_place_and_sync(barrier)) { /* #4 */
2272 log_error("Parent died too early");
2273 return -ESRCH;
2274 }
2275
5f932eb9
LP
2276 if (arg_chdir)
2277 if (chdir(arg_chdir) < 0)
2278 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2279
7732f92b 2280 if (arg_start_mode == START_PID2) {
75bf701f 2281 r = stub_pid1(arg_uuid);
7732f92b
LP
2282 if (r < 0)
2283 return r;
2284 }
2285
03cfe0d5
LP
2286 /* Now, explicitly close the log, so that we
2287 * then can close all remaining fds. Closing
2288 * the log explicitly first has the benefit
2289 * that the logging subsystem knows about it,
2290 * and is thus ready to be reopened should we
2291 * need it again. Note that the other fds
2292 * closed here are at least the locking and
2293 * barrier fds. */
2294 log_close();
2295 (void) fdset_close_others(fds);
2296
7732f92b 2297 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2298 char **a;
2299 size_t m;
2300
2301 /* Automatically search for the init system */
2302
75f32f04
ZJS
2303 m = strv_length(arg_parameters);
2304 a = newa(char*, m + 2);
2305 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2306 a[1 + m] = NULL;
03cfe0d5
LP
2307
2308 a[0] = (char*) "/usr/lib/systemd/systemd";
2309 execve(a[0], a, env_use);
2310
2311 a[0] = (char*) "/lib/systemd/systemd";
2312 execve(a[0], a, env_use);
2313
2314 a[0] = (char*) "/sbin/init";
2315 execve(a[0], a, env_use);
f757855e
LP
2316 } else if (!strv_isempty(arg_parameters))
2317 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2318 else {
5f932eb9 2319 if (!arg_chdir)
d929b0f9
ZJS
2320 /* If we cannot change the directory, we'll end up in /, that is expected. */
2321 (void) chdir(home ?: "/root");
5f932eb9 2322
03cfe0d5
LP
2323 execle("/bin/bash", "-bash", NULL, env_use);
2324 execle("/bin/sh", "-sh", NULL, env_use);
2325 }
2326
35607a8d 2327 r = -errno;
03cfe0d5 2328 (void) log_open();
35607a8d 2329 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2330}
2331
9c1e04d0
AP
2332static int setup_sd_notify_child(void) {
2333 static const int one = 1;
2334 int fd = -1;
2335 union sockaddr_union sa = {
2336 .sa.sa_family = AF_UNIX,
2337 };
2338 int r;
2339
2340 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2341 if (fd < 0)
2342 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2343
2344 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2345 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2346
2347 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2348 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2349 if (r < 0) {
2350 safe_close(fd);
2351 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2352 }
2353
2354 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2355 if (r < 0) {
2356 safe_close(fd);
2357 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2358 }
2359
2360 return fd;
2361}
2362
03cfe0d5
LP
2363static int outer_child(
2364 Barrier *barrier,
2365 const char *directory,
2366 const char *console,
2d845785 2367 DissectedImage *dissected_image,
03cfe0d5
LP
2368 bool interactive,
2369 bool secondary,
2370 int pid_socket,
e01ff70a 2371 int uuid_socket,
9c1e04d0 2372 int notify_socket,
03cfe0d5
LP
2373 int kmsg_socket,
2374 int rtnl_socket,
825d5287 2375 int uid_shift_socket,
f757855e 2376 FDSet *fds) {
03cfe0d5
LP
2377
2378 pid_t pid;
2379 ssize_t l;
2380 int r;
9c1e04d0 2381 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2382
2383 assert(barrier);
2384 assert(directory);
2385 assert(console);
2386 assert(pid_socket >= 0);
e01ff70a 2387 assert(uuid_socket >= 0);
9c1e04d0 2388 assert(notify_socket >= 0);
03cfe0d5
LP
2389 assert(kmsg_socket >= 0);
2390
efdb0237
LP
2391 cg_unified_flush();
2392
03cfe0d5
LP
2393 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2394 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2395
2396 if (interactive) {
2397 close_nointr(STDIN_FILENO);
2398 close_nointr(STDOUT_FILENO);
2399 close_nointr(STDERR_FILENO);
2400
2401 r = open_terminal(console, O_RDWR);
2402 if (r != STDIN_FILENO) {
2403 if (r >= 0) {
2404 safe_close(r);
2405 r = -EINVAL;
2406 }
2407
2408 return log_error_errno(r, "Failed to open console: %m");
2409 }
2410
2411 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2412 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2413 return log_error_errno(errno, "Failed to duplicate console: %m");
2414 }
2415
2416 r = reset_audit_loginuid();
2417 if (r < 0)
2418 return r;
2419
2420 /* Mark everything as slave, so that we still
2421 * receive mounts from the real root, but don't
2422 * propagate mounts to the real root. */
60e76d48
ZJS
2423 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2424 if (r < 0)
2425 return r;
03cfe0d5 2426
2d845785 2427 if (dissected_image) {
18b5886e 2428 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2429 if (r < 0)
2430 return r;
2431 }
03cfe0d5 2432
391567f4
LP
2433 r = determine_uid_shift(directory);
2434 if (r < 0)
2435 return r;
2436
0fd9563f
ZJS
2437 r = detect_unified_cgroup_hierarchy(directory);
2438 if (r < 0)
2439 return r;
2440
0de7acce 2441 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2442 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2443 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2444 if (l < 0)
2445 return log_error_errno(errno, "Failed to send UID shift: %m");
2446 if (l != sizeof(arg_uid_shift)) {
2447 log_error("Short write while sending UID shift.");
2448 return -EIO;
2449 }
0e7ac751 2450
0de7acce 2451 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2452 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2453 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2454 * not it will pick a different one, and send it back to us. */
2455
2456 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2457 if (l < 0)
2458 return log_error_errno(errno, "Failed to recv UID shift: %m");
2459 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2460 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2461 return -EIO;
2462 }
2463 }
2464
2465 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2466 }
2467
03cfe0d5 2468 /* Turn directory into bind mount */
60e76d48
ZJS
2469 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2470 if (r < 0)
2471 return r;
03cfe0d5 2472
19caffac
AC
2473 /* Mark everything as shared so our mounts get propagated down. This is
2474 * required to make new bind mounts available in systemd services
2475 * inside the containter that create a new mount namespace.
2476 * See https://github.com/systemd/systemd/issues/3860
2477 * Further submounts (such as /dev) done after this will inherit the
2478 * shared propagation mode.*/
60e76d48
ZJS
2479 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2480 if (r < 0)
2481 return r;
19caffac 2482
7336138e 2483 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
2484 if (r < 0)
2485 return r;
2486
0de7acce
LP
2487 r = setup_volatile(
2488 directory,
2489 arg_volatile_mode,
2490 arg_userns_mode != USER_NAMESPACE_NO,
2491 arg_uid_shift,
2492 arg_uid_range,
2493 arg_selinux_context);
03cfe0d5
LP
2494 if (r < 0)
2495 return r;
2496
0de7acce
LP
2497 r = setup_volatile_state(
2498 directory,
2499 arg_volatile_mode,
2500 arg_userns_mode != USER_NAMESPACE_NO,
2501 arg_uid_shift,
2502 arg_uid_range,
2503 arg_selinux_context);
03cfe0d5
LP
2504 if (r < 0)
2505 return r;
2506
03cfe0d5
LP
2507 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2508 if (r < 0)
2509 return r;
2510
03cfe0d5 2511 if (arg_read_only) {
6b7c9f8b 2512 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2513 if (r < 0)
2514 return log_error_errno(r, "Failed to make tree read-only: %m");
2515 }
2516
0de7acce 2517 r = mount_all(directory,
4f086aab 2518 arg_mount_settings,
0de7acce
LP
2519 arg_uid_shift,
2520 arg_uid_range,
2521 arg_selinux_apifs_context);
03cfe0d5
LP
2522 if (r < 0)
2523 return r;
2524
07fa00f9
LP
2525 r = copy_devnodes(directory);
2526 if (r < 0)
03cfe0d5
LP
2527 return r;
2528
2529 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2530
07fa00f9
LP
2531 r = setup_pts(directory);
2532 if (r < 0)
03cfe0d5
LP
2533 return r;
2534
2535 r = setup_propagate(directory);
2536 if (r < 0)
2537 return r;
2538
2539 r = setup_dev_console(directory, console);
2540 if (r < 0)
2541 return r;
2542
520e0d54 2543 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
2544 if (r < 0)
2545 return r;
2546
2547 r = setup_timezone(directory);
2548 if (r < 0)
2549 return r;
2550
2551 r = setup_resolv_conf(directory);
2552 if (r < 0)
2553 return r;
2554
e01ff70a
MS
2555 r = setup_machine_id(directory);
2556 if (r < 0)
2557 return r;
2558
03cfe0d5
LP
2559 r = setup_journal(directory);
2560 if (r < 0)
2561 return r;
2562
0de7acce
LP
2563 r = mount_custom(
2564 directory,
2565 arg_custom_mounts,
2566 arg_n_custom_mounts,
2567 arg_userns_mode != USER_NAMESPACE_NO,
2568 arg_uid_shift,
2569 arg_uid_range,
2570 arg_selinux_apifs_context);
03cfe0d5
LP
2571 if (r < 0)
2572 return r;
2573
5a8ff0e6 2574 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2575 r = mount_cgroups(
2576 directory,
2577 arg_unified_cgroup_hierarchy,
2578 arg_userns_mode != USER_NAMESPACE_NO,
2579 arg_uid_shift,
2580 arg_uid_range,
5a8ff0e6 2581 arg_selinux_apifs_context,
ada54120 2582 false);
0996ef00
CB
2583 if (r < 0)
2584 return r;
2585 }
03cfe0d5
LP
2586
2587 r = mount_move_root(directory);
2588 if (r < 0)
2589 return log_error_errno(r, "Failed to move root directory: %m");
2590
9c1e04d0
AP
2591 fd = setup_sd_notify_child();
2592 if (fd < 0)
2593 return fd;
2594
03cfe0d5 2595 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2596 arg_clone_ns_flags |
03cfe0d5 2597 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2598 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2599 if (pid < 0)
2600 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2601 if (pid == 0) {
2602 pid_socket = safe_close(pid_socket);
e01ff70a 2603 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2604 notify_socket = safe_close(notify_socket);
825d5287 2605 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2606
2607 /* The inner child has all namespaces that are
2608 * requested, so that we all are owned by the user if
2609 * user namespaces are turned on. */
2610
f757855e 2611 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2612 if (r < 0)
2613 _exit(EXIT_FAILURE);
2614
2615 _exit(EXIT_SUCCESS);
2616 }
2617
2618 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2619 if (l < 0)
2620 return log_error_errno(errno, "Failed to send PID: %m");
2621 if (l != sizeof(pid)) {
2622 log_error("Short write while sending PID.");
2623 return -EIO;
2624 }
2625
e01ff70a
MS
2626 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2627 if (l < 0)
2628 return log_error_errno(errno, "Failed to send machine ID: %m");
2629 if (l != sizeof(arg_uuid)) {
2630 log_error("Short write while sending machine ID.");
2631 return -EIO;
2632 }
2633
9c1e04d0
AP
2634 l = send_one_fd(notify_socket, fd, 0);
2635 if (l < 0)
2636 return log_error_errno(errno, "Failed to send notify fd: %m");
2637
03cfe0d5 2638 pid_socket = safe_close(pid_socket);
e01ff70a 2639 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2640 notify_socket = safe_close(notify_socket);
327e26d6
KN
2641 kmsg_socket = safe_close(kmsg_socket);
2642 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2643
2644 return 0;
2645}
2646
0e7ac751
LP
2647static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2648 unsigned n_tries = 100;
2649 uid_t candidate;
2650 int r;
2651
2652 assert(shift);
2653 assert(ret_lock_file);
0de7acce 2654 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2655 assert(arg_uid_range == 0x10000U);
2656
2657 candidate = *shift;
2658
2659 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2660
2661 for (;;) {
2662 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2663 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2664
2665 if (--n_tries <= 0)
2666 return -EBUSY;
2667
2668 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2669 goto next;
2670 if ((candidate & UINT32_C(0xFFFF)) != 0)
2671 goto next;
2672
2673 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2674 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2675 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2676 goto next;
2677 if (r < 0)
2678 return r;
2679
2680 /* Make some superficial checks whether the range is currently known in the user database */
2681 if (getpwuid(candidate))
2682 goto next;
2683 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2684 goto next;
2685 if (getgrgid(candidate))
2686 goto next;
2687 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2688 goto next;
2689
2690 *ret_lock_file = lf;
2691 lf = (struct LockFile) LOCK_FILE_INIT;
2692 *shift = candidate;
2693 return 0;
2694
2695 next:
2696 random_bytes(&candidate, sizeof(candidate));
2697 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2698 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2699 }
2700}
2701
03cfe0d5
LP
2702static int setup_uid_map(pid_t pid) {
2703 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2704 int r;
2705
2706 assert(pid > 1);
2707
2708 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2709 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2710 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2711 if (r < 0)
2712 return log_error_errno(r, "Failed to write UID map: %m");
2713
2714 /* We always assign the same UID and GID ranges */
2715 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2716 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2717 if (r < 0)
2718 return log_error_errno(r, "Failed to write GID map: %m");
2719
2720 return 0;
2721}
2722
9c1e04d0 2723static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
2724 char buf[NOTIFY_BUFFER_MAX+1];
2725 char *p = NULL;
2726 struct iovec iovec = {
2727 .iov_base = buf,
2728 .iov_len = sizeof(buf)-1,
2729 };
2730 union {
2731 struct cmsghdr cmsghdr;
2732 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2733 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2734 } control = {};
2735 struct msghdr msghdr = {
2736 .msg_iov = &iovec,
2737 .msg_iovlen = 1,
2738 .msg_control = &control,
2739 .msg_controllen = sizeof(control),
2740 };
2741 struct cmsghdr *cmsg;
2742 struct ucred *ucred = NULL;
2743 ssize_t n;
2744 pid_t inner_child_pid;
2745 _cleanup_strv_free_ char **tags = NULL;
2746
2747 assert(userdata);
2748
2749 inner_child_pid = PTR_TO_PID(userdata);
2750
2751 if (revents != EPOLLIN) {
2752 log_warning("Got unexpected poll event for notify fd.");
2753 return 0;
2754 }
2755
2756 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2757 if (n < 0) {
2758 if (errno == EAGAIN || errno == EINTR)
2759 return 0;
2760
2761 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2762 }
2763 cmsg_close_all(&msghdr);
2764
2765 CMSG_FOREACH(cmsg, &msghdr) {
2766 if (cmsg->cmsg_level == SOL_SOCKET &&
2767 cmsg->cmsg_type == SCM_CREDENTIALS &&
2768 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2769
2770 ucred = (struct ucred*) CMSG_DATA(cmsg);
2771 }
2772 }
2773
2774 if (!ucred || ucred->pid != inner_child_pid) {
2775 log_warning("Received notify message without valid credentials. Ignoring.");
2776 return 0;
2777 }
2778
2779 if ((size_t) n >= sizeof(buf)) {
2780 log_warning("Received notify message exceeded maximum size. Ignoring.");
2781 return 0;
2782 }
2783
2784 buf[n] = 0;
2785 tags = strv_split(buf, "\n\r");
2786 if (!tags)
2787 return log_oom();
2788
2789 if (strv_find(tags, "READY=1"))
2790 sd_notifyf(false, "READY=1\n");
2791
2792 p = strv_find_startswith(tags, "STATUS=");
2793 if (p)
2794 sd_notifyf(false, "STATUS=Container running: %s", p);
2795
2796 return 0;
2797}
2798
5773024d 2799static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 2800 int r;
9c1e04d0 2801
5773024d 2802 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
2803 if (r < 0)
2804 return log_error_errno(r, "Failed to allocate notify event source: %m");
2805
5773024d 2806 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
2807
2808 return 0;
2809}
2810
f757855e
LP
2811static int load_settings(void) {
2812 _cleanup_(settings_freep) Settings *settings = NULL;
2813 _cleanup_fclose_ FILE *f = NULL;
2814 _cleanup_free_ char *p = NULL;
2815 const char *fn, *i;
2816 int r;
2817
2818 /* If all settings are masked, there's no point in looking for
2819 * the settings file */
2820 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2821 return 0;
2822
2823 fn = strjoina(arg_machine, ".nspawn");
2824
2825 /* We first look in the admin's directories in /etc and /run */
2826 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2827 _cleanup_free_ char *j = NULL;
2828
605405c6 2829 j = strjoin(i, "/", fn);
f757855e
LP
2830 if (!j)
2831 return log_oom();
2832
2833 f = fopen(j, "re");
2834 if (f) {
2835 p = j;
2836 j = NULL;
2837
b938cb90 2838 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2839 if (arg_settings_trusted < 0)
2840 arg_settings_trusted = true;
2841
2842 break;
2843 }
2844
2845 if (errno != ENOENT)
2846 return log_error_errno(errno, "Failed to open %s: %m", j);
2847 }
2848
2849 if (!f) {
2850 /* After that, let's look for a file next to the
2851 * actual image we shall boot. */
2852
2853 if (arg_image) {
2854 p = file_in_same_dir(arg_image, fn);
2855 if (!p)
2856 return log_oom();
2857 } else if (arg_directory) {
2858 p = file_in_same_dir(arg_directory, fn);
2859 if (!p)
2860 return log_oom();
2861 }
2862
2863 if (p) {
2864 f = fopen(p, "re");
2865 if (!f && errno != ENOENT)
2866 return log_error_errno(errno, "Failed to open %s: %m", p);
2867
b938cb90 2868 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2869 if (arg_settings_trusted < 0)
2870 arg_settings_trusted = false;
2871 }
2872 }
2873
2874 if (!f)
2875 return 0;
2876
2877 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2878
2879 r = settings_load(f, p, &settings);
2880 if (r < 0)
2881 return r;
2882
2883 /* Copy over bits from the settings, unless they have been
2884 * explicitly masked by command line switches. */
2885
7732f92b
LP
2886 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2887 settings->start_mode >= 0) {
2888 arg_start_mode = settings->start_mode;
f757855e
LP
2889
2890 strv_free(arg_parameters);
2891 arg_parameters = settings->parameters;
2892 settings->parameters = NULL;
2893 }
2894
5f932eb9
LP
2895 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2896 settings->working_directory) {
2897 free(arg_chdir);
2898 arg_chdir = settings->working_directory;
2899 settings->working_directory = NULL;
2900 }
2901
f757855e
LP
2902 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2903 settings->environment) {
2904 strv_free(arg_setenv);
2905 arg_setenv = settings->environment;
2906 settings->environment = NULL;
2907 }
2908
2909 if ((arg_settings_mask & SETTING_USER) == 0 &&
2910 settings->user) {
2911 free(arg_user);
2912 arg_user = settings->user;
2913 settings->user = NULL;
2914 }
2915
2916 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2917 uint64_t plus;
f757855e 2918
0e265674
LP
2919 plus = settings->capability;
2920 if (settings_private_network(settings))
2921 plus |= (1ULL << CAP_NET_ADMIN);
2922
2923 if (!arg_settings_trusted && plus != 0) {
2924 if (settings->capability != 0)
2925 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2926 } else
520e0d54 2927 arg_caps_retain |= plus;
f757855e 2928
520e0d54 2929 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
2930 }
2931
2932 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2933 settings->kill_signal > 0)
2934 arg_kill_signal = settings->kill_signal;
2935
2936 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2937 settings->personality != PERSONALITY_INVALID)
2938 arg_personality = settings->personality;
2939
2940 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2941 !sd_id128_is_null(settings->machine_id)) {
2942
2943 if (!arg_settings_trusted)
2944 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2945 else
2946 arg_uuid = settings->machine_id;
2947 }
2948
2949 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2950 settings->read_only >= 0)
2951 arg_read_only = settings->read_only;
2952
2953 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2954 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2955 arg_volatile_mode = settings->volatile_mode;
2956
2957 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2958 settings->n_custom_mounts > 0) {
2959
2960 if (!arg_settings_trusted)
2961 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2962 else {
2963 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2964 arg_custom_mounts = settings->custom_mounts;
2965 arg_n_custom_mounts = settings->n_custom_mounts;
2966
2967 settings->custom_mounts = NULL;
2968 settings->n_custom_mounts = 0;
2969 }
2970 }
2971
2972 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2973 (settings->private_network >= 0 ||
2974 settings->network_veth >= 0 ||
2975 settings->network_bridge ||
22b28dfd 2976 settings->network_zone ||
f757855e
LP
2977 settings->network_interfaces ||
2978 settings->network_macvlan ||
f6d6bad1
LP
2979 settings->network_ipvlan ||
2980 settings->network_veth_extra)) {
f757855e
LP
2981
2982 if (!arg_settings_trusted)
2983 log_warning("Ignoring network settings, file %s is not trusted.", p);
2984 else {
f6d6bad1 2985 arg_network_veth = settings_network_veth(settings);
0e265674
LP
2986 arg_private_network = settings_private_network(settings);
2987
f757855e
LP
2988 strv_free(arg_network_interfaces);
2989 arg_network_interfaces = settings->network_interfaces;
2990 settings->network_interfaces = NULL;
2991
2992 strv_free(arg_network_macvlan);
2993 arg_network_macvlan = settings->network_macvlan;
2994 settings->network_macvlan = NULL;
2995
2996 strv_free(arg_network_ipvlan);
2997 arg_network_ipvlan = settings->network_ipvlan;
2998 settings->network_ipvlan = NULL;
2999
f6d6bad1
LP
3000 strv_free(arg_network_veth_extra);
3001 arg_network_veth_extra = settings->network_veth_extra;
3002 settings->network_veth_extra = NULL;
3003
f757855e
LP
3004 free(arg_network_bridge);
3005 arg_network_bridge = settings->network_bridge;
3006 settings->network_bridge = NULL;
22b28dfd
LP
3007
3008 free(arg_network_zone);
3009 arg_network_zone = settings->network_zone;
3010 settings->network_zone = NULL;
f757855e
LP
3011 }
3012 }
3013
3014 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3015 settings->expose_ports) {
3016
3017 if (!arg_settings_trusted)
3018 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3019 else {
3020 expose_port_free_all(arg_expose_ports);
3021 arg_expose_ports = settings->expose_ports;
3022 settings->expose_ports = NULL;
3023 }
3024 }
3025
0de7acce
LP
3026 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3027 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3028
3029 if (!arg_settings_trusted)
3030 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3031 else {
3032 arg_userns_mode = settings->userns_mode;
3033 arg_uid_shift = settings->uid_shift;
3034 arg_uid_range = settings->uid_range;
3035 arg_userns_chown = settings->userns_chown;
3036 }
3037 }
3038
9c1e04d0
AP
3039 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3040 arg_notify_ready = settings->notify_ready;
3041
f757855e
LP
3042 return 0;
3043}
3044
b0067625
ZJS
3045static int run(int master,
3046 const char* console,
2d845785 3047 DissectedImage *dissected_image,
b0067625
ZJS
3048 bool interactive,
3049 bool secondary,
3050 FDSet *fds,
3051 char veth_name[IFNAMSIZ], bool *veth_created,
3052 union in_addr_union *exposed,
3053 pid_t *pid, int *ret) {
3054
3055 static const struct sigaction sa = {
3056 .sa_handler = nop_signal_handler,
e28c7cd0 3057 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3058 };
3059
3060 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3061 _cleanup_close_ int etc_passwd_lock = -1;
3062 _cleanup_close_pair_ int
3063 kmsg_socket_pair[2] = { -1, -1 },
3064 rtnl_socket_pair[2] = { -1, -1 },
3065 pid_socket_pair[2] = { -1, -1 },
3066 uuid_socket_pair[2] = { -1, -1 },
3067 notify_socket_pair[2] = { -1, -1 },
3068 uid_shift_socket_pair[2] = { -1, -1 };
3069 _cleanup_close_ int notify_socket= -1;
3070 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3071 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3072 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3073 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3074 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3075 ContainerStatus container_status = 0;
3076 char last_char = 0;
3077 int ifi = 0, r;
3078 ssize_t l;
3079 sigset_t mask_chld;
3080
3081 assert_se(sigemptyset(&mask_chld) == 0);
3082 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3083
3084 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3085 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3086 * check with getpwuid() if the specific user already exists. Note that /etc might be
3087 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3088 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3089 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3090 * really ours. */
3091
3092 etc_passwd_lock = take_etc_passwd_lock(NULL);
3093 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3094 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3095 }
3096
3097 r = barrier_create(&barrier);
3098 if (r < 0)
3099 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3100
3101 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3102 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3103
3104 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3105 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3106
3107 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3108 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3109
3110 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3111 return log_error_errno(errno, "Failed to create id socket pair: %m");
3112
3113 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3114 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3115
3116 if (arg_userns_mode != USER_NAMESPACE_NO)
3117 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3118 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3119
3120 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3121 * parent's blocking calls and give it a chance to call wait() and terminate. */
3122 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3123 if (r < 0)
3124 return log_error_errno(errno, "Failed to change the signal mask: %m");
3125
3126 r = sigaction(SIGCHLD, &sa, NULL);
3127 if (r < 0)
3128 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3129
3130 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3131 if (*pid < 0)
3132 return log_error_errno(errno, "clone() failed%s: %m",
3133 errno == EINVAL ?
3134 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3135
3136 if (*pid == 0) {
3137 /* The outer child only has a file system namespace. */
3138 barrier_set_role(&barrier, BARRIER_CHILD);
3139
3140 master = safe_close(master);
3141
3142 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3143 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3144 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3145 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3146 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3147 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3148
3149 (void) reset_all_signal_handlers();
3150 (void) reset_signal_mask();
3151
3152 r = outer_child(&barrier,
3153 arg_directory,
3154 console,
2d845785 3155 dissected_image,
b0067625
ZJS
3156 interactive,
3157 secondary,
3158 pid_socket_pair[1],
3159 uuid_socket_pair[1],
3160 notify_socket_pair[1],
3161 kmsg_socket_pair[1],
3162 rtnl_socket_pair[1],
3163 uid_shift_socket_pair[1],
3164 fds);
3165 if (r < 0)
3166 _exit(EXIT_FAILURE);
3167
3168 _exit(EXIT_SUCCESS);
3169 }
3170
3171 barrier_set_role(&barrier, BARRIER_PARENT);
3172
3173 fds = fdset_free(fds);
3174
3175 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3176 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3177 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3178 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3179 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3180 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3181
3182 if (arg_userns_mode != USER_NAMESPACE_NO) {
3183 /* The child just let us know the UID shift it might have read from the image. */
3184 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3185 if (l < 0)
3186 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3187 if (l != sizeof arg_uid_shift) {
3188 log_error("Short read while reading UID shift.");
3189 return -EIO;
3190 }
3191
3192 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3193 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3194 * image, but if that's already in use, pick a new one, and report back to the child,
3195 * which one we now picked. */
3196
3197 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3198 if (r < 0)
3199 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3200
3201 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3202 if (l < 0)
3203 return log_error_errno(errno, "Failed to send UID shift: %m");
3204 if (l != sizeof arg_uid_shift) {
3205 log_error("Short write while writing UID shift.");
3206 return -EIO;
3207 }
3208 }
3209 }
3210
3211 /* Wait for the outer child. */
3212 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3213 if (r != 0)
3214 return r < 0 ? r : -EIO;
3215
3216 /* And now retrieve the PID of the inner child. */
3217 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3218 if (l < 0)
3219 return log_error_errno(errno, "Failed to read inner child PID: %m");
3220 if (l != sizeof *pid) {
3221 log_error("Short read while reading inner child PID.");
3222 return -EIO;
3223 }
3224
3225 /* We also retrieve container UUID in case it was generated by outer child */
3226 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3227 if (l < 0)
3228 return log_error_errno(errno, "Failed to read container machine ID: %m");
3229 if (l != sizeof(arg_uuid)) {
3230 log_error("Short read while reading container machined ID.");
3231 return -EIO;
3232 }
3233
3234 /* We also retrieve the socket used for notifications generated by outer child */
3235 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3236 if (notify_socket < 0)
3237 return log_error_errno(notify_socket,
3238 "Failed to receive notification socket from the outer child: %m");
3239
3240 log_debug("Init process invoked as PID "PID_FMT, *pid);
3241
3242 if (arg_userns_mode != USER_NAMESPACE_NO) {
3243 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3244 log_error("Child died too early.");
3245 return -ESRCH;
3246 }
3247
3248 r = setup_uid_map(*pid);
3249 if (r < 0)
3250 return r;
3251
3252 (void) barrier_place(&barrier); /* #2 */
3253 }
3254
3255 if (arg_private_network) {
3256
3257 r = move_network_interfaces(*pid, arg_network_interfaces);
3258 if (r < 0)
3259 return r;
3260
3261 if (arg_network_veth) {
3262 r = setup_veth(arg_machine, *pid, veth_name,
3263 arg_network_bridge || arg_network_zone);
3264 if (r < 0)
3265 return r;
3266 else if (r > 0)
3267 ifi = r;
3268
3269 if (arg_network_bridge) {
3270 /* Add the interface to a bridge */
3271 r = setup_bridge(veth_name, arg_network_bridge, false);
3272 if (r < 0)
3273 return r;
3274 if (r > 0)
3275 ifi = r;
3276 } else if (arg_network_zone) {
3277 /* Add the interface to a bridge, possibly creating it */
3278 r = setup_bridge(veth_name, arg_network_zone, true);
3279 if (r < 0)
3280 return r;
3281 if (r > 0)
3282 ifi = r;
3283 }
3284 }
3285
3286 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3287 if (r < 0)
3288 return r;
3289
3290 /* We created the primary and extra veth links now; let's remember this, so that we know to
3291 remove them later on. Note that we don't bother with removing veth links that were created
3292 here when their setup failed half-way, because in that case the kernel should be able to
3293 remove them on its own, since they cannot be referenced by anything yet. */
3294 *veth_created = true;
3295
3296 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3297 if (r < 0)
3298 return r;
3299
3300 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3301 if (r < 0)
3302 return r;
3303 }
3304
3305 if (arg_register) {
3306 r = register_machine(
3307 arg_machine,
3308 *pid,
3309 arg_directory,
3310 arg_uuid,
3311 ifi,
3312 arg_slice,
3313 arg_custom_mounts, arg_n_custom_mounts,
3314 arg_kill_signal,
3315 arg_property,
3316 arg_keep_unit,
3317 arg_container_service_name);
3318 if (r < 0)
3319 return r;
3320 }
3321
f0bef277 3322 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3323 if (r < 0)
3324 return r;
3325
3326 if (arg_keep_unit) {
3327 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3328 if (r < 0)
3329 return r;
3330 }
3331
3332 r = chown_cgroup(*pid, arg_uid_shift);
3333 if (r < 0)
3334 return r;
3335
3336 /* Notify the child that the parent is ready with all
3337 * its setup (including cgroup-ification), and that
3338 * the child can now hand over control to the code to
3339 * run inside the container. */
3340 (void) barrier_place(&barrier); /* #3 */
3341
3342 /* Block SIGCHLD here, before notifying child.
3343 * process_pty() will handle it with the other signals. */
3344 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3345
3346 /* Reset signal to default */
3347 r = default_signals(SIGCHLD, -1);
3348 if (r < 0)
3349 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3350
3351 r = sd_event_new(&event);
3352 if (r < 0)
3353 return log_error_errno(r, "Failed to get default event source: %m");
3354
5773024d 3355 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3356 if (r < 0)
3357 return r;
3358
3359 /* Let the child know that we are ready and wait that the child is completely ready now. */
3360 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3361 log_error("Child died too early.");
3362 return -ESRCH;
3363 }
3364
3365 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3366 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3367 etc_passwd_lock = safe_close(etc_passwd_lock);
3368
3369 sd_notifyf(false,
3370 "STATUS=Container running.\n"
3371 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3372 if (!arg_notify_ready)
3373 sd_notify(false, "READY=1\n");
3374
3375 if (arg_kill_signal > 0) {
3376 /* Try to kill the init system on SIGINT or SIGTERM */
3377 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3378 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3379 } else {
3380 /* Immediately exit */
3381 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3382 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3383 }
3384
6916b164
AU
3385 /* Exit when the child exits */
3386 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3387
3388 if (arg_expose_ports) {
3389 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3390 if (r < 0)
3391 return r;
3392
3393 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3394 }
3395
3396 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3397
3398 r = pty_forward_new(event, master,
3399 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3400 &forward);
3401 if (r < 0)
3402 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3403
3404 r = sd_event_loop(event);
3405 if (r < 0)
3406 return log_error_errno(r, "Failed to run event loop: %m");
3407
3408 pty_forward_get_last_char(forward, &last_char);
3409
3410 forward = pty_forward_free(forward);
3411
3412 if (!arg_quiet && last_char != '\n')
3413 putc('\n', stdout);
3414
3415 /* Kill if it is not dead yet anyway */
3416 if (arg_register && !arg_keep_unit)
3417 terminate_machine(*pid);
3418
3419 /* Normally redundant, but better safe than sorry */
c67b0082 3420 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3421
3422 r = wait_for_container(*pid, &container_status);
3423 *pid = 0;
3424
3425 if (r < 0)
3426 /* We failed to wait for the container, or the container exited abnormally. */
3427 return r;
3428 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3429 /* r > 0 → The container exited with a non-zero status.
3430 * As a special case, we need to replace 133 with a different value,
3431 * because 133 is special-cased in the service file to reboot the container.
3432 * otherwise → The container exited with zero status and a reboot was not requested.
3433 */
2a49b612 3434 if (r == EXIT_FORCE_RESTART)
27e29a1e 3435 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3436 *ret = r;
b0067625
ZJS
3437 return 0; /* finito */
3438 }
3439
3440 /* CONTAINER_REBOOTED, loop again */
3441
3442 if (arg_keep_unit) {
3443 /* Special handling if we are running as a service: instead of simply
3444 * restarting the machine we want to restart the entire service, so let's
3445 * inform systemd about this with the special exit code 133. The service
3446 * file uses RestartForceExitStatus=133 so that this results in a full
3447 * nspawn restart. This is necessary since we might have cgroup parameters
3448 * set we want to have flushed out. */
2a49b612
ZJS
3449 *ret = EXIT_FORCE_RESTART;
3450 return 0; /* finito */
b0067625
ZJS
3451 }
3452
3453 expose_port_flush(arg_expose_ports, exposed);
3454
3455 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3456 *veth_created = false;
3457 return 1; /* loop again */
3458}
3459
4623e8e6
LP
3460static int load_root_hash(const char *image) {
3461 _cleanup_free_ char *text = NULL;
3462 char *fn, *n, *e;
3463 void *k;
3464 size_t l;
3465 int r;
3466
3467 assert_se(image);
3468
3469 /* Try to load the root hash from a file next to the image file if it exists. */
3470
3471 if (arg_root_hash)
3472 return 0;
3473
3474 fn = new(char, strlen(image) + strlen(".roothash") + 1);
3475 if (!fn)
3476 return log_oom();
3477
3478 n = stpcpy(fn, image);
3479 e = endswith(fn, ".raw");
3480 if (e)
3481 n = e;
3482
3483 strcpy(n, ".roothash");
3484
3485 r = read_one_line_file(fn, &text);
3486 if (r == -ENOENT)
3487 return 0;
3488 if (r < 0) {
3489 log_warning_errno(r, "Failed to read %s, ignoring: %m", fn);
3490 return 0;
3491 }
3492
3493 r = unhexmem(text, strlen(text), &k, &l);
3494 if (r < 0)
3495 return log_error_errno(r, "Invalid root hash: %s", text);
3496 if (l < sizeof(sd_id128_t)) {
3497 free(k);
3498 return log_error_errno(r, "Root hash too short: %s", text);
3499 }
3500
3501 arg_root_hash = k;
3502 arg_root_hash_size = l;
3503
3504 return 0;
3505}
3506
03cfe0d5
LP
3507int main(int argc, char *argv[]) {
3508
2d845785
LP
3509 _cleanup_free_ char *console = NULL;
3510 _cleanup_close_ int master = -1;
03cfe0d5 3511 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 3512 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 3513 char veth_name[IFNAMSIZ] = "";
17cbb288 3514 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 3515 pid_t pid = 0;
03cfe0d5
LP
3516 union in_addr_union exposed = {};
3517 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
3518 bool interactive, veth_created = false, remove_tmprootdir = false;
3519 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 3520 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
3521 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3522 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
3523
3524 log_parse_environment();
3525 log_open();
3526
7732f92b
LP
3527 /* Make sure rename_process() in the stub init process can work */
3528 saved_argv = argv;
3529 saved_argc = argc;
3530
03cfe0d5
LP
3531 r = parse_argv(argc, argv);
3532 if (r <= 0)
3533 goto finish;
3534
03cfe0d5
LP
3535 if (geteuid() != 0) {
3536 log_error("Need to be root.");
3537 r = -EPERM;
3538 goto finish;
3539 }
f757855e
LP
3540 r = determine_names();
3541 if (r < 0)
3542 goto finish;
3543
3544 r = load_settings();
3545 if (r < 0)
3546 goto finish;
3547
3548 r = verify_arguments();
3549 if (r < 0)
3550 goto finish;
03cfe0d5
LP
3551
3552 n_fd_passed = sd_listen_fds(false);
3553 if (n_fd_passed > 0) {
3554 r = fdset_new_listen_fds(&fds, false);
3555 if (r < 0) {
3556 log_error_errno(r, "Failed to collect file descriptors: %m");
3557 goto finish;
3558 }
3559 }
3560
3561 if (arg_directory) {
3562 assert(!arg_image);
3563
3564 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3565 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3566 r = -EINVAL;
3567 goto finish;
3568 }
3569
3570 if (arg_ephemeral) {
3571 _cleanup_free_ char *np = NULL;
3572
8d4aa2bb 3573 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
3574 if (r < 0)
3575 goto finish;
3576
03cfe0d5
LP
3577 /* If the specified path is a mount point we
3578 * generate the new snapshot immediately
3579 * inside it under a random name. However if
3580 * the specified is not a mount point we
3581 * create the new snapshot in the parent
3582 * directory, just next to it. */
e1873695 3583 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
3584 if (r < 0) {
3585 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3586 goto finish;
3587 }
3588 if (r > 0)
770b5ce4 3589 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3590 else
770b5ce4 3591 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 3592 if (r < 0) {
0f3be6ca 3593 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
3594 goto finish;
3595 }
3596
3597 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3598 if (r < 0) {
3599 log_error_errno(r, "Failed to lock %s: %m", np);
3600 goto finish;
3601 }
3602
17cbb288
LP
3603 r = btrfs_subvol_snapshot(arg_directory, np,
3604 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3605 BTRFS_SNAPSHOT_FALLBACK_COPY |
3606 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3607 BTRFS_SNAPSHOT_RECURSIVE |
3608 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3609 if (r < 0) {
3610 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3611 goto finish;
ec16945e
LP
3612 }
3613
3614 free(arg_directory);
3615 arg_directory = np;
8a16a7b4 3616 np = NULL;
ec16945e 3617
17cbb288 3618 remove_directory = true;
30535c16
LP
3619
3620 } else {
cb638b5e 3621 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
3622 if (r < 0)
3623 goto finish;
3624
30535c16
LP
3625 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3626 if (r == -EBUSY) {
3627 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3628 goto finish;
3629 }
3630 if (r < 0) {
3631 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3632 goto finish;
30535c16
LP
3633 }
3634
3635 if (arg_template) {
8d4aa2bb 3636 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
3637 if (r < 0)
3638 goto finish;
3639
17cbb288
LP
3640 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3641 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3642 BTRFS_SNAPSHOT_FALLBACK_COPY |
3643 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3644 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3645 BTRFS_SNAPSHOT_RECURSIVE |
3646 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3647 if (r == -EEXIST) {
3648 if (!arg_quiet)
3649 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3650 } else if (r < 0) {
83521414 3651 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3652 goto finish;
3653 } else {
3654 if (!arg_quiet)
3655 log_info("Populated %s from template %s.", arg_directory, arg_template);
3656 }
3657 }
ec16945e
LP
3658 }
3659
7732f92b 3660 if (arg_start_mode == START_BOOT) {
1b9e5b12 3661 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3662 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3663 r = -EINVAL;
1b9e5b12
LP
3664 goto finish;
3665 }
3666 } else {
3667 const char *p;
3668
16fb773e
LP
3669 p = strjoina(arg_directory, "/usr/");
3670 if (laccess(p, F_OK) < 0) {
3671 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3672 r = -EINVAL;
1b9e5b12 3673 goto finish;
1b9e5b12
LP
3674 }
3675 }
ec16945e 3676
6b9132a9 3677 } else {
ec16945e
LP
3678 assert(arg_image);
3679 assert(!arg_template);
3680
8d4aa2bb 3681 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
3682 if (r < 0)
3683 goto finish;
3684
0f3be6ca
LP
3685 if (arg_ephemeral) {
3686 _cleanup_free_ char *np = NULL;
3687
3688 r = tempfn_random(arg_image, "machine.", &np);
3689 if (r < 0) {
3690 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3691 goto finish;
3692 }
3693
3694 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3695 if (r < 0) {
3696 r = log_error_errno(r, "Failed to create image lock: %m");
3697 goto finish;
3698 }
3699
3700 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL);
3701 if (r < 0) {
3702 r = log_error_errno(r, "Failed to copy image file: %m");
3703 goto finish;
3704 }
3705
3706 free(arg_image);
3707 arg_image = np;
3708 np = NULL;
3709
3710 remove_image = true;
3711 } else {
3712 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3713 if (r == -EBUSY) {
3714 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3715 goto finish;
3716 }
3717 if (r < 0) {
3718 r = log_error_errno(r, "Failed to create image lock: %m");
3719 goto finish;
3720 }
4623e8e6
LP
3721
3722 r = load_root_hash(arg_image);
3723 if (r < 0)
3724 goto finish;
30535c16
LP
3725 }
3726
c67b0082 3727 if (!mkdtemp(tmprootdir)) {
0f3be6ca 3728 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 3729 goto finish;
1b9e5b12 3730 }
6b9132a9 3731
c67b0082
LP
3732 remove_tmprootdir = true;
3733
3734 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
3735 if (!arg_directory) {
3736 r = log_oom();
3737 goto finish;
6b9132a9 3738 }
88213476 3739
2d845785
LP
3740 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3741 if (r < 0) {
3742 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
3743 goto finish;
3744 }
1b9e5b12 3745
4623e8e6 3746 r = dissect_image(loop->fd, arg_root_hash, arg_root_hash_size, &dissected_image);
2d845785
LP
3747 if (r == -ENOPKG) {
3748 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3749
3750 log_notice("Note that the disk image needs to\n"
3751 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3752 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3753 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3754 " d) or contain a file system without a partition table\n"
3755 "in order to be bootable with systemd-nspawn.");
1b9e5b12 3756 goto finish;
2d845785 3757 }
4623e8e6
LP
3758 if (r == -EADDRNOTAVAIL) {
3759 log_error_errno(r, "No root partition for specified root hash found.");
3760 goto finish;
3761 }
2d845785
LP
3762 if (r == -EOPNOTSUPP) {
3763 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3764 goto finish;
3765 }
3766 if (r < 0) {
3767 log_error_errno(r, "Failed to dissect image: %m");
842f3b0f
LP
3768 goto finish;
3769 }
1b9e5b12 3770
4623e8e6
LP
3771 if (!arg_root_hash && dissected_image->can_verity)
3772 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3773
3774 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
3775 if (r < 0)
3776 goto finish;
0f3be6ca
LP
3777
3778 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3779 if (remove_image && unlink(arg_image) >= 0)
3780 remove_image = false;
842f3b0f 3781 }
842f3b0f 3782
86c0dd4a 3783 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
3784 if (r < 0)
3785 goto finish;
3786
03cfe0d5
LP
3787 interactive =
3788 isatty(STDIN_FILENO) > 0 &&
3789 isatty(STDOUT_FILENO) > 0;
9c857b9d 3790
db7feb7e
LP
3791 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3792 if (master < 0) {
ec16945e 3793 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3794 goto finish;
3795 }
3796
611b312b
LP
3797 r = ptsname_malloc(master, &console);
3798 if (r < 0) {
3799 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3800 goto finish;
68b02049
DW
3801 }
3802
3803 if (arg_selinux_apifs_context) {
3804 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3805 if (r < 0)
3806 goto finish;
a258bf26
LP
3807 }
3808
a258bf26 3809 if (unlockpt(master) < 0) {
ec16945e 3810 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3811 goto finish;
3812 }
3813
9c857b9d
LP
3814 if (!arg_quiet)
3815 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3816 arg_machine, arg_image ?: arg_directory);
3817
72c0a2c2 3818 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3819
03cfe0d5
LP
3820 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3821 r = log_error_errno(errno, "Failed to become subreaper: %m");
3822 goto finish;
3823 }
3824
d87be9b0 3825 for (;;) {
b0067625
ZJS
3826 r = run(master,
3827 console,
2d845785 3828 dissected_image,
b0067625
ZJS
3829 interactive, secondary,
3830 fds,
3831 veth_name, &veth_created,
3832 &exposed,
3833 &pid, &ret);
3834 if (r <= 0)
d87be9b0 3835 break;
d87be9b0 3836 }
88213476
LP
3837
3838finish:
af4ec430 3839 sd_notify(false,
2a49b612
ZJS
3840 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3841 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 3842
9444b1f2 3843 if (pid > 0)
c67b0082 3844 (void) kill(pid, SIGKILL);
88213476 3845
503546da 3846 /* Try to flush whatever is still queued in the pty */
6a0f896b 3847 if (master >= 0) {
59f448cf 3848 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
6a0f896b
LP
3849 master = safe_close(master);
3850 }
3851
3852 if (pid > 0)
3853 (void) wait_for_terminate(pid, NULL);
503546da 3854
17cbb288 3855 if (remove_directory && arg_directory) {
ec16945e
LP
3856 int k;
3857
17cbb288 3858 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 3859 if (k < 0)
17cbb288 3860 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
3861 }
3862
0f3be6ca
LP
3863 if (remove_image && arg_image) {
3864 if (unlink(arg_image) < 0)
3865 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3866 }
3867
c67b0082
LP
3868 if (remove_tmprootdir) {
3869 if (rmdir(tmprootdir) < 0)
3870 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3871 }
3872
785890ac
LP
3873 if (arg_machine) {
3874 const char *p;
3875
63c372cb 3876 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3877 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3878 }
3879
7a8f6325 3880 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
3881
3882 if (veth_created)
3883 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 3884 (void) remove_bridge(arg_network_zone);
f757855e 3885
04d391da 3886 free(arg_directory);
ec16945e
LP
3887 free(arg_template);
3888 free(arg_image);
7027ff61 3889 free(arg_machine);
c74e630d 3890 free(arg_user);
5f932eb9 3891 free(arg_chdir);
c74e630d 3892 strv_free(arg_setenv);
f757855e 3893 free(arg_network_bridge);
c74e630d
LP
3894 strv_free(arg_network_interfaces);
3895 strv_free(arg_network_macvlan);
4bbfe7ad 3896 strv_free(arg_network_ipvlan);
f6d6bad1 3897 strv_free(arg_network_veth_extra);
f757855e
LP
3898 strv_free(arg_parameters);
3899 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3900 expose_port_free_all(arg_expose_ports);
4623e8e6 3901 free(arg_root_hash);
6d0b55c2 3902
ec16945e 3903 return r < 0 ? EXIT_FAILURE : ret;
88213476 3904}