]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
88213476
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
88213476 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
88213476
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
349cc4a5 21#if HAVE_BLKID
6b5cf3ea 22#include <blkid.h>
8fe0087e 23#endif
88213476 24#include <errno.h>
88213476 25#include <getopt.h>
0e7ac751 26#include <grp.h>
1b9e5b12 27#include <linux/loop.h>
0e7ac751 28#include <pwd.h>
8fe0087e 29#include <sched.h>
349cc4a5 30#if HAVE_SELINUX
8fe0087e 31#include <selinux/selinux.h>
1b9e5b12 32#endif
8fe0087e
LP
33#include <signal.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <sys/file.h>
38#include <sys/mount.h>
39#include <sys/personality.h>
40#include <sys/prctl.h>
41#include <sys/types.h>
6916b164 42#include <sys/wait.h>
8fe0087e 43#include <unistd.h>
1b9e5b12 44
b053cd5f 45#include "sd-bus.h"
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e 48
b5efdb8a 49#include "alloc-util.h"
8fe0087e
LP
50#include "barrier.h"
51#include "base-filesystem.h"
52#include "blkid-util.h"
53#include "btrfs-util.h"
b053cd5f 54#include "bus-util.h"
8fe0087e 55#include "cap-list.h"
430f0182 56#include "capability-util.h"
04d391da 57#include "cgroup-util.h"
8fe0087e 58#include "copy.h"
4fc9982c 59#include "dev-setup.h"
2d845785 60#include "dissect-image.h"
8fe0087e 61#include "env-util.h"
3ffd4af2 62#include "fd-util.h"
842f3b0f 63#include "fdset.h"
a5c32cff 64#include "fileio.h"
f97b34a6 65#include "format-util.h"
f4f15635 66#include "fs-util.h"
1b9e5b12 67#include "gpt.h"
4623e8e6 68#include "hexdecoct.h"
8fe0087e 69#include "hostname-util.h"
910fd145 70#include "id128-util.h"
8fe0087e 71#include "log.h"
2d845785 72#include "loop-util.h"
8fe0087e 73#include "loopback-setup.h"
1b9cebf6 74#include "machine-image.h"
8fe0087e
LP
75#include "macro.h"
76#include "missing.h"
77#include "mkdir.h"
4349cd7c 78#include "mount-util.h"
8fe0087e 79#include "netlink-util.h"
07630cea 80#include "nspawn-cgroup.h"
3603efde 81#include "nspawn-def.h"
07630cea
LP
82#include "nspawn-expose-ports.h"
83#include "nspawn-mount.h"
84#include "nspawn-network.h"
7336138e 85#include "nspawn-patch-uid.h"
07630cea 86#include "nspawn-register.h"
910fd145 87#include "nspawn-seccomp.h"
07630cea
LP
88#include "nspawn-settings.h"
89#include "nspawn-setuid.h"
7732f92b 90#include "nspawn-stub-pid1.h"
6bedfcbb 91#include "parse-util.h"
8fe0087e 92#include "path-util.h"
0b452006 93#include "process-util.h"
8fe0087e
LP
94#include "ptyfwd.h"
95#include "random-util.h"
8869a0b4 96#include "raw-clone.h"
8fe0087e 97#include "rm-rf.h"
68b02049 98#include "selinux-util.h"
8fe0087e 99#include "signal-util.h"
2583fbea 100#include "socket-util.h"
8fcde012 101#include "stat-util.h"
15a5e950 102#include "stdio-util.h"
07630cea 103#include "string-util.h"
8fe0087e
LP
104#include "strv.h"
105#include "terminal-util.h"
106#include "udev-util.h"
affb60b1 107#include "umask-util.h"
b1d4f8e1 108#include "user-util.h"
8fe0087e 109#include "util.h"
e9642be2 110
9c1e04d0
AP
/* nspawn listens on the socket at the path given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
114#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 115
2a49b612
ZJS
116#define EXIT_FORCE_RESTART 133
117
113cea80
DH
/* Final state of a container run.
 * NOTE(review): presumably used to decide whether the container should be started again after
 * it exits; the consumers are outside this part of the file, confirm there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
122
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto, the default */
        LINK_HOST  = 2,   /* --link-journal=host / try-host */
        LINK_GUEST = 3,   /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
88213476
LP
129
130static char *arg_directory = NULL;
ec16945e 131static char *arg_template = NULL;
5f932eb9 132static char *arg_chdir = NULL;
b53ede69
PW
133static char *arg_pivot_root_new = NULL;
134static char *arg_pivot_root_old = NULL;
687d0825 135static char *arg_user = NULL;
9444b1f2 136static sd_id128_t arg_uuid = {};
7027ff61 137static char *arg_machine = NULL;
c74e630d
LP
138static const char *arg_selinux_context = NULL;
139static const char *arg_selinux_apifs_context = NULL;
9444b1f2 140static const char *arg_slice = NULL;
ff01d048 141static bool arg_private_network = false;
bc2f673e 142static bool arg_read_only = false;
7732f92b 143static StartMode arg_start_mode = START_PID1;
ec16945e 144static bool arg_ephemeral = false;
57fb9fb5 145static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 146static bool arg_link_journal_try = false;
520e0d54 147static uint64_t arg_caps_retain =
50b52222
LP
148 (1ULL << CAP_AUDIT_CONTROL) |
149 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
150 (1ULL << CAP_CHOWN) |
151 (1ULL << CAP_DAC_OVERRIDE) |
152 (1ULL << CAP_DAC_READ_SEARCH) |
153 (1ULL << CAP_FOWNER) |
154 (1ULL << CAP_FSETID) |
155 (1ULL << CAP_IPC_OWNER) |
156 (1ULL << CAP_KILL) |
157 (1ULL << CAP_LEASE) |
158 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 159 (1ULL << CAP_MKNOD) |
5076f0cc
LP
160 (1ULL << CAP_NET_BIND_SERVICE) |
161 (1ULL << CAP_NET_BROADCAST) |
162 (1ULL << CAP_NET_RAW) |
5076f0cc 163 (1ULL << CAP_SETFCAP) |
50b52222 164 (1ULL << CAP_SETGID) |
5076f0cc
LP
165 (1ULL << CAP_SETPCAP) |
166 (1ULL << CAP_SETUID) |
167 (1ULL << CAP_SYS_ADMIN) |
50b52222 168 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
169 (1ULL << CAP_SYS_CHROOT) |
170 (1ULL << CAP_SYS_NICE) |
171 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 172 (1ULL << CAP_SYS_RESOURCE) |
50b52222 173 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
174static CustomMount *arg_custom_mounts = NULL;
175static unsigned arg_n_custom_mounts = 0;
f4889f65 176static char **arg_setenv = NULL;
284c0b91 177static bool arg_quiet = false;
eb91eb18 178static bool arg_register = true;
89f7c846 179static bool arg_keep_unit = false;
aa28aefe 180static char **arg_network_interfaces = NULL;
c74e630d 181static char **arg_network_macvlan = NULL;
4bbfe7ad 182static char **arg_network_ipvlan = NULL;
69c79d3c 183static bool arg_network_veth = false;
f6d6bad1 184static char **arg_network_veth_extra = NULL;
f757855e 185static char *arg_network_bridge = NULL;
22b28dfd 186static char *arg_network_zone = NULL;
050f7277 187static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 188static char *arg_image = NULL;
f757855e 189static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 190static ExposePort *arg_expose_ports = NULL;
f36933fe 191static char **arg_property = NULL;
0de7acce 192static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 193static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 194static bool arg_userns_chown = false;
c6c8f6e2 195static int arg_kill_signal = 0;
5da38d07 196static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
197static SettingsMask arg_settings_mask = 0;
198static int arg_settings_trusted = -1;
199static char **arg_parameters = NULL;
6aadfa4c 200static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 201static bool arg_notify_ready = false;
5a8ff0e6 202static bool arg_use_cgns = true;
0c582db0 203static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
4f086aab 204static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
4623e8e6
LP
205static void *arg_root_hash = NULL;
206static size_t arg_root_hash_size = 0;
960e4569
LP
207static char **arg_syscall_whitelist = NULL;
208static char **arg_syscall_blacklist = NULL;
88213476 209
601185b4 210static void help(void) {
88213476
LP
211 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
212 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
213 " -h --help Show this help\n"
214 " --version Print version string\n"
69c79d3c 215 " -q --quiet Do not show status information\n"
1b9e5b12 216 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
217 " --template=PATH Initialize root directory from template directory,\n"
218 " if missing\n"
219 " -x --ephemeral Run container with snapshot of root directory, and\n"
220 " remove it after exit\n"
221 " -i --image=PATH File system device or disk image for the container\n"
4623e8e6 222 " --root-hash=HASH Specify verity root hash\n"
7732f92b 223 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 224 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 225 " --chdir=PATH Set working directory in the container\n"
b53ede69
PW
226 " --pivot-root=PATH[:PATH]\n"
227 " Pivot root to given directory in the container\n"
a8828ed9 228 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 229 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 230 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 231 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 232 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 233 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 234 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 235 " Similar, but with user configured UID/GID range\n"
24597ee0 236 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
237 " --private-network Disable network in container\n"
238 " --network-interface=INTERFACE\n"
239 " Assign an existing network interface to the\n"
240 " container\n"
c74e630d
LP
241 " --network-macvlan=INTERFACE\n"
242 " Create a macvlan network interface based on an\n"
243 " existing network interface to the container\n"
4bbfe7ad
TG
244 " --network-ipvlan=INTERFACE\n"
245 " Create a ipvlan network interface based on an\n"
246 " existing network interface to the container\n"
a8eaaee7 247 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 248 " and container\n"
f6d6bad1
LP
249 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
250 " Add an additional virtual Ethernet link between\n"
251 " host and container\n"
ab046dde 252 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
253 " Add a virtual Ethernet connection to the container\n"
254 " and attach it to an existing bridge on the host\n"
255 " --network-zone=NAME Similar, but attach the new interface to an\n"
256 " an automatically managed bridge interface\n"
6d0b55c2 257 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 258 " Expose a container IP port on the host\n"
82adf6af
LP
259 " -Z --selinux-context=SECLABEL\n"
260 " Set the SELinux security context to be used by\n"
261 " processes in the container\n"
262 " -L --selinux-apifs-context=SECLABEL\n"
263 " Set the SELinux security context to be used by\n"
264 " API/tmpfs file systems in the container\n"
a8828ed9
DW
265 " --capability=CAP In addition to the default, retain specified\n"
266 " capability\n"
267 " --drop-capability=CAP Drop the specified capability from the default set\n"
960e4569
LP
268 " --system-call-filter=LIST|~LIST\n"
269 " Permit/prohibit specific system calls\n"
c6c8f6e2 270 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
271 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
272 " host, try-guest, try-host\n"
574edc90 273 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 274 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
275 " --bind=PATH[:PATH[:OPTIONS]]\n"
276 " Bind mount a file or directory from the host into\n"
a8828ed9 277 " the container\n"
5e5bfa6e
EY
278 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
279 " Similar, but creates a read-only bind mount\n"
06c17c39 280 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
281 " --overlay=PATH[:PATH...]:PATH\n"
282 " Create an overlay mount from the host to \n"
283 " the container\n"
284 " --overlay-ro=PATH[:PATH...]:PATH\n"
285 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 286 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 287 " --register=BOOLEAN Register container as machine\n"
89f7c846 288 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 289 " the service unit nspawn is running in\n"
6d0b55c2 290 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 291 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 292 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 293 , program_invocation_short_name);
88213476
LP
294}
295
86c0dd4a 296static int custom_mount_check_all(void) {
5a8af538 297 unsigned i;
5a8af538 298
5a8af538
LP
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
0de7acce 302 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
303
304 if (arg_userns_chown) {
305 log_error("--private-users-chown may not be combined with custom root mounts.");
306 return -EINVAL;
307 } else if (arg_uid_shift == UID_INVALID) {
308 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
309 return -EINVAL;
310 }
825d5287 311 }
5a8af538
LP
312 }
313
314 return 0;
315}
316
0fd9563f 317static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 318 const char *e;
415fc41c 319 int r;
5da38d07 320
efdb0237
LP
321 /* Allow the user to control whether the unified hierarchy is used */
322 e = getenv("UNIFIED_CGROUP_HIERARCHY");
323 if (e) {
324 r = parse_boolean(e);
325 if (r < 0)
326 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
327 if (r > 0)
328 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
329 else
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 331
efdb0237
LP
332 return 0;
333 }
334
335 /* Otherwise inherit the default from the host system */
b4cccbc1
LP
336 r = cg_all_unified();
337 if (r < 0)
338 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
339 if (r > 0) {
a8725a06
ZJS
340 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
341 * routine only detects 231, so we'll have a false negative here for 230. */
342 r = systemd_installation_has_version(directory, 230);
343 if (r < 0)
344 return log_error_errno(r, "Failed to determine systemd version in container: %m");
345 if (r > 0)
346 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
347 else
348 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
c22800e4 349 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
2977724b
TH
350 /* Mixed cgroup hierarchy support was added in 233 */
351 r = systemd_installation_has_version(directory, 233);
0fd9563f
ZJS
352 if (r < 0)
353 return log_error_errno(r, "Failed to determine systemd version in container: %m");
354 if (r > 0)
355 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
356 else
357 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
358 } else
5da38d07 359 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 360
efdb0237
LP
361 return 0;
362}
363
0c582db0
LB
364static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
365 int r;
366
367 r = getenv_bool(name);
368 if (r == -ENXIO)
369 return;
370 if (r < 0)
371 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
372 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
373}
374
4f086aab
SU
375static void parse_mount_settings_env(void) {
376 int r;
377 const char *e;
378
379 e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
380 if (!e)
381 return;
382
383 if (streq(e, "network")) {
384 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
385 return;
386 }
387
388 r = parse_boolean(e);
389 if (r < 0) {
390 log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
391 return;
ab8ee0f2 392 }
4f086aab 393
ab8ee0f2
ZJS
394 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
395 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
4f086aab
SU
396}
397
88213476
LP
398static int parse_argv(int argc, char *argv[]) {
399
a41fe3a2 400 enum {
acbeb427
ZJS
401 ARG_VERSION = 0x100,
402 ARG_PRIVATE_NETWORK,
bc2f673e 403 ARG_UUID,
5076f0cc 404 ARG_READ_ONLY,
57fb9fb5 405 ARG_CAPABILITY,
420c7379 406 ARG_DROP_CAPABILITY,
17fe0523
LP
407 ARG_LINK_JOURNAL,
408 ARG_BIND,
f4889f65 409 ARG_BIND_RO,
06c17c39 410 ARG_TMPFS,
5a8af538
LP
411 ARG_OVERLAY,
412 ARG_OVERLAY_RO,
eb91eb18 413 ARG_SHARE_SYSTEM,
89f7c846 414 ARG_REGISTER,
aa28aefe 415 ARG_KEEP_UNIT,
69c79d3c 416 ARG_NETWORK_INTERFACE,
c74e630d 417 ARG_NETWORK_MACVLAN,
4bbfe7ad 418 ARG_NETWORK_IPVLAN,
ab046dde 419 ARG_NETWORK_BRIDGE,
22b28dfd 420 ARG_NETWORK_ZONE,
f6d6bad1 421 ARG_NETWORK_VETH_EXTRA,
6afc95b7 422 ARG_PERSONALITY,
4d9f07b4 423 ARG_VOLATILE,
ec16945e 424 ARG_TEMPLATE,
f36933fe 425 ARG_PROPERTY,
6dac160c 426 ARG_PRIVATE_USERS,
c6c8f6e2 427 ARG_KILL_SIGNAL,
f757855e 428 ARG_SETTINGS,
5f932eb9 429 ARG_CHDIR,
b53ede69 430 ARG_PIVOT_ROOT,
7336138e 431 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 432 ARG_NOTIFY_READY,
4623e8e6 433 ARG_ROOT_HASH,
960e4569 434 ARG_SYSTEM_CALL_FILTER,
a41fe3a2
LP
435 };
436
88213476 437 static const struct option options[] = {
27eb8e90
ZJS
438 { "help", no_argument, NULL, 'h' },
439 { "version", no_argument, NULL, ARG_VERSION },
440 { "directory", required_argument, NULL, 'D' },
441 { "template", required_argument, NULL, ARG_TEMPLATE },
442 { "ephemeral", no_argument, NULL, 'x' },
443 { "user", required_argument, NULL, 'u' },
444 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
445 { "as-pid2", no_argument, NULL, 'a' },
446 { "boot", no_argument, NULL, 'b' },
447 { "uuid", required_argument, NULL, ARG_UUID },
448 { "read-only", no_argument, NULL, ARG_READ_ONLY },
449 { "capability", required_argument, NULL, ARG_CAPABILITY },
450 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
451 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
452 { "bind", required_argument, NULL, ARG_BIND },
453 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
454 { "tmpfs", required_argument, NULL, ARG_TMPFS },
455 { "overlay", required_argument, NULL, ARG_OVERLAY },
456 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
457 { "machine", required_argument, NULL, 'M' },
458 { "slice", required_argument, NULL, 'S' },
459 { "setenv", required_argument, NULL, 'E' },
460 { "selinux-context", required_argument, NULL, 'Z' },
461 { "selinux-apifs-context", required_argument, NULL, 'L' },
462 { "quiet", no_argument, NULL, 'q' },
463 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
464 { "register", required_argument, NULL, ARG_REGISTER },
465 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
466 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
467 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
468 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
469 { "network-veth", no_argument, NULL, 'n' },
470 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
471 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
472 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
473 { "personality", required_argument, NULL, ARG_PERSONALITY },
474 { "image", required_argument, NULL, 'i' },
475 { "volatile", optional_argument, NULL, ARG_VOLATILE },
476 { "port", required_argument, NULL, 'p' },
477 { "property", required_argument, NULL, ARG_PROPERTY },
478 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
479 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
480 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
481 { "settings", required_argument, NULL, ARG_SETTINGS },
482 { "chdir", required_argument, NULL, ARG_CHDIR },
b53ede69 483 { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
27eb8e90 484 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
4623e8e6 485 { "root-hash", required_argument, NULL, ARG_ROOT_HASH },
960e4569 486 { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
eb9da376 487 {}
88213476
LP
488 };
489
9444b1f2 490 int c, r;
6aadfa4c 491 const char *p, *e;
a42c8b54 492 uint64_t plus = 0, minus = 0;
f757855e 493 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
494
495 assert(argc >= 0);
496 assert(argv);
497
2e1f244e 498 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:", options, NULL)) >= 0)
88213476
LP
499
500 switch (c) {
501
502 case 'h':
601185b4
ZJS
503 help();
504 return 0;
88213476 505
acbeb427 506 case ARG_VERSION:
3f6fd1ba 507 return version();
acbeb427 508
88213476 509 case 'D':
0f03c2a4 510 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 511 if (r < 0)
0f03c2a4 512 return r;
ec16945e
LP
513 break;
514
515 case ARG_TEMPLATE:
0f03c2a4 516 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 517 if (r < 0)
0f03c2a4 518 return r;
88213476
LP
519 break;
520
1b9e5b12 521 case 'i':
0f03c2a4 522 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 523 if (r < 0)
0f03c2a4 524 return r;
ec16945e
LP
525 break;
526
527 case 'x':
528 arg_ephemeral = true;
1b9e5b12
LP
529 break;
530
687d0825 531 case 'u':
2fc09a9c
DM
532 r = free_and_strdup(&arg_user, optarg);
533 if (r < 0)
7027ff61 534 return log_oom();
687d0825 535
f757855e 536 arg_settings_mask |= SETTING_USER;
687d0825
MV
537 break;
538
22b28dfd
LP
539 case ARG_NETWORK_ZONE: {
540 char *j;
541
542 j = strappend("vz-", optarg);
543 if (!j)
544 return log_oom();
545
546 if (!ifname_valid(j)) {
547 log_error("Network zone name not valid: %s", j);
548 free(j);
549 return -EINVAL;
550 }
551
552 free(arg_network_zone);
553 arg_network_zone = j;
554
555 arg_network_veth = true;
556 arg_private_network = true;
557 arg_settings_mask |= SETTING_NETWORK;
558 break;
559 }
560
ab046dde 561 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
562
563 if (!ifname_valid(optarg)) {
564 log_error("Bridge interface name not valid: %s", optarg);
565 return -EINVAL;
566 }
567
f757855e
LP
568 r = free_and_strdup(&arg_network_bridge, optarg);
569 if (r < 0)
570 return log_oom();
ab046dde
TG
571
572 /* fall through */
573
0dfaa006 574 case 'n':
69c79d3c
LP
575 arg_network_veth = true;
576 arg_private_network = true;
f757855e 577 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
578 break;
579
f6d6bad1
LP
580 case ARG_NETWORK_VETH_EXTRA:
581 r = veth_extra_parse(&arg_network_veth_extra, optarg);
582 if (r < 0)
583 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
584
585 arg_private_network = true;
586 arg_settings_mask |= SETTING_NETWORK;
587 break;
588
aa28aefe 589 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
590
591 if (!ifname_valid(optarg)) {
592 log_error("Network interface name not valid: %s", optarg);
593 return -EINVAL;
594 }
595
c74e630d
LP
596 if (strv_extend(&arg_network_interfaces, optarg) < 0)
597 return log_oom();
598
599 arg_private_network = true;
f757855e 600 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
601 break;
602
603 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
604
605 if (!ifname_valid(optarg)) {
606 log_error("MACVLAN network interface name not valid: %s", optarg);
607 return -EINVAL;
608 }
609
c74e630d 610 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
611 return log_oom();
612
4bbfe7ad 613 arg_private_network = true;
f757855e 614 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
615 break;
616
617 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
618
619 if (!ifname_valid(optarg)) {
620 log_error("IPVLAN network interface name not valid: %s", optarg);
621 return -EINVAL;
622 }
623
4bbfe7ad
TG
624 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
625 return log_oom();
626
aa28aefe
LP
627 /* fall through */
628
ff01d048
LP
629 case ARG_PRIVATE_NETWORK:
630 arg_private_network = true;
f757855e 631 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
632 break;
633
0f0dbc46 634 case 'b':
7732f92b
LP
635 if (arg_start_mode == START_PID2) {
636 log_error("--boot and --as-pid2 may not be combined.");
637 return -EINVAL;
638 }
639
640 arg_start_mode = START_BOOT;
641 arg_settings_mask |= SETTING_START_MODE;
642 break;
643
644 case 'a':
645 if (arg_start_mode == START_BOOT) {
646 log_error("--boot and --as-pid2 may not be combined.");
647 return -EINVAL;
648 }
649
650 arg_start_mode = START_PID2;
651 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
652 break;
653
144f0fc0 654 case ARG_UUID:
9444b1f2 655 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
656 if (r < 0)
657 return log_error_errno(r, "Invalid UUID: %s", optarg);
658
659 if (sd_id128_is_null(arg_uuid)) {
660 log_error("Machine UUID may not be all zeroes.");
661 return -EINVAL;
aa96c6cb 662 }
f757855e
LP
663
664 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 665 break;
aa96c6cb 666
9444b1f2 667 case 'S':
c74e630d 668 arg_slice = optarg;
144f0fc0
LP
669 break;
670
7027ff61 671 case 'M':
c1521918 672 if (isempty(optarg))
97b11eed 673 arg_machine = mfree(arg_machine);
c1521918 674 else {
0c3c4284 675 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
676 log_error("Invalid machine name: %s", optarg);
677 return -EINVAL;
678 }
7027ff61 679
0c3c4284
LP
680 r = free_and_strdup(&arg_machine, optarg);
681 if (r < 0)
eb91eb18 682 return log_oom();
eb91eb18 683 }
9ce6d1b3 684 break;
7027ff61 685
82adf6af
LP
686 case 'Z':
687 arg_selinux_context = optarg;
a8828ed9
DW
688 break;
689
82adf6af
LP
690 case 'L':
691 arg_selinux_apifs_context = optarg;
a8828ed9
DW
692 break;
693
bc2f673e
LP
694 case ARG_READ_ONLY:
695 arg_read_only = true;
f757855e 696 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
697 break;
698
420c7379
LP
699 case ARG_CAPABILITY:
700 case ARG_DROP_CAPABILITY: {
6cbe4ed1 701 p = optarg;
9ed794a3 702 for (;;) {
6cbe4ed1 703 _cleanup_free_ char *t = NULL;
5076f0cc 704
6cbe4ed1
SS
705 r = extract_first_word(&p, &t, ",", 0);
706 if (r < 0)
707 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 708
6cbe4ed1
SS
709 if (r == 0)
710 break;
5076f0cc 711
39ed67d1
LP
712 if (streq(t, "all")) {
713 if (c == ARG_CAPABILITY)
a42c8b54 714 plus = (uint64_t) -1;
39ed67d1 715 else
a42c8b54 716 minus = (uint64_t) -1;
39ed67d1 717 } else {
2822da4f
LP
718 int cap;
719
720 cap = capability_from_name(t);
721 if (cap < 0) {
39ed67d1
LP
722 log_error("Failed to parse capability %s.", t);
723 return -EINVAL;
724 }
725
726 if (c == ARG_CAPABILITY)
a42c8b54 727 plus |= 1ULL << (uint64_t) cap;
39ed67d1 728 else
a42c8b54 729 minus |= 1ULL << (uint64_t) cap;
5076f0cc 730 }
5076f0cc
LP
731 }
732
f757855e 733 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
734 break;
735 }
736
57fb9fb5
LP
737 case 'j':
738 arg_link_journal = LINK_GUEST;
574edc90 739 arg_link_journal_try = true;
57fb9fb5
LP
740 break;
741
742 case ARG_LINK_JOURNAL:
53e438e3 743 if (streq(optarg, "auto")) {
57fb9fb5 744 arg_link_journal = LINK_AUTO;
53e438e3
LP
745 arg_link_journal_try = false;
746 } else if (streq(optarg, "no")) {
57fb9fb5 747 arg_link_journal = LINK_NO;
53e438e3
LP
748 arg_link_journal_try = false;
749 } else if (streq(optarg, "guest")) {
57fb9fb5 750 arg_link_journal = LINK_GUEST;
53e438e3
LP
751 arg_link_journal_try = false;
752 } else if (streq(optarg, "host")) {
57fb9fb5 753 arg_link_journal = LINK_HOST;
53e438e3
LP
754 arg_link_journal_try = false;
755 } else if (streq(optarg, "try-guest")) {
574edc90
MP
756 arg_link_journal = LINK_GUEST;
757 arg_link_journal_try = true;
758 } else if (streq(optarg, "try-host")) {
759 arg_link_journal = LINK_HOST;
760 arg_link_journal_try = true;
761 } else {
57fb9fb5
LP
762 log_error("Failed to parse link journal mode %s", optarg);
763 return -EINVAL;
764 }
765
766 break;
767
17fe0523 768 case ARG_BIND:
f757855e
LP
769 case ARG_BIND_RO:
770 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
771 if (r < 0)
772 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 773
f757855e 774 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 775 break;
06c17c39 776
f757855e
LP
777 case ARG_TMPFS:
778 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
779 if (r < 0)
780 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 781
f757855e 782 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 783 break;
5a8af538
LP
784
785 case ARG_OVERLAY:
ad85779a
LP
786 case ARG_OVERLAY_RO:
787 r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
788 if (r == -EADDRNOTAVAIL)
789 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
790 if (r < 0)
791 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
06c17c39 792
f757855e 793 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39 794 break;
06c17c39 795
a5f1cb3b 796 case 'E': {
f4889f65
LP
797 char **n;
798
799 if (!env_assignment_is_valid(optarg)) {
800 log_error("Environment variable assignment '%s' is not valid.", optarg);
801 return -EINVAL;
802 }
803
804 n = strv_env_set(arg_setenv, optarg);
805 if (!n)
806 return log_oom();
807
808 strv_free(arg_setenv);
809 arg_setenv = n;
f757855e
LP
810
811 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
812 break;
813 }
814
284c0b91
LP
815 case 'q':
816 arg_quiet = true;
817 break;
818
8a96d94e 819 case ARG_SHARE_SYSTEM:
a6b5216c 820 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
821 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
822 arg_clone_ns_flags = 0;
8a96d94e
LP
823 break;
824
eb91eb18
LP
825 case ARG_REGISTER:
826 r = parse_boolean(optarg);
827 if (r < 0) {
828 log_error("Failed to parse --register= argument: %s", optarg);
829 return r;
830 }
831
832 arg_register = r;
833 break;
834
89f7c846
LP
835 case ARG_KEEP_UNIT:
836 arg_keep_unit = true;
837 break;
838
6afc95b7
LP
839 case ARG_PERSONALITY:
840
ac45f971 841 arg_personality = personality_from_string(optarg);
050f7277 842 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
843 log_error("Unknown or unsupported personality '%s'.", optarg);
844 return -EINVAL;
845 }
846
f757855e 847 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
848 break;
849
4d9f07b4
LP
850 case ARG_VOLATILE:
851
852 if (!optarg)
f757855e 853 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 854 else {
f757855e 855 VolatileMode m;
4d9f07b4 856
f757855e
LP
857 m = volatile_mode_from_string(optarg);
858 if (m < 0) {
859 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 860 return -EINVAL;
f757855e
LP
861 } else
862 arg_volatile_mode = m;
6d0b55c2
LP
863 }
864
f757855e
LP
865 arg_settings_mask |= SETTING_VOLATILE_MODE;
866 break;
6d0b55c2 867
f757855e
LP
868 case 'p':
869 r = expose_port_parse(&arg_expose_ports, optarg);
870 if (r == -EEXIST)
871 return log_error_errno(r, "Duplicate port specification: %s", optarg);
872 if (r < 0)
873 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 874
f757855e 875 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 876 break;
6d0b55c2 877
f36933fe
LP
878 case ARG_PROPERTY:
879 if (strv_extend(&arg_property, optarg) < 0)
880 return log_oom();
881
882 break;
883
ae209204
ZJS
884 case ARG_PRIVATE_USERS: {
885 int boolean = -1;
0de7acce 886
ae209204
ZJS
887 if (!optarg)
888 boolean = true;
889 else if (!in_charset(optarg, DIGITS))
890 /* do *not* parse numbers as booleans */
891 boolean = parse_boolean(optarg);
892
893 if (boolean == false) {
0de7acce
LP
894 /* no: User namespacing off */
895 arg_userns_mode = USER_NAMESPACE_NO;
896 arg_uid_shift = UID_INVALID;
897 arg_uid_range = UINT32_C(0x10000);
ae209204 898 } else if (boolean == true) {
0de7acce
LP
899 /* yes: User namespacing on, UID range is read from root dir */
900 arg_userns_mode = USER_NAMESPACE_FIXED;
901 arg_uid_shift = UID_INVALID;
902 arg_uid_range = UINT32_C(0x10000);
903 } else if (streq(optarg, "pick")) {
904 /* pick: User namespacing on, UID range is picked randomly */
905 arg_userns_mode = USER_NAMESPACE_PICK;
906 arg_uid_shift = UID_INVALID;
907 arg_uid_range = UINT32_C(0x10000);
908 } else {
6c2058b3 909 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
910 const char *range, *shift;
911
0de7acce
LP
912 /* anything else: User namespacing on, UID range is explicitly configured */
913
6dac160c
LP
914 range = strchr(optarg, ':');
915 if (range) {
6c2058b3
ZJS
916 buffer = strndup(optarg, range - optarg);
917 if (!buffer)
918 return log_oom();
919 shift = buffer;
6dac160c
LP
920
921 range++;
bfd292ec
ZJS
922 r = safe_atou32(range, &arg_uid_range);
923 if (r < 0)
be715731 924 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
925 } else
926 shift = optarg;
927
be715731
ZJS
928 r = parse_uid(shift, &arg_uid_shift);
929 if (r < 0)
930 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
931
932 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
933 }
934
be715731
ZJS
935 if (arg_uid_range <= 0) {
936 log_error("UID range cannot be 0.");
937 return -EINVAL;
938 }
939
0de7acce 940 arg_settings_mask |= SETTING_USERNS;
6dac160c 941 break;
ae209204 942 }
6dac160c 943
0de7acce 944 case 'U':
ccabee0d
LP
945 if (userns_supported()) {
946 arg_userns_mode = USER_NAMESPACE_PICK;
947 arg_uid_shift = UID_INVALID;
948 arg_uid_range = UINT32_C(0x10000);
949
950 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
951 }
952
7336138e
LP
953 break;
954
0de7acce 955 case ARG_PRIVATE_USERS_CHOWN:
19aac838 956 arg_userns_chown = true;
0de7acce
LP
957
958 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
959 break;
960
c6c8f6e2
LP
961 case ARG_KILL_SIGNAL:
962 arg_kill_signal = signal_from_string_try_harder(optarg);
963 if (arg_kill_signal < 0) {
964 log_error("Cannot parse signal: %s", optarg);
965 return -EINVAL;
966 }
967
f757855e
LP
968 arg_settings_mask |= SETTING_KILL_SIGNAL;
969 break;
970
971 case ARG_SETTINGS:
972
973 /* no → do not read files
974 * yes → read files, do not override cmdline, trust only subset
975 * override → read files, override cmdline, trust only subset
976 * trusted → read files, do not override cmdline, trust all
977 */
978
979 r = parse_boolean(optarg);
980 if (r < 0) {
981 if (streq(optarg, "trusted")) {
982 mask_all_settings = false;
983 mask_no_settings = false;
984 arg_settings_trusted = true;
985
986 } else if (streq(optarg, "override")) {
987 mask_all_settings = false;
988 mask_no_settings = true;
989 arg_settings_trusted = -1;
990 } else
991 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
992 } else if (r > 0) {
993 /* yes */
994 mask_all_settings = false;
995 mask_no_settings = false;
996 arg_settings_trusted = -1;
997 } else {
998 /* no */
999 mask_all_settings = true;
1000 mask_no_settings = false;
1001 arg_settings_trusted = false;
1002 }
1003
c6c8f6e2
LP
1004 break;
1005
5f932eb9
LP
1006 case ARG_CHDIR:
1007 if (!path_is_absolute(optarg)) {
1008 log_error("Working directory %s is not an absolute path.", optarg);
1009 return -EINVAL;
1010 }
1011
1012 r = free_and_strdup(&arg_chdir, optarg);
1013 if (r < 0)
1014 return log_oom();
1015
1016 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1017 break;
1018
b53ede69
PW
1019 case ARG_PIVOT_ROOT:
1020 r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1021 if (r < 0)
1022 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1023
1024 arg_settings_mask |= SETTING_PIVOT_ROOT;
1025 break;
1026
9c1e04d0
AP
1027 case ARG_NOTIFY_READY:
1028 r = parse_boolean(optarg);
1029 if (r < 0) {
1030 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1031 return -EINVAL;
1032 }
1033 arg_notify_ready = r;
1034 arg_settings_mask |= SETTING_NOTIFY_READY;
1035 break;
1036
4623e8e6
LP
1037 case ARG_ROOT_HASH: {
1038 void *k;
1039 size_t l;
1040
1041 r = unhexmem(optarg, strlen(optarg), &k, &l);
1042 if (r < 0)
1043 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1044 if (l < sizeof(sd_id128_t)) {
1045 log_error("Root hash must be at least 128bit long: %s", optarg);
1046 free(k);
1047 return -EINVAL;
1048 }
1049
1050 free(arg_root_hash);
1051 arg_root_hash = k;
1052 arg_root_hash_size = l;
1053 break;
1054 }
1055
960e4569
LP
1056 case ARG_SYSTEM_CALL_FILTER: {
1057 bool negative;
1058 const char *items;
1059
1060 negative = optarg[0] == '~';
1061 items = negative ? optarg + 1 : optarg;
1062
1063 for (;;) {
1064 _cleanup_free_ char *word = NULL;
1065
1066 r = extract_first_word(&items, &word, NULL, 0);
1067 if (r == 0)
1068 break;
1069 if (r == -ENOMEM)
1070 return log_oom();
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to parse system call filter: %m");
1073
1074 if (negative)
1075 r = strv_extend(&arg_syscall_blacklist, word);
1076 else
1077 r = strv_extend(&arg_syscall_whitelist, word);
1078 if (r < 0)
1079 return log_oom();
1080 }
1081
1082 arg_settings_mask |= SETTING_SYSCALL_FILTER;
1083 break;
1084 }
1085
88213476
LP
1086 case '?':
1087 return -EINVAL;
1088
1089 default:
eb9da376 1090 assert_not_reached("Unhandled option");
88213476 1091 }
88213476 1092
0c582db0
LB
1093 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1094 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1095 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1096 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1097
4f086aab
SU
1098 if (arg_userns_mode != USER_NAMESPACE_NO)
1099 arg_mount_settings |= MOUNT_USE_USERNS;
1100
1101 if (arg_private_network)
1102 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1103
1104 parse_mount_settings_env();
1105
48a8d337
LB
1106 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1107 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1108 arg_register = false;
0c582db0
LB
1109 if (arg_start_mode != START_PID1) {
1110 log_error("--boot cannot be used without namespacing.");
1111 return -EINVAL;
1112 }
1113 }
eb91eb18 1114
0de7acce 1115 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1116 arg_userns_chown = true;
1117
cd2dfc6f 1118 if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) {
8d9c2bca
AJ
1119 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1120 * The latter is not technically a user session, but we don't need to labour the point. */
cd2dfc6f 1121 log_error("--keep-unit --register=yes may not be used when invoked from a user session.");
89f7c846
LP
1122 return -EINVAL;
1123 }
1124
1b9e5b12
LP
1125 if (arg_directory && arg_image) {
1126 log_error("--directory= and --image= may not be combined.");
1127 return -EINVAL;
1128 }
1129
ec16945e
LP
1130 if (arg_template && arg_image) {
1131 log_error("--template= and --image= may not be combined.");
1132 return -EINVAL;
1133 }
1134
8cd328d8
LP
1135 if (arg_ephemeral && arg_template && !arg_directory) {
1136 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1137 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1138 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1139 * --directory=". */
1140
1141 arg_directory = arg_template;
1142 arg_template = NULL;
1143 }
1144
ec16945e
LP
1145 if (arg_template && !(arg_directory || arg_machine)) {
1146 log_error("--template= needs --directory= or --machine=.");
1147 return -EINVAL;
1148 }
1149
1150 if (arg_ephemeral && arg_template) {
1151 log_error("--ephemeral and --template= may not be combined.");
1152 return -EINVAL;
1153 }
1154
df9a75e4
LP
1155 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1156 log_error("--ephemeral and --link-journal= may not be combined.");
1157 return -EINVAL;
1158 }
1159
ccabee0d 1160 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1161 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1162 return -EOPNOTSUPP;
1163 }
1164
1165 if (arg_userns_chown && arg_read_only) {
1166 log_error("--read-only and --private-users-chown may not be combined.");
1167 return -EINVAL;
1168 }
f757855e 1169
22b28dfd
LP
1170 if (arg_network_bridge && arg_network_zone) {
1171 log_error("--network-bridge= and --network-zone= may not be combined.");
1172 return -EINVAL;
1173 }
1174
f757855e
LP
1175 if (argc > optind) {
1176 arg_parameters = strv_copy(argv + optind);
1177 if (!arg_parameters)
1178 return log_oom();
1179
7732f92b 1180 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1181 }
1182
1183 /* Load all settings from .nspawn files */
1184 if (mask_no_settings)
1185 arg_settings_mask = 0;
1186
1187 /* Don't load any settings from .nspawn files */
1188 if (mask_all_settings)
1189 arg_settings_mask = _SETTINGS_MASK_ALL;
1190
520e0d54 1191 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1192
399e391f
ZJS
1193 r = cg_unified_flush();
1194 if (r < 0)
1195 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
1196
6aadfa4c
ILG
1197 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1198 if (e)
1199 arg_container_service_name = e;
1200
5a8ff0e6
CB
1201 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1202 if (r < 0)
1203 arg_use_cgns = cg_ns_supported();
1204 else
1205 arg_use_cgns = r;
1206
86c0dd4a
LP
1207 r = custom_mount_check_all();
1208 if (r < 0)
1209 return r;
1210
f757855e
LP
1211 return 1;
1212}
1213
1214static int verify_arguments(void) {
4f086aab
SU
1215 if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
1216 log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1217 return -EINVAL;
1218 }
1219
1220 if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
1221 log_error("Cannot combine --private-users with read-write mounts.");
1222 return -EINVAL;
1223 }
f757855e
LP
1224
1225 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1226 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1227 return -EINVAL;
1228 }
1229
6d0b55c2
LP
1230 if (arg_expose_ports && !arg_private_network) {
1231 log_error("Cannot use --port= without private networking.");
1232 return -EINVAL;
1233 }
1234
349cc4a5 1235#if ! HAVE_LIBIPTC
1c1ea217
EV
1236 if (arg_expose_ports) {
1237 log_error("--port= is not supported, compiled without libiptc support.");
1238 return -EOPNOTSUPP;
1239 }
1240#endif
1241
7732f92b 1242 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1243 arg_kill_signal = SIGRTMIN+3;
1244
f757855e 1245 return 0;
88213476
LP
1246}
1247
03cfe0d5
LP
1248static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1249 assert(p);
1250
0de7acce 1251 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1252 return 0;
1253
1254 if (uid == UID_INVALID && gid == GID_INVALID)
1255 return 0;
1256
1257 if (uid != UID_INVALID) {
1258 uid += arg_uid_shift;
1259
1260 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1261 return -EOVERFLOW;
1262 }
1263
1264 if (gid != GID_INVALID) {
1265 gid += (gid_t) arg_uid_shift;
1266
1267 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1268 return -EOVERFLOW;
1269 }
1270
1271 if (lchown(p, uid, gid) < 0)
1272 return -errno;
b12afc8c
LP
1273
1274 return 0;
1275}
1276
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it was newly created,
 * hands it to userns_lchown() with the given uid/gid.
 *
 * An already existing directory is left untouched (no chown) and reported as success.
 * Returns 0 on success, -errno from mkdir() or the error returned by userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        /* Somebody else created it already? Fine with us. */
        return errno == EEXIST ? 0 : -errno;
}
1289
e58a1277 1290static int setup_timezone(const char *dest) {
03cfe0d5
LP
1291 _cleanup_free_ char *p = NULL, *q = NULL;
1292 const char *where, *check, *what;
d4036145
LP
1293 char *z, *y;
1294 int r;
f8440af5 1295
e58a1277
LP
1296 assert(dest);
1297
1298 /* Fix the timezone, if possible */
d4036145
LP
1299 r = readlink_malloc("/etc/localtime", &p);
1300 if (r < 0) {
0b493a02
MP
1301 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1302 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1303 * with a symbolic link to a time zone data file.
0b493a02
MP
1304 *
1305 * Example:
21dc0227 1306 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1307 */
d4036145
LP
1308 return 0;
1309 }
1310
1311 z = path_startswith(p, "../usr/share/zoneinfo/");
1312 if (!z)
1313 z = path_startswith(p, "/usr/share/zoneinfo/");
1314 if (!z) {
1315 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1316 return 0;
1317 }
1318
03cfe0d5 1319 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1320 r = readlink_malloc(where, &q);
1321 if (r >= 0) {
1322 y = path_startswith(q, "../usr/share/zoneinfo/");
1323 if (!y)
1324 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1325
d4036145
LP
1326 /* Already pointing to the right place? Then do nothing .. */
1327 if (y && streq(y, z))
1328 return 0;
1329 }
1330
03cfe0d5 1331 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1332 check = prefix_roota(dest, check);
03cfe0d5 1333 if (laccess(check, F_OK) < 0) {
d4036145
LP
1334 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1335 return 0;
1336 }
68fb0892 1337
8ccf7e9e
LP
1338 if (unlink(where) < 0 && errno != ENOENT) {
1339 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
1340 errno,
1341 "Failed to remove existing timezone info %s in container, ignoring: %m", where);
79d80fc1
TG
1342 return 0;
1343 }
4d9f07b4 1344
03cfe0d5 1345 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1346 if (symlink(what, where) < 0) {
8ccf7e9e
LP
1347 log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
1348 errno,
1349 "Failed to correct timezone of container, ignoring: %m");
d4036145
LP
1350 return 0;
1351 }
e58a1277 1352
03cfe0d5
LP
1353 r = userns_lchown(where, 0, 0);
1354 if (r < 0)
1355 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1356
e58a1277 1357 return 0;
88213476
LP
1358}
1359
7357272e 1360static int resolved_listening(void) {
b053cd5f 1361 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
7357272e 1362 _cleanup_free_ char *dns_stub_listener_mode = NULL;
b053cd5f
LP
1363 int r;
1364
7357272e 1365 /* Check if resolved is listening */
b053cd5f
LP
1366
1367 r = sd_bus_open_system(&bus);
1368 if (r < 0)
1369 return r;
1370
7357272e
DM
1371 r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
1372 if (r <= 0)
1373 return r;
1374
1375 r = sd_bus_get_property_string(bus,
1376 "org.freedesktop.resolve1",
1377 "/org/freedesktop/resolve1",
1378 "org.freedesktop.resolve1.Manager",
1379 "DNSStubListener",
1380 NULL,
1381 &dns_stub_listener_mode);
1382 if (r < 0)
1383 return r;
1384
1385 return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
b053cd5f
LP
1386}
1387
2547bb41 1388static int setup_resolv_conf(const char *dest) {
87447ae4
LP
1389 _cleanup_free_ char *resolved = NULL, *etc = NULL;
1390 const char *where;
1391 int r, found;
2547bb41
LP
1392
1393 assert(dest);
1394
1395 if (arg_private_network)
1396 return 0;
1397
87447ae4
LP
1398 r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc);
1399 if (r < 0) {
1400 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
1401 return 0;
1402 }
1403
1404 where = strjoina(etc, "/resolv.conf");
1405 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved);
1406 if (found < 0) {
1407 log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
1408 return 0;
1409 }
79d80fc1 1410
b053cd5f 1411 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 &&
7357272e 1412 resolved_listening() > 0) {
87447ae4 1413
3539724c
LP
1414 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1415 * container, so that the container can use the host's resolver. Given that network namespacing is
1416 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1417 * advantage that the container will be able to follow the host's DNS server configuration changes
1418 * transparently. */
1419
87447ae4
LP
1420 if (found == 0) /* missing? */
1421 (void) touch(resolved);
5367354d 1422
87447ae4 1423 r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL);
60e76d48 1424 if (r >= 0)
87447ae4 1425 return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1426 }
1427
1428 /* If that didn't work, let's copy the file */
1c876927 1429 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0, COPY_REFLINK);
79d80fc1 1430 if (r < 0) {
3539724c
LP
1431 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1432 * resolved or something similar runs inside and the symlink points there.
68a313c5 1433 *
3539724c 1434 * If the disk image is read-only, there's also no point in complaining.
68a313c5 1435 */
87447ae4 1436 log_full_errno(IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1437 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1438 return 0;
1439 }
2547bb41 1440
03cfe0d5
LP
1441 r = userns_lchown(where, 0, 0);
1442 if (r < 0)
3539724c 1443 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1444
2547bb41
LP
1445 return 0;
1446}
1447
04bc4a3f 1448static int setup_boot_id(const char *dest) {
3bbaff3e 1449 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1450 const char *from, *to;
04bc4a3f
LP
1451 int r;
1452
04bc4a3f
LP
1453 /* Generate a new randomized boot ID, so that each boot-up of
1454 * the container gets a new one */
1455
03cfe0d5
LP
1456 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1457 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1458
1459 r = sd_id128_randomize(&rnd);
f647962d
MS
1460 if (r < 0)
1461 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1462
15b1248a 1463 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1464 if (r < 0)
1465 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1466
60e76d48
ZJS
1467 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1468 if (r >= 0)
1469 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1470 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1471
3bbaff3e 1472 (void) unlink(from);
04bc4a3f
LP
1473 return r;
1474}
1475
e58a1277 1476static int copy_devnodes(const char *dest) {
88213476
LP
1477
1478 static const char devnodes[] =
1479 "null\0"
1480 "zero\0"
1481 "full\0"
1482 "random\0"
1483 "urandom\0"
85614d66
TG
1484 "tty\0"
1485 "net/tun\0";
88213476
LP
1486
1487 const char *d;
e58a1277 1488 int r = 0;
7fd1b19b 1489 _cleanup_umask_ mode_t u;
a258bf26
LP
1490
1491 assert(dest);
124640f1
LP
1492
1493 u = umask(0000);
88213476 1494
03cfe0d5
LP
1495 /* Create /dev/net, so that we can create /dev/net/tun in it */
1496 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1497 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1498
88213476 1499 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1500 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1501 struct stat st;
88213476 1502
7f112f50 1503 from = strappend("/dev/", d);
03cfe0d5 1504 to = prefix_root(dest, from);
88213476
LP
1505
1506 if (stat(from, &st) < 0) {
1507
4a62c710
MS
1508 if (errno != ENOENT)
1509 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1510
a258bf26 1511 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1512
03cfe0d5 1513 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1514 return -EIO;
a258bf26 1515
85614d66 1516 } else {
81f5049b 1517 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
8dbf71ec 1518 /* Explicitly warn the user when /dev is already populated. */
41eb4362 1519 if (errno == EEXIST)
8dbf71ec 1520 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
81f5049b
AC
1521 if (errno != EPERM)
1522 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1523
1524 /* Some systems abusively restrict mknod but
1525 * allow bind mounts. */
1526 r = touch(to);
1527 if (r < 0)
1528 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1529 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1530 if (r < 0)
1531 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1532 }
6278cf60 1533
03cfe0d5
LP
1534 r = userns_lchown(to, 0, 0);
1535 if (r < 0)
1536 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1537 }
88213476
LP
1538 }
1539
e58a1277
LP
1540 return r;
1541}
88213476 1542
03cfe0d5
LP
1543static int setup_pts(const char *dest) {
1544 _cleanup_free_ char *options = NULL;
1545 const char *p;
709f6e46 1546 int r;
03cfe0d5 1547
349cc4a5 1548#if HAVE_SELINUX
03cfe0d5
LP
1549 if (arg_selinux_apifs_context)
1550 (void) asprintf(&options,
3dce8915 1551 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1552 arg_uid_shift + TTY_GID,
1553 arg_selinux_apifs_context);
1554 else
1555#endif
1556 (void) asprintf(&options,
3dce8915 1557 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1558 arg_uid_shift + TTY_GID);
f2d88580 1559
03cfe0d5 1560 if (!options)
f2d88580
LP
1561 return log_oom();
1562
03cfe0d5 1563 /* Mount /dev/pts itself */
cc9fce65 1564 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1565 if (mkdir(p, 0755) < 0)
1566 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1567 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1568 if (r < 0)
1569 return r;
709f6e46
MS
1570 r = userns_lchown(p, 0, 0);
1571 if (r < 0)
1572 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1573
1574 /* Create /dev/ptmx symlink */
1575 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1576 if (symlink("pts/ptmx", p) < 0)
1577 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1578 r = userns_lchown(p, 0, 0);
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1581
03cfe0d5
LP
1582 /* And fix /dev/pts/ptmx ownership */
1583 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1584 r = userns_lchown(p, 0, 0);
1585 if (r < 0)
1586 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1587
f2d88580
LP
1588 return 0;
1589}
1590
e58a1277 1591static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1592 _cleanup_umask_ mode_t u;
1593 const char *to;
e58a1277 1594 int r;
e58a1277
LP
1595
1596 assert(dest);
1597 assert(console);
1598
1599 u = umask(0000);
1600
03cfe0d5 1601 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1602 if (r < 0)
1603 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1604
a258bf26
LP
1605 /* We need to bind mount the right tty to /dev/console since
1606 * ptys can only exist on pts file systems. To have something
81f5049b 1607 * to bind mount things on we create a empty regular file. */
a258bf26 1608
03cfe0d5 1609 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1610 r = touch(to);
1611 if (r < 0)
1612 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1613
60e76d48 1614 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1615}
1616
8e5430c4
LP
1617static int setup_keyring(void) {
1618 key_serial_t keyring;
1619
1620 /* Allocate a new session keyring for the container. This makes sure the keyring of the session systemd-nspawn
1621 * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key()
1622 * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist
1623 * these system calls let's make sure we don't leak anything into the container. */
1624
1625 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
1626 if (keyring == -1) {
1627 if (errno == ENOSYS)
1628 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
1629 else if (IN_SET(errno, EACCES, EPERM))
1630 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
1631 else
1632 return log_error_errno(errno, "Setting up kernel keyring failed: %m");
1633 }
1634
1635 return 0;
1636}
1637
e58a1277 1638static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1639 const char *from, *to;
7fd1b19b 1640 _cleanup_umask_ mode_t u;
d9603714 1641 int fd, r;
e58a1277 1642
e58a1277 1643 assert(kmsg_socket >= 0);
a258bf26 1644
e58a1277 1645 u = umask(0000);
a258bf26 1646
03cfe0d5 1647 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1648 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1649 * on the reading side behave very similar to /proc/kmsg,
1650 * their writing side behaves differently from /dev/kmsg in
1651 * that writing blocks when nothing is reading. In order to
1652 * avoid any problems with containers deadlocking due to this
1653 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1654 from = prefix_roota(dest, "/run/kmsg");
1655 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1656
4a62c710 1657 if (mkfifo(from, 0600) < 0)
03cfe0d5 1658 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1659 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1660 if (r < 0)
1661 return r;
e58a1277
LP
1662
1663 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1664 if (fd < 0)
1665 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1666
e58a1277
LP
1667 /* Store away the fd in the socket, so that it stays open as
1668 * long as we run the child */
3ee897d6 1669 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1670 safe_close(fd);
e58a1277 1671
d9603714
DH
1672 if (r < 0)
1673 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1674
03cfe0d5
LP
1675 /* And now make the FIFO unavailable as /run/kmsg... */
1676 (void) unlink(from);
1677
25ea79fe 1678 return 0;
88213476
LP
1679}
1680
1c4baffc 1681static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1682 union in_addr_union *exposed = userdata;
1683
1684 assert(rtnl);
1685 assert(m);
1686 assert(exposed);
1687
7a8f6325 1688 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1689 return 0;
1690}
1691
3a74cea5 1692static int setup_hostname(void) {
3a74cea5 1693
0c582db0 1694 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1695 return 0;
1696
605f81a8 1697 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1698 return -errno;
3a74cea5 1699
7027ff61 1700 return 0;
3a74cea5
LP
1701}
1702
57fb9fb5 1703static int setup_journal(const char *directory) {
e01ff70a 1704 sd_id128_t this_id;
0f5e1382 1705 _cleanup_free_ char *d = NULL;
e01ff70a 1706 const char *p, *q;
8054d749 1707 bool try;
e01ff70a 1708 char id[33];
57fb9fb5
LP
1709 int r;
1710
df9a75e4
LP
1711 /* Don't link journals in ephemeral mode */
1712 if (arg_ephemeral)
1713 return 0;
1714
8054d749
LP
1715 if (arg_link_journal == LINK_NO)
1716 return 0;
1717
1718 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1719
4d680aee 1720 r = sd_id128_get_machine(&this_id);
f647962d
MS
1721 if (r < 0)
1722 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1723
e01ff70a 1724 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1725 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1726 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1727 if (try)
4d680aee 1728 return 0;
df9a75e4 1729 return -EEXIST;
4d680aee
ZJS
1730 }
1731
03cfe0d5
LP
1732 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1733 if (r < 0)
1734 return log_error_errno(r, "Failed to create /var: %m");
1735
1736 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1737 if (r < 0)
1738 return log_error_errno(r, "Failed to create /var/log: %m");
1739
1740 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1741 if (r < 0)
1742 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1743
e01ff70a
MS
1744 (void) sd_id128_to_string(arg_uuid, id);
1745
03cfe0d5
LP
1746 p = strjoina("/var/log/journal/", id);
1747 q = prefix_roota(directory, p);
27407a01 1748
e1873695 1749 if (path_is_mount_point(p, NULL, 0) > 0) {
8054d749
LP
1750 if (try)
1751 return 0;
27407a01 1752
8054d749
LP
1753 log_error("%s: already a mount point, refusing to use for journal", p);
1754 return -EEXIST;
57fb9fb5
LP
1755 }
1756
e1873695 1757 if (path_is_mount_point(q, NULL, 0) > 0) {
8054d749
LP
1758 if (try)
1759 return 0;
57fb9fb5 1760
8054d749
LP
1761 log_error("%s: already a mount point, refusing to use for journal", q);
1762 return -EEXIST;
57fb9fb5
LP
1763 }
1764
1765 r = readlink_and_make_absolute(p, &d);
1766 if (r >= 0) {
3742095b 1767 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
57fb9fb5
LP
1768 path_equal(d, q)) {
1769
03cfe0d5 1770 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1771 if (r < 0)
709f6e46 1772 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1773 return 0;
57fb9fb5
LP
1774 }
1775
4a62c710
MS
1776 if (unlink(p) < 0)
1777 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1778 } else if (r == -EINVAL) {
1779
1780 if (arg_link_journal == LINK_GUEST &&
1781 rmdir(p) < 0) {
1782
27407a01
ZJS
1783 if (errno == ENOTDIR) {
1784 log_error("%s already exists and is neither a symlink nor a directory", p);
1785 return r;
4314d33f
MS
1786 } else
1787 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1788 }
4314d33f
MS
1789 } else if (r != -ENOENT)
1790 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1791
1792 if (arg_link_journal == LINK_GUEST) {
1793
1794 if (symlink(q, p) < 0) {
8054d749 1795 if (try) {
56f64d95 1796 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1797 return 0;
4314d33f
MS
1798 } else
1799 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1800 }
1801
03cfe0d5 1802 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1803 if (r < 0)
709f6e46 1804 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1805 return 0;
57fb9fb5
LP
1806 }
1807
1808 if (arg_link_journal == LINK_HOST) {
ccddd104 1809 /* don't create parents here — if the host doesn't have
574edc90 1810 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1811
1812 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1813 if (try) {
56f64d95 1814 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1815 return 0;
4314d33f
MS
1816 } else
1817 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1818 }
1819
27407a01
ZJS
1820 } else if (access(p, F_OK) < 0)
1821 return 0;
57fb9fb5 1822
cdb2b9d0
LP
1823 if (dir_is_empty(q) == 0)
1824 log_warning("%s is not empty, proceeding anyway.", q);
1825
03cfe0d5 1826 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1827 if (r < 0)
1828 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1829
60e76d48
ZJS
1830 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1831 if (r < 0)
4a62c710 1832 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1833
27407a01 1834 return 0;
57fb9fb5
LP
1835}
1836
88213476 1837static int drop_capabilities(void) {
520e0d54 1838 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1839}
1840
db999e0f
LP
1841static int reset_audit_loginuid(void) {
1842 _cleanup_free_ char *p = NULL;
1843 int r;
1844
0c582db0 1845 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1846 return 0;
1847
1848 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1849 if (r == -ENOENT)
db999e0f 1850 return 0;
f647962d
MS
1851 if (r < 0)
1852 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1853
1854 /* Already reset? */
1855 if (streq(p, "4294967295"))
1856 return 0;
1857
ad118bda 1858 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1859 if (r < 0) {
10a87006
LP
1860 log_error_errno(r,
1861 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1862 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1863 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1864 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1865 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1866
db999e0f 1867 sleep(5);
77b6e194 1868 }
db999e0f
LP
1869
1870 return 0;
77b6e194
LP
1871}
1872
24fb1112 1873
785890ac
LP
1874static int setup_propagate(const char *root) {
1875 const char *p, *q;
709f6e46 1876 int r;
785890ac
LP
1877
1878 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1879 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1880 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1881 (void) mkdir_p(p, 0600);
1882
709f6e46
MS
1883 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1884 if (r < 0)
1885 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1886
709f6e46
MS
1887 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1888 if (r < 0)
1889 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1890
709f6e46
MS
1891 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1892 if (r < 0)
1893 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1894
03cfe0d5 1895 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1896 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1897 if (r < 0)
1898 return r;
785890ac 1899
60e76d48
ZJS
1900 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1901 if (r < 0)
1902 return r;
785890ac 1903
19caffac
AC
1904 /* machined will MS_MOVE into that directory, and that's only
1905 * supported for non-shared mounts. */
60e76d48 1906 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1907}
1908
317feb4d 1909static int setup_machine_id(const char *directory) {
691675ba
LP
1910 const char *etc_machine_id;
1911 sd_id128_t id;
3bbaff3e 1912 int r;
e01ff70a 1913
317feb4d
LP
1914 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
1915 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
1916 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
1917 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
1918 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
1919 * container behaves nicely). */
1920
e01ff70a
MS
1921 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1922
691675ba 1923 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
1924 if (r < 0) {
1925 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
1926 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 1927
317feb4d
LP
1928 if (sd_id128_is_null(arg_uuid)) {
1929 r = sd_id128_randomize(&arg_uuid);
1930 if (r < 0)
1931 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
1932 }
1933 } else {
1934 if (sd_id128_is_null(id)) {
1935 log_error("Machine ID in container image is zero, refusing.");
1936 return -EINVAL;
1937 }
e01ff70a 1938
317feb4d
LP
1939 arg_uuid = id;
1940 }
691675ba 1941
e01ff70a
MS
1942 return 0;
1943}
1944
7336138e
LP
1945static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
1946 int r;
1947
1948 assert(directory);
1949
0de7acce 1950 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
1951 return 0;
1952
1953 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
1954 if (r == -EOPNOTSUPP)
1955 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
1956 if (r == -EBADE)
1957 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
1958 if (r < 0)
1959 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
1960 if (r == 0)
1961 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
1962 else
1963 log_debug("Patched directory tree to match UID/GID range.");
1964
1965 return r;
1966}
1967
113cea80 1968/*
6d416b9c
LS
1969 * Return values:
1970 * < 0 : wait_for_terminate() failed to get the state of the
1971 * container, the container was terminated by a signal, or
1972 * failed for an unknown reason. No change is made to the
1973 * container argument.
1974 * > 0 : The program executed in the container terminated with an
1975 * error. The exit code of the program executed in the
919699ec
LP
1976 * container is returned. The container argument has been set
1977 * to CONTAINER_TERMINATED.
6d416b9c
LS
1978 * 0 : The container is being rebooted, has been shut down or exited
1979 * successfully. The container argument has been set to either
1980 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 1981 *
6d416b9c
LS
1982 * That is, success is indicated by a return value of zero, and an
1983 * error is indicated by a non-zero value.
113cea80
DH
1984 */
1985static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 1986 siginfo_t status;
919699ec 1987 int r;
113cea80
DH
1988
1989 r = wait_for_terminate(pid, &status);
f647962d
MS
1990 if (r < 0)
1991 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
1992
1993 switch (status.si_code) {
fddbb89c 1994
113cea80 1995 case CLD_EXITED:
b5a2179b 1996 if (status.si_status == 0)
919699ec 1997 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 1998 else
919699ec 1999 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2000
919699ec
LP
2001 *container = CONTAINER_TERMINATED;
2002 return status.si_status;
113cea80
DH
2003
2004 case CLD_KILLED:
2005 if (status.si_status == SIGINT) {
919699ec 2006 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2007 *container = CONTAINER_TERMINATED;
919699ec
LP
2008 return 0;
2009
113cea80 2010 } else if (status.si_status == SIGHUP) {
919699ec 2011 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2012 *container = CONTAINER_REBOOTED;
919699ec 2013 return 0;
113cea80 2014 }
919699ec 2015
ec251fe7 2016 /* fall through */
113cea80
DH
2017
2018 case CLD_DUMPED:
fddbb89c 2019 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2020 return -EIO;
113cea80
DH
2021
2022 default:
fddbb89c 2023 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2024 return -EIO;
113cea80 2025 }
113cea80
DH
2026}
2027
023fb90b
LP
2028static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2029 pid_t pid;
2030
4a0b58c4 2031 pid = PTR_TO_PID(userdata);
023fb90b 2032 if (pid > 0) {
c6c8f6e2 2033 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2034 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2035 sd_event_source_set_userdata(s, NULL);
2036 return 0;
2037 }
2038 }
2039
2040 sd_event_exit(sd_event_source_get_event(s), 0);
2041 return 0;
2042}
2043
6916b164
AU
2044static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
2045 for (;;) {
2046 siginfo_t si = {};
2047 if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0)
2048 return log_error_errno(errno, "Failed to waitid(): %m");
2049 if (si.si_pid == 0) /* No pending children. */
2050 break;
2051 if (si.si_pid == PTR_TO_PID(userdata)) {
2052 /* The main process we care for has exited. Return from
2053 * signal handler but leave the zombie. */
2054 sd_event_exit(sd_event_source_get_event(s), 0);
2055 break;
2056 }
2057 /* Reap all other children. */
2058 (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED);
2059 }
2060
2061 return 0;
2062}
2063
ec16945e 2064static int determine_names(void) {
1b9cebf6 2065 int r;
ec16945e 2066
c1521918
LP
2067 if (arg_template && !arg_directory && arg_machine) {
2068
2069 /* If --template= was specified then we should not
2070 * search for a machine, but instead create a new one
2071 * in /var/lib/machine. */
2072
605405c6 2073 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2074 if (!arg_directory)
2075 return log_oom();
2076 }
2077
ec16945e 2078 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2079 if (arg_machine) {
2080 _cleanup_(image_unrefp) Image *i = NULL;
2081
2082 r = image_find(arg_machine, &i);
2083 if (r < 0)
2084 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
0f3be6ca 2085 if (r == 0) {
35bca925 2086 log_error("No image for machine '%s'.", arg_machine);
1b9cebf6
LP
2087 return -ENOENT;
2088 }
2089
eb38edce 2090 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
0f03c2a4 2091 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2092 else
0f03c2a4 2093 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6 2094 if (r < 0)
0f3be6ca 2095 return log_oom();
1b9cebf6 2096
aee327b8
LP
2097 if (!arg_ephemeral)
2098 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2099 } else
ec16945e
LP
2100 arg_directory = get_current_dir_name();
2101
0f3be6ca 2102 if (!arg_directory && !arg_image) {
1b9cebf6 2103 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2104 return -EINVAL;
2105 }
2106 }
2107
2108 if (!arg_machine) {
4827ab48 2109
b9ba4dab
LP
2110 if (arg_directory && path_equal(arg_directory, "/"))
2111 arg_machine = gethostname_malloc();
4827ab48
LP
2112 else {
2113 if (arg_image) {
2114 char *e;
2115
2116 arg_machine = strdup(basename(arg_image));
2117
2118 /* Truncate suffix if there is one */
2119 e = endswith(arg_machine, ".raw");
2120 if (e)
2121 *e = 0;
2122 } else
2123 arg_machine = strdup(basename(arg_directory));
2124 }
ec16945e
LP
2125 if (!arg_machine)
2126 return log_oom();
2127
ae691c1d 2128 hostname_cleanup(arg_machine);
ec16945e
LP
2129 if (!machine_name_is_valid(arg_machine)) {
2130 log_error("Failed to determine machine name automatically, please use -M.");
2131 return -EINVAL;
2132 }
b9ba4dab
LP
2133
2134 if (arg_ephemeral) {
2135 char *b;
2136
2137 /* Add a random suffix when this is an
2138 * ephemeral machine, so that we can run many
2139 * instances at once without manually having
2140 * to specify -M each time. */
2141
2142 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2143 return log_oom();
2144
2145 free(arg_machine);
2146 arg_machine = b;
2147 }
ec16945e
LP
2148 }
2149
2150 return 0;
2151}
2152
/* Resolves the path stored in *p via chase_symlinks() (no root directory, the given 'flags') and
 * replaces *p by the newly allocated result, freeing the old string. A NULL *p is left alone.
 *
 * Returns 0 on success, negative errno (after logging) if the path cannot be resolved; *p is unchanged
 * in that case. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve path %s: %m", *p);

        /* Swap in the resolved path. */
        free(*p);
        *p = resolved;

        return 0;
}
2171
03cfe0d5 2172static int determine_uid_shift(const char *directory) {
6dac160c
LP
2173 int r;
2174
0de7acce 2175 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2176 arg_uid_shift = 0;
6dac160c 2177 return 0;
03cfe0d5 2178 }
6dac160c
LP
2179
2180 if (arg_uid_shift == UID_INVALID) {
2181 struct stat st;
2182
03cfe0d5 2183 r = stat(directory, &st);
6dac160c 2184 if (r < 0)
03cfe0d5 2185 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2186
2187 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2188
2189 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2190 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2191 return -EINVAL;
2192 }
2193
2194 arg_uid_range = UINT32_C(0x10000);
2195 }
2196
2197 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2198 log_error("UID base too high for UID range.");
2199 return -EINVAL;
2200 }
2201
6dac160c
LP
2202 return 0;
2203}
2204
03cfe0d5
LP
2205static int inner_child(
2206 Barrier *barrier,
2207 const char *directory,
2208 bool secondary,
2209 int kmsg_socket,
2210 int rtnl_socket,
f757855e 2211 FDSet *fds) {
69c79d3c 2212
03cfe0d5 2213 _cleanup_free_ char *home = NULL;
e01ff70a 2214 char as_uuid[37];
6aadfa4c 2215 unsigned n_env = 1;
03cfe0d5
LP
2216 const char *envp[] = {
2217 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2218 NULL, /* container */
03cfe0d5
LP
2219 NULL, /* TERM */
2220 NULL, /* HOME */
2221 NULL, /* USER */
2222 NULL, /* LOGNAME */
2223 NULL, /* container_uuid */
2224 NULL, /* LISTEN_FDS */
2225 NULL, /* LISTEN_PID */
9c1e04d0 2226 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2227 NULL
2228 };
1a68e1e5 2229 const char *exec_target;
88213476 2230
2371271c 2231 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2232 int r;
88213476 2233
03cfe0d5
LP
2234 assert(barrier);
2235 assert(directory);
2236 assert(kmsg_socket >= 0);
88213476 2237
0de7acce 2238 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2239 /* Tell the parent, that it now can write the UID map. */
2240 (void) barrier_place(barrier); /* #1 */
7027ff61 2241
03cfe0d5
LP
2242 /* Wait until the parent wrote the UID map */
2243 if (!barrier_place_and_sync(barrier)) { /* #2 */
2244 log_error("Parent died too early");
2245 return -ESRCH;
2246 }
88213476
LP
2247 }
2248
6d66bd3b
EV
2249 r = reset_uid_gid();
2250 if (r < 0)
2251 return log_error_errno(r, "Couldn't become new root: %m");
2252
0de7acce 2253 r = mount_all(NULL,
4f086aab 2254 arg_mount_settings | MOUNT_IN_USERNS,
0de7acce
LP
2255 arg_uid_shift,
2256 arg_uid_range,
2257 arg_selinux_apifs_context);
2258
03cfe0d5
LP
2259 if (r < 0)
2260 return r;
2261
4f086aab 2262 r = mount_sysfs(NULL, arg_mount_settings);
d8fc6a00
LP
2263 if (r < 0)
2264 return r;
2265
03cfe0d5
LP
2266 /* Wait until we are cgroup-ified, so that we
2267 * can mount the right cgroup path writable */
2268 if (!barrier_place_and_sync(barrier)) { /* #3 */
2269 log_error("Parent died too early");
2270 return -ESRCH;
88213476
LP
2271 }
2272
5a8ff0e6 2273 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2274 r = unshare(CLONE_NEWCGROUP);
2275 if (r < 0)
2276 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2277 r = mount_cgroups(
2278 "",
2279 arg_unified_cgroup_hierarchy,
2280 arg_userns_mode != USER_NAMESPACE_NO,
2281 arg_uid_shift,
2282 arg_uid_range,
5a8ff0e6 2283 arg_selinux_apifs_context,
ada54120 2284 true);
0996ef00
CB
2285 if (r < 0)
2286 return r;
2287 } else {
2288 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2289 if (r < 0)
2290 return r;
2291 }
ec16945e 2292
03cfe0d5
LP
2293 r = setup_boot_id(NULL);
2294 if (r < 0)
2295 return r;
ec16945e 2296
03cfe0d5
LP
2297 r = setup_kmsg(NULL, kmsg_socket);
2298 if (r < 0)
2299 return r;
2300 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2301
03cfe0d5 2302 umask(0022);
30535c16 2303
03cfe0d5
LP
2304 if (setsid() < 0)
2305 return log_error_errno(errno, "setsid() failed: %m");
2306
2307 if (arg_private_network)
2308 loopback_setup();
2309
7a8f6325
LP
2310 if (arg_expose_ports) {
2311 r = expose_port_send_rtnl(rtnl_socket);
2312 if (r < 0)
2313 return r;
2314 rtnl_socket = safe_close(rtnl_socket);
2315 }
03cfe0d5 2316
709f6e46
MS
2317 r = drop_capabilities();
2318 if (r < 0)
2319 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2320
2321 setup_hostname();
2322
050f7277 2323 if (arg_personality != PERSONALITY_INVALID) {
21022b9d
LP
2324 r = safe_personality(arg_personality);
2325 if (r < 0)
2326 return log_error_errno(r, "personality() failed: %m");
03cfe0d5 2327 } else if (secondary) {
21022b9d
LP
2328 r = safe_personality(PER_LINUX32);
2329 if (r < 0)
2330 return log_error_errno(r, "personality() failed: %m");
03cfe0d5
LP
2331 }
2332
349cc4a5 2333#if HAVE_SELINUX
03cfe0d5 2334 if (arg_selinux_context)
2ed96880 2335 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2336 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2337#endif
2338
ee645080 2339 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2340 if (r < 0)
2341 return r;
2342
6aadfa4c
ILG
2343 /* LXC sets container=lxc, so follow the scheme here */
2344 envp[n_env++] = strjoina("container=", arg_container_service_name);
2345
03cfe0d5
LP
2346 envp[n_env] = strv_find_prefix(environ, "TERM=");
2347 if (envp[n_env])
313cefa1 2348 n_env++;
03cfe0d5
LP
2349
2350 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2351 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2352 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2353 return log_oom();
2354
3bbaff3e 2355 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2356
691675ba 2357 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2358 return log_oom();
03cfe0d5
LP
2359
2360 if (fdset_size(fds) > 0) {
2361 r = fdset_cloexec(fds, false);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2364
2365 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2366 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2367 return log_oom();
2368 }
9c1e04d0
AP
2369 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2370 return log_oom();
03cfe0d5 2371
2371271c
TG
2372 env_use = strv_env_merge(2, envp, arg_setenv);
2373 if (!env_use)
2374 return log_oom();
03cfe0d5
LP
2375
2376 /* Let the parent know that we are ready and
2377 * wait until the parent is ready with the
2378 * setup, too... */
2379 if (!barrier_place_and_sync(barrier)) { /* #4 */
2380 log_error("Parent died too early");
2381 return -ESRCH;
2382 }
2383
5f932eb9
LP
2384 if (arg_chdir)
2385 if (chdir(arg_chdir) < 0)
2386 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2387
7732f92b 2388 if (arg_start_mode == START_PID2) {
75bf701f 2389 r = stub_pid1(arg_uuid);
7732f92b
LP
2390 if (r < 0)
2391 return r;
2392 }
2393
03cfe0d5
LP
2394 /* Now, explicitly close the log, so that we
2395 * then can close all remaining fds. Closing
2396 * the log explicitly first has the benefit
2397 * that the logging subsystem knows about it,
2398 * and is thus ready to be reopened should we
2399 * need it again. Note that the other fds
2400 * closed here are at least the locking and
2401 * barrier fds. */
2402 log_close();
2403 (void) fdset_close_others(fds);
2404
7732f92b 2405 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2406 char **a;
2407 size_t m;
2408
2409 /* Automatically search for the init system */
2410
75f32f04
ZJS
2411 m = strv_length(arg_parameters);
2412 a = newa(char*, m + 2);
2413 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2414 a[1 + m] = NULL;
03cfe0d5 2415
ced58da7 2416 a[0] = (char*) "/usr/lib/systemd/systemd";
03cfe0d5
LP
2417 execve(a[0], a, env_use);
2418
ced58da7 2419 a[0] = (char*) "/lib/systemd/systemd";
03cfe0d5
LP
2420 execve(a[0], a, env_use);
2421
ced58da7 2422 a[0] = (char*) "/sbin/init";
03cfe0d5 2423 execve(a[0], a, env_use);
ced58da7
LP
2424
2425 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
1a68e1e5
PW
2426 } else if (!strv_isempty(arg_parameters)) {
2427 exec_target = arg_parameters[0];
f757855e 2428 execvpe(arg_parameters[0], arg_parameters, env_use);
1a68e1e5 2429 } else {
5f932eb9 2430 if (!arg_chdir)
d929b0f9
ZJS
2431 /* If we cannot change the directory, we'll end up in /, that is expected. */
2432 (void) chdir(home ?: "/root");
5f932eb9 2433
03cfe0d5
LP
2434 execle("/bin/bash", "-bash", NULL, env_use);
2435 execle("/bin/sh", "-sh", NULL, env_use);
ced58da7
LP
2436
2437 exec_target = "/bin/bash, /bin/sh";
03cfe0d5
LP
2438 }
2439
35607a8d 2440 r = -errno;
03cfe0d5 2441 (void) log_open();
1a68e1e5 2442 return log_error_errno(r, "execv(%s) failed: %m", exec_target);
03cfe0d5
LP
2443}
2444
9c1e04d0
AP
2445static int setup_sd_notify_child(void) {
2446 static const int one = 1;
2447 int fd = -1;
2448 union sockaddr_union sa = {
2449 .sa.sa_family = AF_UNIX,
2450 };
2451 int r;
2452
2453 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2454 if (fd < 0)
2455 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2456
2457 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2458 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2459
2460 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2461 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2462 if (r < 0) {
2463 safe_close(fd);
2464 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2465 }
2466
adc7d9f0
EV
2467 r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
2468 if (r < 0) {
2469 safe_close(fd);
2470 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
2471 }
2472
9c1e04d0
AP
2473 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2474 if (r < 0) {
2475 safe_close(fd);
2476 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2477 }
2478
2479 return fd;
2480}
2481
03cfe0d5
LP
2482static int outer_child(
2483 Barrier *barrier,
2484 const char *directory,
2485 const char *console,
2d845785 2486 DissectedImage *dissected_image,
03cfe0d5
LP
2487 bool interactive,
2488 bool secondary,
2489 int pid_socket,
e01ff70a 2490 int uuid_socket,
9c1e04d0 2491 int notify_socket,
03cfe0d5
LP
2492 int kmsg_socket,
2493 int rtnl_socket,
825d5287 2494 int uid_shift_socket,
f757855e 2495 FDSet *fds) {
03cfe0d5
LP
2496
2497 pid_t pid;
2498 ssize_t l;
2499 int r;
9c1e04d0 2500 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2501
2502 assert(barrier);
2503 assert(directory);
2504 assert(console);
2505 assert(pid_socket >= 0);
e01ff70a 2506 assert(uuid_socket >= 0);
9c1e04d0 2507 assert(notify_socket >= 0);
03cfe0d5
LP
2508 assert(kmsg_socket >= 0);
2509
2510 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2511 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2512
2513 if (interactive) {
2514 close_nointr(STDIN_FILENO);
2515 close_nointr(STDOUT_FILENO);
2516 close_nointr(STDERR_FILENO);
2517
2518 r = open_terminal(console, O_RDWR);
2519 if (r != STDIN_FILENO) {
2520 if (r >= 0) {
2521 safe_close(r);
2522 r = -EINVAL;
2523 }
2524
2525 return log_error_errno(r, "Failed to open console: %m");
2526 }
2527
2528 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2529 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2530 return log_error_errno(errno, "Failed to duplicate console: %m");
2531 }
2532
2533 r = reset_audit_loginuid();
2534 if (r < 0)
2535 return r;
2536
2537 /* Mark everything as slave, so that we still
2538 * receive mounts from the real root, but don't
2539 * propagate mounts to the real root. */
60e76d48
ZJS
2540 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2541 if (r < 0)
2542 return r;
03cfe0d5 2543
2d845785 2544 if (dissected_image) {
18b5886e 2545 r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0));
2d845785
LP
2546 if (r < 0)
2547 return r;
2548 }
03cfe0d5 2549
391567f4
LP
2550 r = determine_uid_shift(directory);
2551 if (r < 0)
2552 return r;
2553
0de7acce 2554 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2555 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2556 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2557 if (l < 0)
2558 return log_error_errno(errno, "Failed to send UID shift: %m");
2559 if (l != sizeof(arg_uid_shift)) {
2560 log_error("Short write while sending UID shift.");
2561 return -EIO;
2562 }
0e7ac751 2563
0de7acce 2564 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2565 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2566 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2567 * not it will pick a different one, and send it back to us. */
2568
2569 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2570 if (l < 0)
2571 return log_error_errno(errno, "Failed to recv UID shift: %m");
2572 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2573 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2574 return -EIO;
2575 }
2576 }
2577
2578 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2579 }
2580
03cfe0d5 2581 /* Turn directory into bind mount */
60e76d48
ZJS
2582 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
2583 if (r < 0)
2584 return r;
03cfe0d5 2585
b53ede69
PW
2586 r = setup_pivot_root(
2587 directory,
2588 arg_pivot_root_new,
2589 arg_pivot_root_old);
2590 if (r < 0)
2591 return r;
2592
0de7acce
LP
2593 r = setup_volatile(
2594 directory,
2595 arg_volatile_mode,
2596 arg_userns_mode != USER_NAMESPACE_NO,
2597 arg_uid_shift,
2598 arg_uid_range,
2599 arg_selinux_context);
03cfe0d5
LP
2600 if (r < 0)
2601 return r;
2602
0de7acce
LP
2603 r = setup_volatile_state(
2604 directory,
2605 arg_volatile_mode,
2606 arg_userns_mode != USER_NAMESPACE_NO,
2607 arg_uid_shift,
2608 arg_uid_range,
2609 arg_selinux_context);
03cfe0d5
LP
2610 if (r < 0)
2611 return r;
2612
4ad14eff
LP
2613 /* Mark everything as shared so our mounts get propagated down. This is
2614 * required to make new bind mounts available in systemd services
2615 * inside the containter that create a new mount namespace.
2616 * See https://github.com/systemd/systemd/issues/3860
2617 * Further submounts (such as /dev) done after this will inherit the
13e785f7 2618 * shared propagation mode. */
4ad14eff
LP
2619 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
2620 if (r < 0)
2621 return r;
2622
2623 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2624 if (r < 0)
2625 return r;
2626
03cfe0d5
LP
2627 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2628 if (r < 0)
2629 return r;
2630
03cfe0d5 2631 if (arg_read_only) {
6b7c9f8b 2632 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
2633 if (r < 0)
2634 return log_error_errno(r, "Failed to make tree read-only: %m");
2635 }
2636
0de7acce 2637 r = mount_all(directory,
4f086aab 2638 arg_mount_settings,
0de7acce
LP
2639 arg_uid_shift,
2640 arg_uid_range,
2641 arg_selinux_apifs_context);
03cfe0d5
LP
2642 if (r < 0)
2643 return r;
2644
07fa00f9
LP
2645 r = copy_devnodes(directory);
2646 if (r < 0)
03cfe0d5
LP
2647 return r;
2648
2649 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2650
07fa00f9
LP
2651 r = setup_pts(directory);
2652 if (r < 0)
03cfe0d5
LP
2653 return r;
2654
2655 r = setup_propagate(directory);
2656 if (r < 0)
2657 return r;
2658
2659 r = setup_dev_console(directory, console);
2660 if (r < 0)
2661 return r;
2662
8e5430c4
LP
2663 r = setup_keyring();
2664 if (r < 0)
2665 return r;
2666
960e4569 2667 r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
03cfe0d5
LP
2668 if (r < 0)
2669 return r;
2670
2671 r = setup_timezone(directory);
2672 if (r < 0)
2673 return r;
2674
2675 r = setup_resolv_conf(directory);
2676 if (r < 0)
2677 return r;
2678
e01ff70a
MS
2679 r = setup_machine_id(directory);
2680 if (r < 0)
2681 return r;
2682
03cfe0d5
LP
2683 r = setup_journal(directory);
2684 if (r < 0)
2685 return r;
2686
0de7acce
LP
2687 r = mount_custom(
2688 directory,
2689 arg_custom_mounts,
2690 arg_n_custom_mounts,
2691 arg_userns_mode != USER_NAMESPACE_NO,
2692 arg_uid_shift,
2693 arg_uid_range,
2694 arg_selinux_apifs_context);
03cfe0d5
LP
2695 if (r < 0)
2696 return r;
2697
5a8ff0e6 2698 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
2699 r = mount_cgroups(
2700 directory,
2701 arg_unified_cgroup_hierarchy,
2702 arg_userns_mode != USER_NAMESPACE_NO,
2703 arg_uid_shift,
2704 arg_uid_range,
5a8ff0e6 2705 arg_selinux_apifs_context,
ada54120 2706 false);
0996ef00
CB
2707 if (r < 0)
2708 return r;
2709 }
03cfe0d5
LP
2710
2711 r = mount_move_root(directory);
2712 if (r < 0)
2713 return log_error_errno(r, "Failed to move root directory: %m");
2714
9c1e04d0
AP
2715 fd = setup_sd_notify_child();
2716 if (fd < 0)
2717 return fd;
2718
03cfe0d5 2719 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 2720 arg_clone_ns_flags |
03cfe0d5 2721 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 2722 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
2723 if (pid < 0)
2724 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2725 if (pid == 0) {
2726 pid_socket = safe_close(pid_socket);
e01ff70a 2727 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2728 notify_socket = safe_close(notify_socket);
825d5287 2729 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2730
2731 /* The inner child has all namespaces that are
2732 * requested, so that we all are owned by the user if
2733 * user namespaces are turned on. */
2734
f757855e 2735 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2736 if (r < 0)
2737 _exit(EXIT_FAILURE);
2738
2739 _exit(EXIT_SUCCESS);
2740 }
2741
2742 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2743 if (l < 0)
2744 return log_error_errno(errno, "Failed to send PID: %m");
2745 if (l != sizeof(pid)) {
2746 log_error("Short write while sending PID.");
2747 return -EIO;
2748 }
2749
e01ff70a
MS
2750 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2751 if (l < 0)
2752 return log_error_errno(errno, "Failed to send machine ID: %m");
2753 if (l != sizeof(arg_uuid)) {
2754 log_error("Short write while sending machine ID.");
2755 return -EIO;
2756 }
2757
9c1e04d0
AP
2758 l = send_one_fd(notify_socket, fd, 0);
2759 if (l < 0)
2760 return log_error_errno(errno, "Failed to send notify fd: %m");
2761
03cfe0d5 2762 pid_socket = safe_close(pid_socket);
e01ff70a 2763 uuid_socket = safe_close(uuid_socket);
9c1e04d0 2764 notify_socket = safe_close(notify_socket);
327e26d6
KN
2765 kmsg_socket = safe_close(kmsg_socket);
2766 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2767
2768 return 0;
2769}
2770
0e7ac751
LP
2771static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
2772 unsigned n_tries = 100;
2773 uid_t candidate;
2774 int r;
2775
2776 assert(shift);
2777 assert(ret_lock_file);
0de7acce 2778 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
2779 assert(arg_uid_range == 0x10000U);
2780
2781 candidate = *shift;
2782
2783 (void) mkdir("/run/systemd/nspawn-uid", 0755);
2784
2785 for (;;) {
2786 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
2787 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
2788
2789 if (--n_tries <= 0)
2790 return -EBUSY;
2791
2792 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
2793 goto next;
2794 if ((candidate & UINT32_C(0xFFFF)) != 0)
2795 goto next;
2796
2797 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
2798 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
2799 if (r == -EBUSY) /* Range already taken by another nspawn instance */
2800 goto next;
2801 if (r < 0)
2802 return r;
2803
2804 /* Make some superficial checks whether the range is currently known in the user database */
2805 if (getpwuid(candidate))
2806 goto next;
2807 if (getpwuid(candidate + UINT32_C(0xFFFE)))
2808 goto next;
2809 if (getgrgid(candidate))
2810 goto next;
2811 if (getgrgid(candidate + UINT32_C(0xFFFE)))
2812 goto next;
2813
2814 *ret_lock_file = lf;
2815 lf = (struct LockFile) LOCK_FILE_INIT;
2816 *shift = candidate;
2817 return 0;
2818
2819 next:
2820 random_bytes(&candidate, sizeof(candidate));
2821 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
2822 candidate &= (uid_t) UINT32_C(0xFFFF0000);
2823 }
2824}
2825
03cfe0d5
LP
2826static int setup_uid_map(pid_t pid) {
2827 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2828 int r;
2829
2830 assert(pid > 1);
2831
2832 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2833 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2834 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2835 if (r < 0)
2836 return log_error_errno(r, "Failed to write UID map: %m");
2837
2838 /* We always assign the same UID and GID ranges */
2839 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2840 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to write GID map: %m");
2843
2844 return 0;
2845}
2846
9c1e04d0 2847static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
2848 char buf[NOTIFY_BUFFER_MAX+1];
2849 char *p = NULL;
2850 struct iovec iovec = {
2851 .iov_base = buf,
2852 .iov_len = sizeof(buf)-1,
2853 };
2854 union {
2855 struct cmsghdr cmsghdr;
2856 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
2857 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
2858 } control = {};
2859 struct msghdr msghdr = {
2860 .msg_iov = &iovec,
2861 .msg_iovlen = 1,
2862 .msg_control = &control,
2863 .msg_controllen = sizeof(control),
2864 };
2865 struct cmsghdr *cmsg;
2866 struct ucred *ucred = NULL;
2867 ssize_t n;
2868 pid_t inner_child_pid;
2869 _cleanup_strv_free_ char **tags = NULL;
2870
2871 assert(userdata);
2872
2873 inner_child_pid = PTR_TO_PID(userdata);
2874
2875 if (revents != EPOLLIN) {
2876 log_warning("Got unexpected poll event for notify fd.");
2877 return 0;
2878 }
2879
2880 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
2881 if (n < 0) {
3742095b 2882 if (IN_SET(errno, EAGAIN, EINTR))
9c1e04d0
AP
2883 return 0;
2884
2885 return log_warning_errno(errno, "Couldn't read notification socket: %m");
2886 }
2887 cmsg_close_all(&msghdr);
2888
2889 CMSG_FOREACH(cmsg, &msghdr) {
2890 if (cmsg->cmsg_level == SOL_SOCKET &&
2891 cmsg->cmsg_type == SCM_CREDENTIALS &&
2892 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
2893
2894 ucred = (struct ucred*) CMSG_DATA(cmsg);
2895 }
2896 }
2897
2898 if (!ucred || ucred->pid != inner_child_pid) {
8cb57430 2899 log_debug("Received notify message without valid credentials. Ignoring.");
9c1e04d0
AP
2900 return 0;
2901 }
2902
2903 if ((size_t) n >= sizeof(buf)) {
2904 log_warning("Received notify message exceeded maximum size. Ignoring.");
2905 return 0;
2906 }
2907
2908 buf[n] = 0;
2909 tags = strv_split(buf, "\n\r");
2910 if (!tags)
2911 return log_oom();
2912
2913 if (strv_find(tags, "READY=1"))
2914 sd_notifyf(false, "READY=1\n");
2915
2916 p = strv_find_startswith(tags, "STATUS=");
2917 if (p)
2918 sd_notifyf(false, "STATUS=Container running: %s", p);
2919
2920 return 0;
2921}
2922
5773024d 2923static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
9c1e04d0 2924 int r;
9c1e04d0 2925
5773024d 2926 r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
9c1e04d0
AP
2927 if (r < 0)
2928 return log_error_errno(r, "Failed to allocate notify event source: %m");
2929
5773024d 2930 (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
9c1e04d0
AP
2931
2932 return 0;
2933}
2934
f757855e
LP
2935static int load_settings(void) {
2936 _cleanup_(settings_freep) Settings *settings = NULL;
2937 _cleanup_fclose_ FILE *f = NULL;
2938 _cleanup_free_ char *p = NULL;
2939 const char *fn, *i;
2940 int r;
2941
2942 /* If all settings are masked, there's no point in looking for
2943 * the settings file */
2944 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2945 return 0;
2946
2947 fn = strjoina(arg_machine, ".nspawn");
2948
2949 /* We first look in the admin's directories in /etc and /run */
2950 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2951 _cleanup_free_ char *j = NULL;
2952
605405c6 2953 j = strjoin(i, "/", fn);
f757855e
LP
2954 if (!j)
2955 return log_oom();
2956
2957 f = fopen(j, "re");
2958 if (f) {
2959 p = j;
2960 j = NULL;
2961
b938cb90 2962 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2963 if (arg_settings_trusted < 0)
2964 arg_settings_trusted = true;
2965
2966 break;
2967 }
2968
2969 if (errno != ENOENT)
2970 return log_error_errno(errno, "Failed to open %s: %m", j);
2971 }
2972
2973 if (!f) {
2974 /* After that, let's look for a file next to the
2975 * actual image we shall boot. */
2976
2977 if (arg_image) {
2978 p = file_in_same_dir(arg_image, fn);
2979 if (!p)
2980 return log_oom();
2981 } else if (arg_directory) {
2982 p = file_in_same_dir(arg_directory, fn);
2983 if (!p)
2984 return log_oom();
2985 }
2986
2987 if (p) {
2988 f = fopen(p, "re");
2989 if (!f && errno != ENOENT)
2990 return log_error_errno(errno, "Failed to open %s: %m", p);
2991
b938cb90 2992 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2993 if (arg_settings_trusted < 0)
2994 arg_settings_trusted = false;
2995 }
2996 }
2997
2998 if (!f)
2999 return 0;
3000
3001 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3002
3003 r = settings_load(f, p, &settings);
3004 if (r < 0)
3005 return r;
3006
3007 /* Copy over bits from the settings, unless they have been
3008 * explicitly masked by command line switches. */
3009
7732f92b
LP
3010 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3011 settings->start_mode >= 0) {
3012 arg_start_mode = settings->start_mode;
f757855e
LP
3013
3014 strv_free(arg_parameters);
3015 arg_parameters = settings->parameters;
3016 settings->parameters = NULL;
3017 }
3018
b53ede69
PW
3019 if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
3020 settings->pivot_root_new) {
3021 free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
3022 free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
3023 }
3024
5f932eb9
LP
3025 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3026 settings->working_directory) {
3027 free(arg_chdir);
3028 arg_chdir = settings->working_directory;
3029 settings->working_directory = NULL;
3030 }
3031
f757855e
LP
3032 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3033 settings->environment) {
3034 strv_free(arg_setenv);
3035 arg_setenv = settings->environment;
3036 settings->environment = NULL;
3037 }
3038
3039 if ((arg_settings_mask & SETTING_USER) == 0 &&
3040 settings->user) {
3041 free(arg_user);
3042 arg_user = settings->user;
3043 settings->user = NULL;
3044 }
3045
3046 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3047 uint64_t plus;
f757855e 3048
0e265674
LP
3049 plus = settings->capability;
3050 if (settings_private_network(settings))
3051 plus |= (1ULL << CAP_NET_ADMIN);
3052
3053 if (!arg_settings_trusted && plus != 0) {
3054 if (settings->capability != 0)
3055 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3056 } else
520e0d54 3057 arg_caps_retain |= plus;
f757855e 3058
520e0d54 3059 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3060 }
3061
3062 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3063 settings->kill_signal > 0)
3064 arg_kill_signal = settings->kill_signal;
3065
3066 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3067 settings->personality != PERSONALITY_INVALID)
3068 arg_personality = settings->personality;
3069
3070 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3071 !sd_id128_is_null(settings->machine_id)) {
3072
3073 if (!arg_settings_trusted)
3074 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3075 else
3076 arg_uuid = settings->machine_id;
3077 }
3078
3079 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3080 settings->read_only >= 0)
3081 arg_read_only = settings->read_only;
3082
3083 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3084 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3085 arg_volatile_mode = settings->volatile_mode;
3086
3087 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3088 settings->n_custom_mounts > 0) {
3089
3090 if (!arg_settings_trusted)
3091 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3092 else {
3093 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3094 arg_custom_mounts = settings->custom_mounts;
3095 arg_n_custom_mounts = settings->n_custom_mounts;
3096
3097 settings->custom_mounts = NULL;
3098 settings->n_custom_mounts = 0;
3099 }
3100 }
3101
3102 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3103 (settings->private_network >= 0 ||
3104 settings->network_veth >= 0 ||
3105 settings->network_bridge ||
22b28dfd 3106 settings->network_zone ||
f757855e
LP
3107 settings->network_interfaces ||
3108 settings->network_macvlan ||
f6d6bad1
LP
3109 settings->network_ipvlan ||
3110 settings->network_veth_extra)) {
f757855e
LP
3111
3112 if (!arg_settings_trusted)
3113 log_warning("Ignoring network settings, file %s is not trusted.", p);
3114 else {
f6d6bad1 3115 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3116 arg_private_network = settings_private_network(settings);
3117
f757855e
LP
3118 strv_free(arg_network_interfaces);
3119 arg_network_interfaces = settings->network_interfaces;
3120 settings->network_interfaces = NULL;
3121
3122 strv_free(arg_network_macvlan);
3123 arg_network_macvlan = settings->network_macvlan;
3124 settings->network_macvlan = NULL;
3125
3126 strv_free(arg_network_ipvlan);
3127 arg_network_ipvlan = settings->network_ipvlan;
3128 settings->network_ipvlan = NULL;
3129
f6d6bad1
LP
3130 strv_free(arg_network_veth_extra);
3131 arg_network_veth_extra = settings->network_veth_extra;
3132 settings->network_veth_extra = NULL;
3133
f757855e
LP
3134 free(arg_network_bridge);
3135 arg_network_bridge = settings->network_bridge;
3136 settings->network_bridge = NULL;
22b28dfd
LP
3137
3138 free(arg_network_zone);
3139 arg_network_zone = settings->network_zone;
3140 settings->network_zone = NULL;
f757855e
LP
3141 }
3142 }
3143
3144 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3145 settings->expose_ports) {
3146
3147 if (!arg_settings_trusted)
3148 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3149 else {
3150 expose_port_free_all(arg_expose_ports);
3151 arg_expose_ports = settings->expose_ports;
3152 settings->expose_ports = NULL;
3153 }
3154 }
3155
0de7acce
LP
3156 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3157 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3158
3159 if (!arg_settings_trusted)
3160 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3161 else {
3162 arg_userns_mode = settings->userns_mode;
3163 arg_uid_shift = settings->uid_shift;
3164 arg_uid_range = settings->uid_range;
3165 arg_userns_chown = settings->userns_chown;
3166 }
3167 }
3168
9c1e04d0
AP
3169 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3170 arg_notify_ready = settings->notify_ready;
3171
960e4569
LP
3172 if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
3173
3174 if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
3175 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
3176 else {
3177 strv_free(arg_syscall_whitelist);
3178 strv_free(arg_syscall_blacklist);
3179
3180 arg_syscall_whitelist = settings->syscall_whitelist;
3181 arg_syscall_blacklist = settings->syscall_blacklist;
3182
3183 settings->syscall_whitelist = settings->syscall_blacklist = NULL;
3184 }
3185 }
3186
f757855e
LP
3187 return 0;
3188}
3189
b0067625
ZJS
3190static int run(int master,
3191 const char* console,
2d845785 3192 DissectedImage *dissected_image,
b0067625
ZJS
3193 bool interactive,
3194 bool secondary,
3195 FDSet *fds,
3196 char veth_name[IFNAMSIZ], bool *veth_created,
3197 union in_addr_union *exposed,
3198 pid_t *pid, int *ret) {
3199
3200 static const struct sigaction sa = {
3201 .sa_handler = nop_signal_handler,
e28c7cd0 3202 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
b0067625
ZJS
3203 };
3204
3205 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3206 _cleanup_close_ int etc_passwd_lock = -1;
3207 _cleanup_close_pair_ int
3208 kmsg_socket_pair[2] = { -1, -1 },
3209 rtnl_socket_pair[2] = { -1, -1 },
3210 pid_socket_pair[2] = { -1, -1 },
3211 uuid_socket_pair[2] = { -1, -1 },
3212 notify_socket_pair[2] = { -1, -1 },
3213 uid_shift_socket_pair[2] = { -1, -1 };
3214 _cleanup_close_ int notify_socket= -1;
3215 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5773024d 3216 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
b0067625
ZJS
3217 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3218 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3219 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3220 ContainerStatus container_status = 0;
3221 char last_char = 0;
3222 int ifi = 0, r;
3223 ssize_t l;
3224 sigset_t mask_chld;
3225
3226 assert_se(sigemptyset(&mask_chld) == 0);
3227 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3228
3229 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3230 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3231 * check with getpwuid() if the specific user already exists. Note that /etc might be
3232 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3233 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3234 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3235 * really ours. */
3236
3237 etc_passwd_lock = take_etc_passwd_lock(NULL);
3238 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3239 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3240 }
3241
3242 r = barrier_create(&barrier);
3243 if (r < 0)
3244 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3245
3246 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3247 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3248
3249 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3250 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3251
3252 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3253 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3254
3255 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3256 return log_error_errno(errno, "Failed to create id socket pair: %m");
3257
3258 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3259 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3260
3261 if (arg_userns_mode != USER_NAMESPACE_NO)
3262 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3263 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3264
3265 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3266 * parent's blocking calls and give it a chance to call wait() and terminate. */
3267 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3268 if (r < 0)
3269 return log_error_errno(errno, "Failed to change the signal mask: %m");
3270
3271 r = sigaction(SIGCHLD, &sa, NULL);
3272 if (r < 0)
3273 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3274
3275 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3276 if (*pid < 0)
3277 return log_error_errno(errno, "clone() failed%s: %m",
3278 errno == EINVAL ?
3279 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3280
3281 if (*pid == 0) {
3282 /* The outer child only has a file system namespace. */
3283 barrier_set_role(&barrier, BARRIER_CHILD);
3284
3285 master = safe_close(master);
3286
3287 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3288 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3289 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3290 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3291 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3292 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3293
3294 (void) reset_all_signal_handlers();
3295 (void) reset_signal_mask();
3296
3297 r = outer_child(&barrier,
3298 arg_directory,
3299 console,
2d845785 3300 dissected_image,
b0067625
ZJS
3301 interactive,
3302 secondary,
3303 pid_socket_pair[1],
3304 uuid_socket_pair[1],
3305 notify_socket_pair[1],
3306 kmsg_socket_pair[1],
3307 rtnl_socket_pair[1],
3308 uid_shift_socket_pair[1],
3309 fds);
3310 if (r < 0)
3311 _exit(EXIT_FAILURE);
3312
3313 _exit(EXIT_SUCCESS);
3314 }
3315
3316 barrier_set_role(&barrier, BARRIER_PARENT);
3317
3318 fds = fdset_free(fds);
3319
3320 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3321 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3322 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3323 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3324 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3325 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3326
3327 if (arg_userns_mode != USER_NAMESPACE_NO) {
3328 /* The child just let us know the UID shift it might have read from the image. */
3329 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3330 if (l < 0)
3331 return log_error_errno(errno, "Failed to read UID shift: %m");
b0067625
ZJS
3332 if (l != sizeof arg_uid_shift) {
3333 log_error("Short read while reading UID shift.");
3334 return -EIO;
3335 }
3336
3337 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3338 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3339 * image, but if that's already in use, pick a new one, and report back to the child,
3340 * which one we now picked. */
3341
3342 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3343 if (r < 0)
3344 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3345
3346 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3347 if (l < 0)
3348 return log_error_errno(errno, "Failed to send UID shift: %m");
3349 if (l != sizeof arg_uid_shift) {
3350 log_error("Short write while writing UID shift.");
3351 return -EIO;
3352 }
3353 }
3354 }
3355
3356 /* Wait for the outer child. */
3357 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3358 if (r != 0)
3359 return r < 0 ? r : -EIO;
3360
3361 /* And now retrieve the PID of the inner child. */
3362 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3363 if (l < 0)
3364 return log_error_errno(errno, "Failed to read inner child PID: %m");
3365 if (l != sizeof *pid) {
3366 log_error("Short read while reading inner child PID.");
3367 return -EIO;
3368 }
3369
3370 /* We also retrieve container UUID in case it was generated by outer child */
3371 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3372 if (l < 0)
3373 return log_error_errno(errno, "Failed to read container machine ID: %m");
3374 if (l != sizeof(arg_uuid)) {
3375 log_error("Short read while reading container machined ID.");
3376 return -EIO;
3377 }
3378
3379 /* We also retrieve the socket used for notifications generated by outer child */
3380 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3381 if (notify_socket < 0)
3382 return log_error_errno(notify_socket,
3383 "Failed to receive notification socket from the outer child: %m");
3384
3385 log_debug("Init process invoked as PID "PID_FMT, *pid);
3386
3387 if (arg_userns_mode != USER_NAMESPACE_NO) {
3388 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3389 log_error("Child died too early.");
3390 return -ESRCH;
3391 }
3392
3393 r = setup_uid_map(*pid);
3394 if (r < 0)
3395 return r;
3396
3397 (void) barrier_place(&barrier); /* #2 */
3398 }
3399
3400 if (arg_private_network) {
3401
3402 r = move_network_interfaces(*pid, arg_network_interfaces);
3403 if (r < 0)
3404 return r;
3405
3406 if (arg_network_veth) {
3407 r = setup_veth(arg_machine, *pid, veth_name,
3408 arg_network_bridge || arg_network_zone);
3409 if (r < 0)
3410 return r;
3411 else if (r > 0)
3412 ifi = r;
3413
3414 if (arg_network_bridge) {
3415 /* Add the interface to a bridge */
3416 r = setup_bridge(veth_name, arg_network_bridge, false);
3417 if (r < 0)
3418 return r;
3419 if (r > 0)
3420 ifi = r;
3421 } else if (arg_network_zone) {
3422 /* Add the interface to a bridge, possibly creating it */
3423 r = setup_bridge(veth_name, arg_network_zone, true);
3424 if (r < 0)
3425 return r;
3426 if (r > 0)
3427 ifi = r;
3428 }
3429 }
3430
3431 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3432 if (r < 0)
3433 return r;
3434
3435 /* We created the primary and extra veth links now; let's remember this, so that we know to
3436 remove them later on. Note that we don't bother with removing veth links that were created
3437 here when their setup failed half-way, because in that case the kernel should be able to
3438 remove them on its own, since they cannot be referenced by anything yet. */
3439 *veth_created = true;
3440
3441 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3442 if (r < 0)
3443 return r;
3444
3445 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3446 if (r < 0)
3447 return r;
3448 }
3449
3450 if (arg_register) {
3451 r = register_machine(
3452 arg_machine,
3453 *pid,
3454 arg_directory,
3455 arg_uuid,
3456 ifi,
3457 arg_slice,
3458 arg_custom_mounts, arg_n_custom_mounts,
3459 arg_kill_signal,
3460 arg_property,
3461 arg_keep_unit,
3462 arg_container_service_name);
3463 if (r < 0)
3464 return r;
cd2dfc6f
LP
3465 } else if (!arg_keep_unit) {
3466 r = allocate_scope(
3467 arg_machine,
3468 *pid,
3469 arg_slice,
3470 arg_custom_mounts, arg_n_custom_mounts,
3471 arg_kill_signal,
3472 arg_property);
3473 if (r < 0)
3474 return r;
3475
3476 } else if (arg_slice || arg_property)
3477 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
b0067625 3478
f0bef277 3479 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3480 if (r < 0)
3481 return r;
3482
3483 if (arg_keep_unit) {
3484 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3485 if (r < 0)
3486 return r;
3487 }
3488
3489 r = chown_cgroup(*pid, arg_uid_shift);
3490 if (r < 0)
3491 return r;
3492
3493 /* Notify the child that the parent is ready with all
3494 * its setup (including cgroup-ification), and that
3495 * the child can now hand over control to the code to
3496 * run inside the container. */
3497 (void) barrier_place(&barrier); /* #3 */
3498
3499 /* Block SIGCHLD here, before notifying child.
3500 * process_pty() will handle it with the other signals. */
3501 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3502
3503 /* Reset signal to default */
3504 r = default_signals(SIGCHLD, -1);
3505 if (r < 0)
3506 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3507
3508 r = sd_event_new(&event);
3509 if (r < 0)
3510 return log_error_errno(r, "Failed to get default event source: %m");
3511
5773024d 3512 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
b0067625
ZJS
3513 if (r < 0)
3514 return r;
3515
3516 /* Let the child know that we are ready and wait that the child is completely ready now. */
3517 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3518 log_error("Child died too early.");
3519 return -ESRCH;
3520 }
3521
3522 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3523 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3524 etc_passwd_lock = safe_close(etc_passwd_lock);
3525
3526 sd_notifyf(false,
3527 "STATUS=Container running.\n"
3528 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3529 if (!arg_notify_ready)
3530 sd_notify(false, "READY=1\n");
3531
3532 if (arg_kill_signal > 0) {
3533 /* Try to kill the init system on SIGINT or SIGTERM */
3534 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3535 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3536 } else {
3537 /* Immediately exit */
3538 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3539 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3540 }
3541
6916b164
AU
3542 /* Exit when the child exits */
3543 sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
b0067625
ZJS
3544
3545 if (arg_expose_ports) {
3546 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3547 if (r < 0)
3548 return r;
3549
3550 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3551 }
3552
3553 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3554
3555 r = pty_forward_new(event, master,
3556 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3557 &forward);
3558 if (r < 0)
3559 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3560
3561 r = sd_event_loop(event);
3562 if (r < 0)
3563 return log_error_errno(r, "Failed to run event loop: %m");
3564
3565 pty_forward_get_last_char(forward, &last_char);
3566
3567 forward = pty_forward_free(forward);
3568
3569 if (!arg_quiet && last_char != '\n')
3570 putc('\n', stdout);
3571
3572 /* Kill if it is not dead yet anyway */
3573 if (arg_register && !arg_keep_unit)
3574 terminate_machine(*pid);
3575
3576 /* Normally redundant, but better safe than sorry */
c67b0082 3577 (void) kill(*pid, SIGKILL);
b0067625
ZJS
3578
3579 r = wait_for_container(*pid, &container_status);
3580 *pid = 0;
3581
3582 if (r < 0)
3583 /* We failed to wait for the container, or the container exited abnormally. */
3584 return r;
3585 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3586 /* r > 0 → The container exited with a non-zero status.
3587 * As a special case, we need to replace 133 with a different value,
3588 * because 133 is special-cased in the service file to reboot the container.
3589 * otherwise → The container exited with zero status and a reboot was not requested.
3590 */
2a49b612 3591 if (r == EXIT_FORCE_RESTART)
27e29a1e 3592 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3593 *ret = r;
b0067625
ZJS
3594 return 0; /* finito */
3595 }
3596
3597 /* CONTAINER_REBOOTED, loop again */
3598
3599 if (arg_keep_unit) {
3600 /* Special handling if we are running as a service: instead of simply
3601 * restarting the machine we want to restart the entire service, so let's
3602 * inform systemd about this with the special exit code 133. The service
3603 * file uses RestartForceExitStatus=133 so that this results in a full
3604 * nspawn restart. This is necessary since we might have cgroup parameters
3605 * set we want to have flushed out. */
2a49b612
ZJS
3606 *ret = EXIT_FORCE_RESTART;
3607 return 0; /* finito */
b0067625
ZJS
3608 }
3609
3610 expose_port_flush(arg_expose_ports, exposed);
3611
3612 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3613 *veth_created = false;
3614 return 1; /* loop again */
3615}
3616
03cfe0d5
LP
3617int main(int argc, char *argv[]) {
3618
2d845785
LP
3619 _cleanup_free_ char *console = NULL;
3620 _cleanup_close_ int master = -1;
03cfe0d5 3621 _cleanup_fdset_free_ FDSet *fds = NULL;
2d845785 3622 int r, n_fd_passed, ret = EXIT_SUCCESS;
5aa3eba5 3623 char veth_name[IFNAMSIZ] = "";
17cbb288 3624 bool secondary = false, remove_directory = false, remove_image = false;
03cfe0d5 3625 pid_t pid = 0;
03cfe0d5
LP
3626 union in_addr_union exposed = {};
3627 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
c67b0082
LP
3628 bool interactive, veth_created = false, remove_tmprootdir = false;
3629 char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
2d845785 3630 _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
18b5886e
LP
3631 _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
3632 _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
03cfe0d5
LP
3633
3634 log_parse_environment();
3635 log_open();
415fc41c 3636
7732f92b
LP
3637 /* Make sure rename_process() in the stub init process can work */
3638 saved_argv = argv;
3639 saved_argc = argc;
3640
03cfe0d5
LP
3641 r = parse_argv(argc, argv);
3642 if (r <= 0)
3643 goto finish;
3644
03cfe0d5
LP
3645 if (geteuid() != 0) {
3646 log_error("Need to be root.");
3647 r = -EPERM;
3648 goto finish;
3649 }
f757855e
LP
3650 r = determine_names();
3651 if (r < 0)
3652 goto finish;
3653
3654 r = load_settings();
3655 if (r < 0)
3656 goto finish;
3657
3658 r = verify_arguments();
3659 if (r < 0)
3660 goto finish;
03cfe0d5
LP
3661
3662 n_fd_passed = sd_listen_fds(false);
3663 if (n_fd_passed > 0) {
3664 r = fdset_new_listen_fds(&fds, false);
3665 if (r < 0) {
3666 log_error_errno(r, "Failed to collect file descriptors: %m");
3667 goto finish;
3668 }
3669 }
3670
3671 if (arg_directory) {
3672 assert(!arg_image);
3673
3674 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3675 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3676 r = -EINVAL;
3677 goto finish;
3678 }
3679
3680 if (arg_ephemeral) {
3681 _cleanup_free_ char *np = NULL;
3682
8d4aa2bb 3683 r = chase_symlinks_and_update(&arg_directory, 0);
3f342ec4
LP
3684 if (r < 0)
3685 goto finish;
3686
03cfe0d5
LP
3687 /* If the specified path is a mount point we
3688 * generate the new snapshot immediately
3689 * inside it under a random name. However if
3690 * the specified is not a mount point we
3691 * create the new snapshot in the parent
3692 * directory, just next to it. */
e1873695 3693 r = path_is_mount_point(arg_directory, NULL, 0);
03cfe0d5
LP
3694 if (r < 0) {
3695 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3696 goto finish;
3697 }
3698 if (r > 0)
770b5ce4 3699 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3700 else
770b5ce4 3701 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5 3702 if (r < 0) {
0f3be6ca 3703 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
03cfe0d5
LP
3704 goto finish;
3705 }
3706
3707 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3708 if (r < 0) {
3709 log_error_errno(r, "Failed to lock %s: %m", np);
3710 goto finish;
3711 }
3712
17cbb288
LP
3713 r = btrfs_subvol_snapshot(arg_directory, np,
3714 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3715 BTRFS_SNAPSHOT_FALLBACK_COPY |
3716 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3717 BTRFS_SNAPSHOT_RECURSIVE |
3718 BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3719 if (r < 0) {
3720 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3721 goto finish;
ec16945e
LP
3722 }
3723
3724 free(arg_directory);
3725 arg_directory = np;
8a16a7b4 3726 np = NULL;
ec16945e 3727
17cbb288 3728 remove_directory = true;
30535c16
LP
3729
3730 } else {
cb638b5e 3731 r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
8d4aa2bb
LP
3732 if (r < 0)
3733 goto finish;
3734
30535c16
LP
3735 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3736 if (r == -EBUSY) {
3737 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3738 goto finish;
3739 }
3740 if (r < 0) {
3741 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3742 goto finish;
30535c16
LP
3743 }
3744
3745 if (arg_template) {
8d4aa2bb 3746 r = chase_symlinks_and_update(&arg_template, 0);
3f342ec4
LP
3747 if (r < 0)
3748 goto finish;
3749
17cbb288
LP
3750 r = btrfs_subvol_snapshot(arg_template, arg_directory,
3751 (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
3752 BTRFS_SNAPSHOT_FALLBACK_COPY |
3753 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3754 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
3755 BTRFS_SNAPSHOT_RECURSIVE |
3756 BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3757 if (r == -EEXIST) {
3758 if (!arg_quiet)
3759 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3760 } else if (r < 0) {
83521414 3761 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3762 goto finish;
3763 } else {
3764 if (!arg_quiet)
3765 log_info("Populated %s from template %s.", arg_directory, arg_template);
3766 }
3767 }
ec16945e
LP
3768 }
3769
7732f92b 3770 if (arg_start_mode == START_BOOT) {
1b9e5b12 3771 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3772 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3773 r = -EINVAL;
1b9e5b12
LP
3774 goto finish;
3775 }
3776 } else {
3777 const char *p;
3778
16fb773e
LP
3779 p = strjoina(arg_directory, "/usr/");
3780 if (laccess(p, F_OK) < 0) {
3781 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3782 r = -EINVAL;
1b9e5b12 3783 goto finish;
1b9e5b12
LP
3784 }
3785 }
ec16945e 3786
6b9132a9 3787 } else {
ec16945e
LP
3788 assert(arg_image);
3789 assert(!arg_template);
3790
8d4aa2bb 3791 r = chase_symlinks_and_update(&arg_image, 0);
3f342ec4
LP
3792 if (r < 0)
3793 goto finish;
3794
0f3be6ca
LP
3795 if (arg_ephemeral) {
3796 _cleanup_free_ char *np = NULL;
3797
3798 r = tempfn_random(arg_image, "machine.", &np);
3799 if (r < 0) {
3800 log_error_errno(r, "Failed to generate name for image snapshot: %m");
3801 goto finish;
3802 }
3803
3804 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3805 if (r < 0) {
3806 r = log_error_errno(r, "Failed to create image lock: %m");
3807 goto finish;
3808 }
3809
1c876927 3810 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
0f3be6ca
LP
3811 if (r < 0) {
3812 r = log_error_errno(r, "Failed to copy image file: %m");
3813 goto finish;
3814 }
3815
3816 free(arg_image);
3817 arg_image = np;
3818 np = NULL;
3819
3820 remove_image = true;
3821 } else {
3822 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3823 if (r == -EBUSY) {
3824 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3825 goto finish;
3826 }
3827 if (r < 0) {
3828 r = log_error_errno(r, "Failed to create image lock: %m");
3829 goto finish;
3830 }
4623e8e6 3831
78ebe980
LP
3832 if (!arg_root_hash) {
3833 r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size);
3834 if (r < 0) {
3835 log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image);
3836 goto finish;
3837 }
3838 }
30535c16
LP
3839 }
3840
c67b0082 3841 if (!mkdtemp(tmprootdir)) {
0f3be6ca 3842 r = log_error_errno(errno, "Failed to create temporary directory: %m");
6b9132a9 3843 goto finish;
1b9e5b12 3844 }
6b9132a9 3845
c67b0082
LP
3846 remove_tmprootdir = true;
3847
3848 arg_directory = strdup(tmprootdir);
1b9e5b12
LP
3849 if (!arg_directory) {
3850 r = log_oom();
3851 goto finish;
6b9132a9 3852 }
88213476 3853
2d845785
LP
3854 r = loop_device_make_by_path(arg_image, arg_read_only ? O_RDONLY : O_RDWR, &loop);
3855 if (r < 0) {
3856 log_error_errno(r, "Failed to set up loopback block device: %m");
842f3b0f
LP
3857 goto finish;
3858 }
1b9e5b12 3859
e0f9e7bd
LP
3860 r = dissect_image(
3861 loop->fd,
3862 arg_root_hash, arg_root_hash_size,
3863 DISSECT_IMAGE_REQUIRE_ROOT,
3864 &dissected_image);
2d845785
LP
3865 if (r == -ENOPKG) {
3866 log_error_errno(r, "Could not find a suitable file system or partition table in image: %s", arg_image);
3867
3868 log_notice("Note that the disk image needs to\n"
3869 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
3870 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
3871 " c) or follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"
3872 " d) or contain a file system without a partition table\n"
3873 "in order to be bootable with systemd-nspawn.");
1b9e5b12 3874 goto finish;
2d845785 3875 }
4623e8e6
LP
3876 if (r == -EADDRNOTAVAIL) {
3877 log_error_errno(r, "No root partition for specified root hash found.");
3878 goto finish;
3879 }
2d845785
LP
3880 if (r == -EOPNOTSUPP) {
3881 log_error_errno(r, "--image= is not supported, compiled without blkid support.");
3882 goto finish;
3883 }
759aaedc
LP
3884 if (r == -EPROTONOSUPPORT) {
3885 log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on.");
3886 goto finish;
3887 }
2d845785
LP
3888 if (r < 0) {
3889 log_error_errno(r, "Failed to dissect image: %m");
842f3b0f
LP
3890 goto finish;
3891 }
1b9e5b12 3892
4623e8e6
LP
3893 if (!arg_root_hash && dissected_image->can_verity)
3894 log_notice("Note: image %s contains verity information, but no root hash specified! Proceeding without integrity checking.", arg_image);
3895
3896 r = dissected_image_decrypt_interactively(dissected_image, NULL, arg_root_hash, arg_root_hash_size, 0, &decrypted_image);
1b9e5b12
LP
3897 if (r < 0)
3898 goto finish;
0f3be6ca
LP
3899
3900 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
3901 if (remove_image && unlink(arg_image) >= 0)
3902 remove_image = false;
842f3b0f 3903 }
842f3b0f 3904
86c0dd4a 3905 r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5a8af538
LP
3906 if (r < 0)
3907 goto finish;
bd15ab41
TH
3908
3909 r = detect_unified_cgroup_hierarchy(arg_directory);
3910 if (r < 0)
3911 goto finish;
5a8af538 3912
03cfe0d5
LP
3913 interactive =
3914 isatty(STDIN_FILENO) > 0 &&
3915 isatty(STDOUT_FILENO) > 0;
9c857b9d 3916
db7feb7e
LP
3917 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3918 if (master < 0) {
ec16945e 3919 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3920 goto finish;
3921 }
3922
611b312b
LP
3923 r = ptsname_malloc(master, &console);
3924 if (r < 0) {
3925 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3926 goto finish;
68b02049
DW
3927 }
3928
3929 if (arg_selinux_apifs_context) {
3930 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3931 if (r < 0)
3932 goto finish;
a258bf26
LP
3933 }
3934
a258bf26 3935 if (unlockpt(master) < 0) {
ec16945e 3936 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3937 goto finish;
3938 }
3939
9c857b9d
LP
3940 if (!arg_quiet)
3941 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3942 arg_machine, arg_image ?: arg_directory);
3943
72c0a2c2 3944 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3945
03cfe0d5
LP
3946 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3947 r = log_error_errno(errno, "Failed to become subreaper: %m");
3948 goto finish;
3949 }
3950
d87be9b0 3951 for (;;) {
b0067625
ZJS
3952 r = run(master,
3953 console,
2d845785 3954 dissected_image,
b0067625
ZJS
3955 interactive, secondary,
3956 fds,
3957 veth_name, &veth_created,
3958 &exposed,
3959 &pid, &ret);
3960 if (r <= 0)
d87be9b0 3961 break;
d87be9b0 3962 }
88213476
LP
3963
3964finish:
af4ec430 3965 sd_notify(false,
2a49b612
ZJS
3966 r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
3967 "STOPPING=1\nSTATUS=Terminating...");
af4ec430 3968
9444b1f2 3969 if (pid > 0)
c67b0082 3970 (void) kill(pid, SIGKILL);
88213476 3971
503546da 3972 /* Try to flush whatever is still queued in the pty */
6a0f896b 3973 if (master >= 0) {
1c876927 3974 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, 0);
6a0f896b
LP
3975 master = safe_close(master);
3976 }
3977
3978 if (pid > 0)
3979 (void) wait_for_terminate(pid, NULL);
503546da 3980
17cbb288 3981 if (remove_directory && arg_directory) {
ec16945e
LP
3982 int k;
3983
17cbb288 3984 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
ec16945e 3985 if (k < 0)
17cbb288 3986 log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
ec16945e
LP
3987 }
3988
0f3be6ca
LP
3989 if (remove_image && arg_image) {
3990 if (unlink(arg_image) < 0)
3991 log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
3992 }
3993
c67b0082
LP
3994 if (remove_tmprootdir) {
3995 if (rmdir(tmprootdir) < 0)
3996 log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
3997 }
3998
785890ac
LP
3999 if (arg_machine) {
4000 const char *p;
4001
63c372cb 4002 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4003 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4004 }
4005
7a8f6325 4006 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4007
4008 if (veth_created)
4009 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4010 (void) remove_bridge(arg_network_zone);
f757855e 4011
04d391da 4012 free(arg_directory);
ec16945e
LP
4013 free(arg_template);
4014 free(arg_image);
7027ff61 4015 free(arg_machine);
c74e630d 4016 free(arg_user);
b53ede69
PW
4017 free(arg_pivot_root_new);
4018 free(arg_pivot_root_old);
5f932eb9 4019 free(arg_chdir);
c74e630d 4020 strv_free(arg_setenv);
f757855e 4021 free(arg_network_bridge);
c74e630d
LP
4022 strv_free(arg_network_interfaces);
4023 strv_free(arg_network_macvlan);
4bbfe7ad 4024 strv_free(arg_network_ipvlan);
f6d6bad1 4025 strv_free(arg_network_veth_extra);
f757855e
LP
4026 strv_free(arg_parameters);
4027 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4028 expose_port_free_all(arg_expose_ports);
4623e8e6 4029 free(arg_root_hash);
6d0b55c2 4030
ec16945e 4031 return r < 0 ? EXIT_FAILURE : ret;
88213476 4032}