]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Rename formats-util.h to format-util.h
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
f97b34a6 60#include "format-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
0e7ac751 103/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
0e7ac751
LP
106#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 108
9c1e04d0
AP
/* nspawn listens on a socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the
 * container. The init process in the container can send messages to nspawn over this socket, following the
 * sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
113cea80
DH
/* Outcome of a container run. NOTE(review): the consumers of this enum are not visible in this part of the
 * file; judging by the names it distinguishes a plain exit from a reboot request — confirm at the call sites. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* first enumerator, value 0 */
        CONTAINER_REBOOTED      /* value 1 */
} ContainerStatus;
118
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-guest"/"try-host" spellings map to
 * LINK_GUEST/LINK_HOST as well; the "try" part is tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default for arg_link_journal */
        LINK_HOST,      /* --link-journal=host or try-host */
        LINK_GUEST      /* --link-journal=guest or try-guest, also -j */
} LinkJournal;
88213476
LP
125
126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
687d0825 129static char *arg_user = NULL;
9444b1f2 130static sd_id128_t arg_uuid = {};
7027ff61 131static char *arg_machine = NULL;
c74e630d
LP
132static const char *arg_selinux_context = NULL;
133static const char *arg_selinux_apifs_context = NULL;
9444b1f2 134static const char *arg_slice = NULL;
ff01d048 135static bool arg_private_network = false;
bc2f673e 136static bool arg_read_only = false;
7732f92b 137static StartMode arg_start_mode = START_PID1;
ec16945e 138static bool arg_ephemeral = false;
57fb9fb5 139static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 140static bool arg_link_journal_try = false;
520e0d54 141static uint64_t arg_caps_retain =
50b52222
LP
142 (1ULL << CAP_AUDIT_CONTROL) |
143 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
144 (1ULL << CAP_CHOWN) |
145 (1ULL << CAP_DAC_OVERRIDE) |
146 (1ULL << CAP_DAC_READ_SEARCH) |
147 (1ULL << CAP_FOWNER) |
148 (1ULL << CAP_FSETID) |
149 (1ULL << CAP_IPC_OWNER) |
150 (1ULL << CAP_KILL) |
151 (1ULL << CAP_LEASE) |
152 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 153 (1ULL << CAP_MKNOD) |
5076f0cc
LP
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
5076f0cc 157 (1ULL << CAP_SETFCAP) |
50b52222 158 (1ULL << CAP_SETGID) |
5076f0cc
LP
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
50b52222 162 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
163 (1ULL << CAP_SYS_CHROOT) |
164 (1ULL << CAP_SYS_NICE) |
165 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 166 (1ULL << CAP_SYS_RESOURCE) |
50b52222 167 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
168static CustomMount *arg_custom_mounts = NULL;
169static unsigned arg_n_custom_mounts = 0;
f4889f65 170static char **arg_setenv = NULL;
284c0b91 171static bool arg_quiet = false;
eb91eb18 172static bool arg_register = true;
89f7c846 173static bool arg_keep_unit = false;
aa28aefe 174static char **arg_network_interfaces = NULL;
c74e630d 175static char **arg_network_macvlan = NULL;
4bbfe7ad 176static char **arg_network_ipvlan = NULL;
69c79d3c 177static bool arg_network_veth = false;
f6d6bad1 178static char **arg_network_veth_extra = NULL;
f757855e 179static char *arg_network_bridge = NULL;
22b28dfd 180static char *arg_network_zone = NULL;
050f7277 181static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 182static char *arg_image = NULL;
f757855e 183static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 184static ExposePort *arg_expose_ports = NULL;
f36933fe 185static char **arg_property = NULL;
0de7acce 186static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 187static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 188static bool arg_userns_chown = false;
c6c8f6e2 189static int arg_kill_signal = 0;
5da38d07 190static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
191static SettingsMask arg_settings_mask = 0;
192static int arg_settings_trusted = -1;
193static char **arg_parameters = NULL;
6aadfa4c 194static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 195static bool arg_notify_ready = false;
5a8ff0e6 196static bool arg_use_cgns = true;
0c582db0 197static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
88213476 198
/* Prints the command line synopsis and the list of supported options to standard output. The program name
 * shown in the first line is taken from program_invocation_short_name. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " --template=PATH Initialize root directory from template directory,\n"
               " if missing\n"
               " -x --ephemeral Run container with snapshot of root directory, and\n"
               " remove it after exit\n"
               " -i --image=PATH File system device or disk image for the container\n"
               " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " --chdir=PATH Set working directory in the container\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --property=NAME=VALUE Set scope unit property\n"
               " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
               " --private-users[=UIDBASE[:NUIDS]]\n"
               " Similar, but with user configured UID/GID range\n"
               " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-ipvlan=INTERFACE\n"
               " Create a ipvlan network interface based on an\n"
               " existing network interface to the container\n"
               " -n --network-veth Add a virtual Ethernet connection between host\n"
               " and container\n"
               " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
               " Add an additional virtual Ethernet link between\n"
               " host and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual Ethernet connection to the container\n"
               " and attach it to an existing bridge on the host\n"
               /* The article "an" used to be printed twice here, once at the end of this line and once
                * at the start of the next one. */
               " --network-zone=NAME Similar, but attach the new interface to\n"
               " an automatically managed bridge interface\n"
               " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               " Expose a container IP port on the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
               " host, try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH[:OPTIONS]]\n"
               " Bind mount a file or directory from the host into\n"
               " the container\n"
               /* Same syntax as --bind= above; the closing bracket of the outer optional part used to be
                * missing. */
               " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
               " Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --overlay=PATH[:PATH...]:PATH\n"
               " Create an overlay mount from the host to \n"
               " the container\n"
               " --overlay-ro=PATH[:PATH...]:PATH\n"
               " Similar, but creates a read-only overlay mount\n"
               " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " --volatile[=MODE] Run the system in volatile mode\n"
               " --settings=BOOLEAN Load additional settings from .nspawn file\n"
               " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
               , program_invocation_short_name);
}
279
5a8af538
LP
280static int custom_mounts_prepare(void) {
281 unsigned i;
282 int r;
283
284 /* Ensure the mounts are applied prefix first. */
285 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
286
287 /* Allocate working directories for the overlay file systems that need it */
288 for (i = 0; i < arg_n_custom_mounts; i++) {
289 CustomMount *m = &arg_custom_mounts[i];
290
0de7acce 291 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
292
293 if (arg_userns_chown) {
294 log_error("--private-users-chown may not be combined with custom root mounts.");
295 return -EINVAL;
296 } else if (arg_uid_shift == UID_INVALID) {
297 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
298 return -EINVAL;
299 }
825d5287
RM
300 }
301
5a8af538
LP
302 if (m->type != CUSTOM_MOUNT_OVERLAY)
303 continue;
304
305 if (m->work_dir)
306 continue;
307
308 if (m->read_only)
309 continue;
310
14bcf25c 311 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
312 if (r < 0)
313 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
314 }
315
316 return 0;
317}
318
0fd9563f 319static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 320 const char *e;
5da38d07
TH
321 int r, all_unified, systemd_unified;
322
efdb0237
LP
323 /* Allow the user to control whether the unified hierarchy is used */
324 e = getenv("UNIFIED_CGROUP_HIERARCHY");
325 if (e) {
326 r = parse_boolean(e);
327 if (r < 0)
328 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
329 if (r > 0)
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
331 else
332 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 333
efdb0237
LP
334 return 0;
335 }
336
98afd6af
ZJS
337 all_unified = cg_all_unified();
338 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
339
340 if (all_unified < 0 || systemd_unified < 0)
341 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
342 "Failed to determine whether the unified cgroups hierarchy is used: %m");
343
efdb0237 344 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
345 if (all_unified > 0) {
346 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
347 * routine only detects 231, so we'll have a false negative here for 230. */
348 r = systemd_installation_has_version(directory, 230);
349 if (r < 0)
350 return log_error_errno(r, "Failed to determine systemd version in container: %m");
351 if (r > 0)
352 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
353 else
354 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
355 } else if (systemd_unified > 0) {
356 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
357 r = systemd_installation_has_version(directory, 232);
358 if (r < 0)
359 return log_error_errno(r, "Failed to determine systemd version in container: %m");
360 if (r > 0)
361 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
362 else
363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
364 } else
5da38d07 365 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 366
efdb0237
LP
367 return 0;
368}
369
0c582db0
LB
370static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
371 int r;
372
373 r = getenv_bool(name);
374 if (r == -ENXIO)
375 return;
376 if (r < 0)
377 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
378 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
379}
380
88213476
LP
381static int parse_argv(int argc, char *argv[]) {
382
a41fe3a2 383 enum {
acbeb427
ZJS
384 ARG_VERSION = 0x100,
385 ARG_PRIVATE_NETWORK,
bc2f673e 386 ARG_UUID,
5076f0cc 387 ARG_READ_ONLY,
57fb9fb5 388 ARG_CAPABILITY,
420c7379 389 ARG_DROP_CAPABILITY,
17fe0523
LP
390 ARG_LINK_JOURNAL,
391 ARG_BIND,
f4889f65 392 ARG_BIND_RO,
06c17c39 393 ARG_TMPFS,
5a8af538
LP
394 ARG_OVERLAY,
395 ARG_OVERLAY_RO,
eb91eb18 396 ARG_SHARE_SYSTEM,
89f7c846 397 ARG_REGISTER,
aa28aefe 398 ARG_KEEP_UNIT,
69c79d3c 399 ARG_NETWORK_INTERFACE,
c74e630d 400 ARG_NETWORK_MACVLAN,
4bbfe7ad 401 ARG_NETWORK_IPVLAN,
ab046dde 402 ARG_NETWORK_BRIDGE,
22b28dfd 403 ARG_NETWORK_ZONE,
f6d6bad1 404 ARG_NETWORK_VETH_EXTRA,
6afc95b7 405 ARG_PERSONALITY,
4d9f07b4 406 ARG_VOLATILE,
ec16945e 407 ARG_TEMPLATE,
f36933fe 408 ARG_PROPERTY,
6dac160c 409 ARG_PRIVATE_USERS,
c6c8f6e2 410 ARG_KILL_SIGNAL,
f757855e 411 ARG_SETTINGS,
5f932eb9 412 ARG_CHDIR,
7336138e 413 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 414 ARG_NOTIFY_READY,
a41fe3a2
LP
415 };
416
88213476 417 static const struct option options[] = {
27eb8e90
ZJS
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "as-pid2", no_argument, NULL, 'a' },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, 'E' },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
449 { "network-veth", no_argument, NULL, 'n' },
450 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
451 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
452 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
453 { "personality", required_argument, NULL, ARG_PERSONALITY },
454 { "image", required_argument, NULL, 'i' },
455 { "volatile", optional_argument, NULL, ARG_VOLATILE },
456 { "port", required_argument, NULL, 'p' },
457 { "property", required_argument, NULL, ARG_PROPERTY },
458 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
459 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
460 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
461 { "settings", required_argument, NULL, ARG_SETTINGS },
462 { "chdir", required_argument, NULL, ARG_CHDIR },
463 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 464 {}
88213476
LP
465 };
466
9444b1f2 467 int c, r;
6aadfa4c 468 const char *p, *e;
a42c8b54 469 uint64_t plus = 0, minus = 0;
f757855e 470 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
471
472 assert(argc >= 0);
473 assert(argv);
474
19aac838 475 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
476
477 switch (c) {
478
479 case 'h':
601185b4
ZJS
480 help();
481 return 0;
88213476 482
acbeb427 483 case ARG_VERSION:
3f6fd1ba 484 return version();
acbeb427 485
88213476 486 case 'D':
0f03c2a4 487 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 488 if (r < 0)
0f03c2a4 489 return r;
ec16945e
LP
490 break;
491
492 case ARG_TEMPLATE:
0f03c2a4 493 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 494 if (r < 0)
0f03c2a4 495 return r;
88213476
LP
496 break;
497
1b9e5b12 498 case 'i':
0f03c2a4 499 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 500 if (r < 0)
0f03c2a4 501 return r;
ec16945e
LP
502 break;
503
504 case 'x':
505 arg_ephemeral = true;
1b9e5b12
LP
506 break;
507
687d0825 508 case 'u':
2fc09a9c
DM
509 r = free_and_strdup(&arg_user, optarg);
510 if (r < 0)
7027ff61 511 return log_oom();
687d0825 512
f757855e 513 arg_settings_mask |= SETTING_USER;
687d0825
MV
514 break;
515
22b28dfd
LP
516 case ARG_NETWORK_ZONE: {
517 char *j;
518
519 j = strappend("vz-", optarg);
520 if (!j)
521 return log_oom();
522
523 if (!ifname_valid(j)) {
524 log_error("Network zone name not valid: %s", j);
525 free(j);
526 return -EINVAL;
527 }
528
529 free(arg_network_zone);
530 arg_network_zone = j;
531
532 arg_network_veth = true;
533 arg_private_network = true;
534 arg_settings_mask |= SETTING_NETWORK;
535 break;
536 }
537
ab046dde 538 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
539
540 if (!ifname_valid(optarg)) {
541 log_error("Bridge interface name not valid: %s", optarg);
542 return -EINVAL;
543 }
544
f757855e
LP
545 r = free_and_strdup(&arg_network_bridge, optarg);
546 if (r < 0)
547 return log_oom();
ab046dde
TG
548
549 /* fall through */
550
0dfaa006 551 case 'n':
69c79d3c
LP
552 arg_network_veth = true;
553 arg_private_network = true;
f757855e 554 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
555 break;
556
f6d6bad1
LP
557 case ARG_NETWORK_VETH_EXTRA:
558 r = veth_extra_parse(&arg_network_veth_extra, optarg);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
561
562 arg_private_network = true;
563 arg_settings_mask |= SETTING_NETWORK;
564 break;
565
aa28aefe 566 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
567
568 if (!ifname_valid(optarg)) {
569 log_error("Network interface name not valid: %s", optarg);
570 return -EINVAL;
571 }
572
c74e630d
LP
573 if (strv_extend(&arg_network_interfaces, optarg) < 0)
574 return log_oom();
575
576 arg_private_network = true;
f757855e 577 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
578 break;
579
580 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
581
582 if (!ifname_valid(optarg)) {
583 log_error("MACVLAN network interface name not valid: %s", optarg);
584 return -EINVAL;
585 }
586
c74e630d 587 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
588 return log_oom();
589
4bbfe7ad 590 arg_private_network = true;
f757855e 591 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
592 break;
593
594 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
595
596 if (!ifname_valid(optarg)) {
597 log_error("IPVLAN network interface name not valid: %s", optarg);
598 return -EINVAL;
599 }
600
4bbfe7ad
TG
601 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
602 return log_oom();
603
aa28aefe
LP
604 /* fall through */
605
ff01d048
LP
606 case ARG_PRIVATE_NETWORK:
607 arg_private_network = true;
f757855e 608 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
609 break;
610
0f0dbc46 611 case 'b':
7732f92b
LP
612 if (arg_start_mode == START_PID2) {
613 log_error("--boot and --as-pid2 may not be combined.");
614 return -EINVAL;
615 }
616
617 arg_start_mode = START_BOOT;
618 arg_settings_mask |= SETTING_START_MODE;
619 break;
620
621 case 'a':
622 if (arg_start_mode == START_BOOT) {
623 log_error("--boot and --as-pid2 may not be combined.");
624 return -EINVAL;
625 }
626
627 arg_start_mode = START_PID2;
628 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
629 break;
630
144f0fc0 631 case ARG_UUID:
9444b1f2 632 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
633 if (r < 0)
634 return log_error_errno(r, "Invalid UUID: %s", optarg);
635
636 if (sd_id128_is_null(arg_uuid)) {
637 log_error("Machine UUID may not be all zeroes.");
638 return -EINVAL;
aa96c6cb 639 }
f757855e
LP
640
641 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 642 break;
aa96c6cb 643
9444b1f2 644 case 'S':
c74e630d 645 arg_slice = optarg;
144f0fc0
LP
646 break;
647
7027ff61 648 case 'M':
c1521918 649 if (isempty(optarg))
97b11eed 650 arg_machine = mfree(arg_machine);
c1521918 651 else {
0c3c4284 652 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
653 log_error("Invalid machine name: %s", optarg);
654 return -EINVAL;
655 }
7027ff61 656
0c3c4284
LP
657 r = free_and_strdup(&arg_machine, optarg);
658 if (r < 0)
eb91eb18
LP
659 return log_oom();
660
661 break;
662 }
7027ff61 663
82adf6af
LP
664 case 'Z':
665 arg_selinux_context = optarg;
a8828ed9
DW
666 break;
667
82adf6af
LP
668 case 'L':
669 arg_selinux_apifs_context = optarg;
a8828ed9
DW
670 break;
671
bc2f673e
LP
672 case ARG_READ_ONLY:
673 arg_read_only = true;
f757855e 674 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
675 break;
676
420c7379
LP
677 case ARG_CAPABILITY:
678 case ARG_DROP_CAPABILITY: {
6cbe4ed1 679 p = optarg;
9ed794a3 680 for (;;) {
6cbe4ed1 681 _cleanup_free_ char *t = NULL;
5076f0cc 682
6cbe4ed1
SS
683 r = extract_first_word(&p, &t, ",", 0);
684 if (r < 0)
685 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 686
6cbe4ed1
SS
687 if (r == 0)
688 break;
5076f0cc 689
39ed67d1
LP
690 if (streq(t, "all")) {
691 if (c == ARG_CAPABILITY)
a42c8b54 692 plus = (uint64_t) -1;
39ed67d1 693 else
a42c8b54 694 minus = (uint64_t) -1;
39ed67d1 695 } else {
2822da4f
LP
696 int cap;
697
698 cap = capability_from_name(t);
699 if (cap < 0) {
39ed67d1
LP
700 log_error("Failed to parse capability %s.", t);
701 return -EINVAL;
702 }
703
704 if (c == ARG_CAPABILITY)
a42c8b54 705 plus |= 1ULL << (uint64_t) cap;
39ed67d1 706 else
a42c8b54 707 minus |= 1ULL << (uint64_t) cap;
5076f0cc 708 }
5076f0cc
LP
709 }
710
f757855e 711 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
712 break;
713 }
714
57fb9fb5
LP
715 case 'j':
716 arg_link_journal = LINK_GUEST;
574edc90 717 arg_link_journal_try = true;
57fb9fb5
LP
718 break;
719
720 case ARG_LINK_JOURNAL:
53e438e3 721 if (streq(optarg, "auto")) {
57fb9fb5 722 arg_link_journal = LINK_AUTO;
53e438e3
LP
723 arg_link_journal_try = false;
724 } else if (streq(optarg, "no")) {
57fb9fb5 725 arg_link_journal = LINK_NO;
53e438e3
LP
726 arg_link_journal_try = false;
727 } else if (streq(optarg, "guest")) {
57fb9fb5 728 arg_link_journal = LINK_GUEST;
53e438e3
LP
729 arg_link_journal_try = false;
730 } else if (streq(optarg, "host")) {
57fb9fb5 731 arg_link_journal = LINK_HOST;
53e438e3
LP
732 arg_link_journal_try = false;
733 } else if (streq(optarg, "try-guest")) {
574edc90
MP
734 arg_link_journal = LINK_GUEST;
735 arg_link_journal_try = true;
736 } else if (streq(optarg, "try-host")) {
737 arg_link_journal = LINK_HOST;
738 arg_link_journal_try = true;
739 } else {
57fb9fb5
LP
740 log_error("Failed to parse link journal mode %s", optarg);
741 return -EINVAL;
742 }
743
744 break;
745
17fe0523 746 case ARG_BIND:
f757855e
LP
747 case ARG_BIND_RO:
748 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
749 if (r < 0)
750 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 751
f757855e 752 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 753 break;
06c17c39 754
f757855e
LP
755 case ARG_TMPFS:
756 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
757 if (r < 0)
758 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 759
f757855e 760 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 761 break;
5a8af538
LP
762
763 case ARG_OVERLAY:
764 case ARG_OVERLAY_RO: {
765 _cleanup_free_ char *upper = NULL, *destination = NULL;
766 _cleanup_strv_free_ char **lower = NULL;
767 CustomMount *m;
768 unsigned n = 0;
769 char **i;
770
62f9f39a
RM
771 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
772 if (r == -ENOMEM)
06c17c39 773 return log_oom();
62f9f39a
RM
774 else if (r < 0) {
775 log_error("Invalid overlay specification: %s", optarg);
776 return r;
777 }
06c17c39 778
5a8af538
LP
779 STRV_FOREACH(i, lower) {
780 if (!path_is_absolute(*i)) {
781 log_error("Overlay path %s is not absolute.", *i);
782 return -EINVAL;
783 }
784
785 n++;
786 }
787
788 if (n < 2) {
789 log_error("--overlay= needs at least two colon-separated directories specified.");
790 return -EINVAL;
791 }
792
793 if (n == 2) {
794 /* If two parameters are specified,
795 * the first one is the lower, the
796 * second one the upper directory. And
af86c440
ZJS
797 * we'll also define the destination
798 * mount point the same as the upper. */
5a8af538
LP
799 upper = lower[1];
800 lower[1] = NULL;
801
802 destination = strdup(upper);
803 if (!destination)
804 return log_oom();
805
806 } else {
807 upper = lower[n - 2];
808 destination = lower[n - 1];
809 lower[n - 2] = NULL;
810 }
811
f757855e 812 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
813 if (!m)
814 return log_oom();
815
816 m->destination = destination;
817 m->source = upper;
818 m->lower = lower;
819 m->read_only = c == ARG_OVERLAY_RO;
820
821 upper = destination = NULL;
822 lower = NULL;
06c17c39 823
f757855e 824 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
825 break;
826 }
827
a5f1cb3b 828 case 'E': {
f4889f65
LP
829 char **n;
830
831 if (!env_assignment_is_valid(optarg)) {
832 log_error("Environment variable assignment '%s' is not valid.", optarg);
833 return -EINVAL;
834 }
835
836 n = strv_env_set(arg_setenv, optarg);
837 if (!n)
838 return log_oom();
839
840 strv_free(arg_setenv);
841 arg_setenv = n;
f757855e
LP
842
843 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
844 break;
845 }
846
284c0b91
LP
847 case 'q':
848 arg_quiet = true;
849 break;
850
8a96d94e 851 case ARG_SHARE_SYSTEM:
a6b5216c 852 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
853 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
854 arg_clone_ns_flags = 0;
8a96d94e
LP
855 break;
856
eb91eb18
LP
857 case ARG_REGISTER:
858 r = parse_boolean(optarg);
859 if (r < 0) {
860 log_error("Failed to parse --register= argument: %s", optarg);
861 return r;
862 }
863
864 arg_register = r;
865 break;
866
89f7c846
LP
867 case ARG_KEEP_UNIT:
868 arg_keep_unit = true;
869 break;
870
6afc95b7
LP
871 case ARG_PERSONALITY:
872
ac45f971 873 arg_personality = personality_from_string(optarg);
050f7277 874 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
875 log_error("Unknown or unsupported personality '%s'.", optarg);
876 return -EINVAL;
877 }
878
f757855e 879 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
880 break;
881
4d9f07b4
LP
882 case ARG_VOLATILE:
883
884 if (!optarg)
f757855e 885 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 886 else {
f757855e 887 VolatileMode m;
4d9f07b4 888
f757855e
LP
889 m = volatile_mode_from_string(optarg);
890 if (m < 0) {
891 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 892 return -EINVAL;
f757855e
LP
893 } else
894 arg_volatile_mode = m;
6d0b55c2
LP
895 }
896
f757855e
LP
897 arg_settings_mask |= SETTING_VOLATILE_MODE;
898 break;
6d0b55c2 899
f757855e
LP
900 case 'p':
901 r = expose_port_parse(&arg_expose_ports, optarg);
902 if (r == -EEXIST)
903 return log_error_errno(r, "Duplicate port specification: %s", optarg);
904 if (r < 0)
905 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 906
f757855e 907 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 908 break;
6d0b55c2 909
f36933fe
LP
910 case ARG_PROPERTY:
911 if (strv_extend(&arg_property, optarg) < 0)
912 return log_oom();
913
914 break;
915
ae209204
ZJS
916 case ARG_PRIVATE_USERS: {
917 int boolean = -1;
0de7acce 918
ae209204
ZJS
919 if (!optarg)
920 boolean = true;
921 else if (!in_charset(optarg, DIGITS))
922 /* do *not* parse numbers as booleans */
923 boolean = parse_boolean(optarg);
924
925 if (boolean == false) {
0de7acce
LP
926 /* no: User namespacing off */
927 arg_userns_mode = USER_NAMESPACE_NO;
928 arg_uid_shift = UID_INVALID;
929 arg_uid_range = UINT32_C(0x10000);
ae209204 930 } else if (boolean == true) {
0de7acce
LP
931 /* yes: User namespacing on, UID range is read from root dir */
932 arg_userns_mode = USER_NAMESPACE_FIXED;
933 arg_uid_shift = UID_INVALID;
934 arg_uid_range = UINT32_C(0x10000);
935 } else if (streq(optarg, "pick")) {
936 /* pick: User namespacing on, UID range is picked randomly */
937 arg_userns_mode = USER_NAMESPACE_PICK;
938 arg_uid_shift = UID_INVALID;
939 arg_uid_range = UINT32_C(0x10000);
940 } else {
6c2058b3 941 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
942 const char *range, *shift;
943
0de7acce
LP
944 /* anything else: User namespacing on, UID range is explicitly configured */
945
6dac160c
LP
946 range = strchr(optarg, ':');
947 if (range) {
6c2058b3
ZJS
948 buffer = strndup(optarg, range - optarg);
949 if (!buffer)
950 return log_oom();
951 shift = buffer;
6dac160c
LP
952
953 range++;
bfd292ec
ZJS
954 r = safe_atou32(range, &arg_uid_range);
955 if (r < 0)
be715731 956 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
957 } else
958 shift = optarg;
959
be715731
ZJS
960 r = parse_uid(shift, &arg_uid_shift);
961 if (r < 0)
962 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
963
964 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
965 }
966
be715731
ZJS
967 if (arg_uid_range <= 0) {
968 log_error("UID range cannot be 0.");
969 return -EINVAL;
970 }
971
0de7acce 972 arg_settings_mask |= SETTING_USERNS;
6dac160c 973 break;
ae209204 974 }
6dac160c 975
0de7acce 976 case 'U':
ccabee0d
LP
977 if (userns_supported()) {
978 arg_userns_mode = USER_NAMESPACE_PICK;
979 arg_uid_shift = UID_INVALID;
980 arg_uid_range = UINT32_C(0x10000);
981
982 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
983 }
984
7336138e
LP
985 break;
986
0de7acce 987 case ARG_PRIVATE_USERS_CHOWN:
19aac838 988 arg_userns_chown = true;
0de7acce
LP
989
990 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
991 break;
992
c6c8f6e2
LP
993 case ARG_KILL_SIGNAL:
994 arg_kill_signal = signal_from_string_try_harder(optarg);
995 if (arg_kill_signal < 0) {
996 log_error("Cannot parse signal: %s", optarg);
997 return -EINVAL;
998 }
999
f757855e
LP
1000 arg_settings_mask |= SETTING_KILL_SIGNAL;
1001 break;
1002
1003 case ARG_SETTINGS:
1004
1005 /* no → do not read files
1006 * yes → read files, do not override cmdline, trust only subset
1007 * override → read files, override cmdline, trust only subset
1008 * trusted → read files, do not override cmdline, trust all
1009 */
1010
1011 r = parse_boolean(optarg);
1012 if (r < 0) {
1013 if (streq(optarg, "trusted")) {
1014 mask_all_settings = false;
1015 mask_no_settings = false;
1016 arg_settings_trusted = true;
1017
1018 } else if (streq(optarg, "override")) {
1019 mask_all_settings = false;
1020 mask_no_settings = true;
1021 arg_settings_trusted = -1;
1022 } else
1023 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1024 } else if (r > 0) {
1025 /* yes */
1026 mask_all_settings = false;
1027 mask_no_settings = false;
1028 arg_settings_trusted = -1;
1029 } else {
1030 /* no */
1031 mask_all_settings = true;
1032 mask_no_settings = false;
1033 arg_settings_trusted = false;
1034 }
1035
c6c8f6e2
LP
1036 break;
1037
5f932eb9
LP
1038 case ARG_CHDIR:
1039 if (!path_is_absolute(optarg)) {
1040 log_error("Working directory %s is not an absolute path.", optarg);
1041 return -EINVAL;
1042 }
1043
1044 r = free_and_strdup(&arg_chdir, optarg);
1045 if (r < 0)
1046 return log_oom();
1047
1048 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1049 break;
1050
9c1e04d0
AP
1051 case ARG_NOTIFY_READY:
1052 r = parse_boolean(optarg);
1053 if (r < 0) {
1054 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1055 return -EINVAL;
1056 }
1057 arg_notify_ready = r;
1058 arg_settings_mask |= SETTING_NOTIFY_READY;
1059 break;
1060
88213476
LP
1061 case '?':
1062 return -EINVAL;
1063
1064 default:
eb9da376 1065 assert_not_reached("Unhandled option");
88213476 1066 }
88213476 1067
0c582db0
LB
1068 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1069 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1070 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1071 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1072
48a8d337
LB
1073 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1074 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1075 arg_register = false;
0c582db0
LB
1076 if (arg_start_mode != START_PID1) {
1077 log_error("--boot cannot be used without namespacing.");
1078 return -EINVAL;
1079 }
1080 }
eb91eb18 1081
0de7acce 1082 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1083 arg_userns_chown = true;
1084
89f7c846
LP
1085 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1086 log_error("--keep-unit may not be used when invoked from a user session.");
1087 return -EINVAL;
1088 }
1089
1b9e5b12
LP
1090 if (arg_directory && arg_image) {
1091 log_error("--directory= and --image= may not be combined.");
1092 return -EINVAL;
1093 }
1094
ec16945e
LP
1095 if (arg_template && arg_image) {
1096 log_error("--template= and --image= may not be combined.");
1097 return -EINVAL;
1098 }
1099
1100 if (arg_template && !(arg_directory || arg_machine)) {
1101 log_error("--template= needs --directory= or --machine=.");
1102 return -EINVAL;
1103 }
1104
1105 if (arg_ephemeral && arg_template) {
1106 log_error("--ephemeral and --template= may not be combined.");
1107 return -EINVAL;
1108 }
1109
1110 if (arg_ephemeral && arg_image) {
1111 log_error("--ephemeral and --image= may not be combined.");
1112 return -EINVAL;
1113 }
1114
df9a75e4
LP
1115 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1116 log_error("--ephemeral and --link-journal= may not be combined.");
1117 return -EINVAL;
1118 }
1119
ccabee0d 1120 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1121 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1122 return -EOPNOTSUPP;
1123 }
1124
1125 if (arg_userns_chown && arg_read_only) {
1126 log_error("--read-only and --private-users-chown may not be combined.");
1127 return -EINVAL;
1128 }
f757855e 1129
22b28dfd
LP
1130 if (arg_network_bridge && arg_network_zone) {
1131 log_error("--network-bridge= and --network-zone= may not be combined.");
1132 return -EINVAL;
1133 }
1134
f757855e
LP
1135 if (argc > optind) {
1136 arg_parameters = strv_copy(argv + optind);
1137 if (!arg_parameters)
1138 return log_oom();
1139
7732f92b 1140 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1141 }
1142
1143 /* Load all settings from .nspawn files */
1144 if (mask_no_settings)
1145 arg_settings_mask = 0;
1146
1147 /* Don't load any settings from .nspawn files */
1148 if (mask_all_settings)
1149 arg_settings_mask = _SETTINGS_MASK_ALL;
1150
520e0d54 1151 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1152
6aadfa4c
ILG
1153 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1154 if (e)
1155 arg_container_service_name = e;
1156
5a8ff0e6
CB
1157 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1158 if (r < 0)
1159 arg_use_cgns = cg_ns_supported();
1160 else
1161 arg_use_cgns = r;
1162
f757855e
LP
1163 return 1;
1164}
1165
1166static int verify_arguments(void) {
1167
1168 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1169 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1170 return -EINVAL;
1171 }
1172
6d0b55c2
LP
1173 if (arg_expose_ports && !arg_private_network) {
1174 log_error("Cannot use --port= without private networking.");
1175 return -EINVAL;
1176 }
1177
1c1ea217
EV
1178#ifndef HAVE_LIBIPTC
1179 if (arg_expose_ports) {
1180 log_error("--port= is not supported, compiled without libiptc support.");
1181 return -EOPNOTSUPP;
1182 }
1183#endif
1184
7732f92b 1185 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1186 arg_kill_signal = SIGRTMIN+3;
1187
f757855e 1188 return 0;
88213476
LP
1189}
1190
03cfe0d5
LP
1191static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1192 assert(p);
1193
0de7acce 1194 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1195 return 0;
1196
1197 if (uid == UID_INVALID && gid == GID_INVALID)
1198 return 0;
1199
1200 if (uid != UID_INVALID) {
1201 uid += arg_uid_shift;
1202
1203 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1204 return -EOVERFLOW;
1205 }
1206
1207 if (gid != GID_INVALID) {
1208 gid += (gid_t) arg_uid_shift;
1209
1210 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1211 return -EOVERFLOW;
1212 }
1213
1214 if (lchown(p, uid, gid) < 0)
1215 return -errno;
b12afc8c
LP
1216
1217 return 0;
1218}
1219
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it
 * was newly created, hands it to userns_lchown() with the given uid/gid.
 *
 * An already existing directory is left untouched and counts as success.
 * Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1232
e58a1277 1233static int setup_timezone(const char *dest) {
03cfe0d5
LP
1234 _cleanup_free_ char *p = NULL, *q = NULL;
1235 const char *where, *check, *what;
d4036145
LP
1236 char *z, *y;
1237 int r;
f8440af5 1238
e58a1277
LP
1239 assert(dest);
1240
1241 /* Fix the timezone, if possible */
d4036145
LP
1242 r = readlink_malloc("/etc/localtime", &p);
1243 if (r < 0) {
0b493a02
MP
1244 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1245 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1246 * with a symbolic link to a time zone data file.
0b493a02
MP
1247 *
1248 * Example:
21dc0227 1249 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1250 */
d4036145
LP
1251 return 0;
1252 }
1253
1254 z = path_startswith(p, "../usr/share/zoneinfo/");
1255 if (!z)
1256 z = path_startswith(p, "/usr/share/zoneinfo/");
1257 if (!z) {
1258 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1259 return 0;
1260 }
1261
03cfe0d5 1262 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1263 r = readlink_malloc(where, &q);
1264 if (r >= 0) {
1265 y = path_startswith(q, "../usr/share/zoneinfo/");
1266 if (!y)
1267 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1268
d4036145
LP
1269 /* Already pointing to the right place? Then do nothing .. */
1270 if (y && streq(y, z))
1271 return 0;
1272 }
1273
03cfe0d5 1274 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1275 check = prefix_roota(dest, check);
03cfe0d5 1276 if (laccess(check, F_OK) < 0) {
d4036145
LP
1277 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1278 return 0;
1279 }
68fb0892 1280
79d80fc1
TG
1281 r = unlink(where);
1282 if (r < 0 && errno != ENOENT) {
56f64d95 1283 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1284 return 0;
1285 }
4d9f07b4 1286
03cfe0d5 1287 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1288 if (symlink(what, where) < 0) {
56f64d95 1289 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1290 return 0;
1291 }
e58a1277 1292
03cfe0d5
LP
1293 r = userns_lchown(where, 0, 0);
1294 if (r < 0)
1295 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1296
e58a1277 1297 return 0;
88213476
LP
1298}
1299
2547bb41 1300static int setup_resolv_conf(const char *dest) {
03cfe0d5 1301 const char *where = NULL;
79d80fc1 1302 int r;
2547bb41
LP
1303
1304 assert(dest);
1305
1306 if (arg_private_network)
1307 return 0;
1308
1309 /* Fix resolv.conf, if possible */
03cfe0d5 1310 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1311
3539724c
LP
1312 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
1313 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1314 * container, so that the container can use the host's resolver. Given that network namespacing is
1315 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1316 * advantage that the container will be able to follow the host's DNS server configuration changes
1317 * transparently. */
1318
60e76d48
ZJS
1319 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1320 if (r >= 0)
1321 return mount_verbose(LOG_ERR, NULL, where, NULL,
1322 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1323 }
1324
1325 /* If that didn't work, let's copy the file */
f2068bcc 1326 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1327 if (r < 0) {
3539724c
LP
1328 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1329 * resolved or something similar runs inside and the symlink points there.
68a313c5 1330 *
3539724c 1331 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1332 */
1333 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1334 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1335 return 0;
1336 }
2547bb41 1337
03cfe0d5
LP
1338 r = userns_lchown(where, 0, 0);
1339 if (r < 0)
3539724c 1340 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1341
2547bb41
LP
1342 return 0;
1343}
1344
04bc4a3f 1345static int setup_boot_id(const char *dest) {
3bbaff3e 1346 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1347 const char *from, *to;
04bc4a3f
LP
1348 int r;
1349
04bc4a3f
LP
1350 /* Generate a new randomized boot ID, so that each boot-up of
1351 * the container gets a new one */
1352
03cfe0d5
LP
1353 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1354 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1355
1356 r = sd_id128_randomize(&rnd);
f647962d
MS
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1359
15b1248a 1360 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1363
60e76d48
ZJS
1364 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1365 if (r >= 0)
1366 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1367 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1368
3bbaff3e 1369 (void) unlink(from);
04bc4a3f
LP
1370 return r;
1371}
1372
e58a1277 1373static int copy_devnodes(const char *dest) {
88213476
LP
1374
1375 static const char devnodes[] =
1376 "null\0"
1377 "zero\0"
1378 "full\0"
1379 "random\0"
1380 "urandom\0"
85614d66
TG
1381 "tty\0"
1382 "net/tun\0";
88213476
LP
1383
1384 const char *d;
e58a1277 1385 int r = 0;
7fd1b19b 1386 _cleanup_umask_ mode_t u;
a258bf26
LP
1387
1388 assert(dest);
124640f1
LP
1389
1390 u = umask(0000);
88213476 1391
03cfe0d5
LP
1392 /* Create /dev/net, so that we can create /dev/net/tun in it */
1393 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1394 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1395
88213476 1396 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1397 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1398 struct stat st;
88213476 1399
7f112f50 1400 from = strappend("/dev/", d);
03cfe0d5 1401 to = prefix_root(dest, from);
88213476
LP
1402
1403 if (stat(from, &st) < 0) {
1404
4a62c710
MS
1405 if (errno != ENOENT)
1406 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1407
a258bf26 1408 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1409
03cfe0d5 1410 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1411 return -EIO;
a258bf26 1412
85614d66 1413 } else {
81f5049b 1414 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1415 /*
1416 * This is some sort of protection too against
1417 * recursive userns chown on shared /dev/
1418 */
1419 if (errno == EEXIST)
1420 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1421 if (errno != EPERM)
1422 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1423
1424 /* Some systems abusively restrict mknod but
1425 * allow bind mounts. */
1426 r = touch(to);
1427 if (r < 0)
1428 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1429 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1430 if (r < 0)
1431 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1432 }
6278cf60 1433
03cfe0d5
LP
1434 r = userns_lchown(to, 0, 0);
1435 if (r < 0)
1436 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1437 }
88213476
LP
1438 }
1439
e58a1277
LP
1440 return r;
1441}
88213476 1442
03cfe0d5
LP
1443static int setup_pts(const char *dest) {
1444 _cleanup_free_ char *options = NULL;
1445 const char *p;
709f6e46 1446 int r;
03cfe0d5
LP
1447
1448#ifdef HAVE_SELINUX
1449 if (arg_selinux_apifs_context)
1450 (void) asprintf(&options,
3dce8915 1451 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1452 arg_uid_shift + TTY_GID,
1453 arg_selinux_apifs_context);
1454 else
1455#endif
1456 (void) asprintf(&options,
3dce8915 1457 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1458 arg_uid_shift + TTY_GID);
f2d88580 1459
03cfe0d5 1460 if (!options)
f2d88580
LP
1461 return log_oom();
1462
03cfe0d5 1463 /* Mount /dev/pts itself */
cc9fce65 1464 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1465 if (mkdir(p, 0755) < 0)
1466 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1467 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1468 if (r < 0)
1469 return r;
709f6e46
MS
1470 r = userns_lchown(p, 0, 0);
1471 if (r < 0)
1472 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1473
1474 /* Create /dev/ptmx symlink */
1475 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1476 if (symlink("pts/ptmx", p) < 0)
1477 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1478 r = userns_lchown(p, 0, 0);
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1481
03cfe0d5
LP
1482 /* And fix /dev/pts/ptmx ownership */
1483 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1484 r = userns_lchown(p, 0, 0);
1485 if (r < 0)
1486 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1487
f2d88580
LP
1488 return 0;
1489}
1490
e58a1277 1491static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1492 _cleanup_umask_ mode_t u;
1493 const char *to;
e58a1277 1494 int r;
e58a1277
LP
1495
1496 assert(dest);
1497 assert(console);
1498
1499 u = umask(0000);
1500
03cfe0d5 1501 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1502 if (r < 0)
1503 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1504
a258bf26
LP
1505 /* We need to bind mount the right tty to /dev/console since
1506 * ptys can only exist on pts file systems. To have something
81f5049b 1507 * to bind mount things on we create a empty regular file. */
a258bf26 1508
03cfe0d5 1509 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1510 r = touch(to);
1511 if (r < 0)
1512 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1513
60e76d48 1514 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1515}
1516
1517static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1518 const char *from, *to;
7fd1b19b 1519 _cleanup_umask_ mode_t u;
d9603714 1520 int fd, r;
e58a1277 1521
e58a1277 1522 assert(kmsg_socket >= 0);
a258bf26 1523
e58a1277 1524 u = umask(0000);
a258bf26 1525
03cfe0d5 1526 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1527 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1528 * on the reading side behave very similar to /proc/kmsg,
1529 * their writing side behaves differently from /dev/kmsg in
1530 * that writing blocks when nothing is reading. In order to
1531 * avoid any problems with containers deadlocking due to this
1532 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1533 from = prefix_roota(dest, "/run/kmsg");
1534 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1535
4a62c710 1536 if (mkfifo(from, 0600) < 0)
03cfe0d5 1537 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1538 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1539 if (r < 0)
1540 return r;
e58a1277
LP
1541
1542 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1543 if (fd < 0)
1544 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1545
e58a1277
LP
1546 /* Store away the fd in the socket, so that it stays open as
1547 * long as we run the child */
3ee897d6 1548 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1549 safe_close(fd);
e58a1277 1550
d9603714
DH
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1553
03cfe0d5
LP
1554 /* And now make the FIFO unavailable as /run/kmsg... */
1555 (void) unlink(from);
1556
25ea79fe 1557 return 0;
88213476
LP
1558}
1559
1c4baffc 1560static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1561 union in_addr_union *exposed = userdata;
1562
1563 assert(rtnl);
1564 assert(m);
1565 assert(exposed);
1566
7a8f6325 1567 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1568 return 0;
1569}
1570
3a74cea5 1571static int setup_hostname(void) {
3a74cea5 1572
0c582db0 1573 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1574 return 0;
1575
605f81a8 1576 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1577 return -errno;
3a74cea5 1578
7027ff61 1579 return 0;
3a74cea5
LP
1580}
1581
57fb9fb5 1582static int setup_journal(const char *directory) {
e01ff70a 1583 sd_id128_t this_id;
0f5e1382 1584 _cleanup_free_ char *d = NULL;
e01ff70a 1585 const char *p, *q;
8054d749 1586 bool try;
e01ff70a 1587 char id[33];
57fb9fb5
LP
1588 int r;
1589
df9a75e4
LP
1590 /* Don't link journals in ephemeral mode */
1591 if (arg_ephemeral)
1592 return 0;
1593
8054d749
LP
1594 if (arg_link_journal == LINK_NO)
1595 return 0;
1596
1597 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1598
4d680aee 1599 r = sd_id128_get_machine(&this_id);
f647962d
MS
1600 if (r < 0)
1601 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1602
e01ff70a 1603 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1604 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1605 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1606 if (try)
4d680aee 1607 return 0;
df9a75e4 1608 return -EEXIST;
4d680aee
ZJS
1609 }
1610
03cfe0d5
LP
1611 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1612 if (r < 0)
1613 return log_error_errno(r, "Failed to create /var: %m");
1614
1615 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to create /var/log: %m");
1618
1619 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1620 if (r < 0)
1621 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1622
e01ff70a
MS
1623 (void) sd_id128_to_string(arg_uuid, id);
1624
03cfe0d5
LP
1625 p = strjoina("/var/log/journal/", id);
1626 q = prefix_roota(directory, p);
27407a01 1627
e26d6ce5 1628 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1629 if (try)
1630 return 0;
27407a01 1631
8054d749
LP
1632 log_error("%s: already a mount point, refusing to use for journal", p);
1633 return -EEXIST;
57fb9fb5
LP
1634 }
1635
e26d6ce5 1636 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1637 if (try)
1638 return 0;
57fb9fb5 1639
8054d749
LP
1640 log_error("%s: already a mount point, refusing to use for journal", q);
1641 return -EEXIST;
57fb9fb5
LP
1642 }
1643
1644 r = readlink_and_make_absolute(p, &d);
1645 if (r >= 0) {
1646 if ((arg_link_journal == LINK_GUEST ||
1647 arg_link_journal == LINK_AUTO) &&
1648 path_equal(d, q)) {
1649
03cfe0d5 1650 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1651 if (r < 0)
709f6e46 1652 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1653 return 0;
57fb9fb5
LP
1654 }
1655
4a62c710
MS
1656 if (unlink(p) < 0)
1657 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1658 } else if (r == -EINVAL) {
1659
1660 if (arg_link_journal == LINK_GUEST &&
1661 rmdir(p) < 0) {
1662
27407a01
ZJS
1663 if (errno == ENOTDIR) {
1664 log_error("%s already exists and is neither a symlink nor a directory", p);
1665 return r;
4314d33f
MS
1666 } else
1667 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1668 }
4314d33f
MS
1669 } else if (r != -ENOENT)
1670 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1671
1672 if (arg_link_journal == LINK_GUEST) {
1673
1674 if (symlink(q, p) < 0) {
8054d749 1675 if (try) {
56f64d95 1676 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1677 return 0;
4314d33f
MS
1678 } else
1679 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1680 }
1681
03cfe0d5 1682 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1683 if (r < 0)
709f6e46 1684 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1685 return 0;
57fb9fb5
LP
1686 }
1687
1688 if (arg_link_journal == LINK_HOST) {
ccddd104 1689 /* don't create parents here — if the host doesn't have
574edc90 1690 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1691
1692 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1693 if (try) {
56f64d95 1694 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1695 return 0;
4314d33f
MS
1696 } else
1697 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1698 }
1699
27407a01
ZJS
1700 } else if (access(p, F_OK) < 0)
1701 return 0;
57fb9fb5 1702
cdb2b9d0
LP
1703 if (dir_is_empty(q) == 0)
1704 log_warning("%s is not empty, proceeding anyway.", q);
1705
03cfe0d5 1706 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1709
60e76d48
ZJS
1710 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1711 if (r < 0)
4a62c710 1712 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1713
27407a01 1714 return 0;
57fb9fb5
LP
1715}
1716
88213476 1717static int drop_capabilities(void) {
520e0d54 1718 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1719}
1720
db999e0f
LP
1721static int reset_audit_loginuid(void) {
1722 _cleanup_free_ char *p = NULL;
1723 int r;
1724
0c582db0 1725 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1726 return 0;
1727
1728 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1729 if (r == -ENOENT)
db999e0f 1730 return 0;
f647962d
MS
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1733
1734 /* Already reset? */
1735 if (streq(p, "4294967295"))
1736 return 0;
1737
ad118bda 1738 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1739 if (r < 0) {
10a87006
LP
1740 log_error_errno(r,
1741 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1742 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1743 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1744 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1745 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1746
db999e0f 1747 sleep(5);
77b6e194 1748 }
db999e0f
LP
1749
1750 return 0;
77b6e194
LP
1751}
1752
24fb1112 1753
785890ac
LP
1754static int setup_propagate(const char *root) {
1755 const char *p, *q;
709f6e46 1756 int r;
785890ac
LP
1757
1758 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1759 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1760 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1761 (void) mkdir_p(p, 0600);
1762
709f6e46
MS
1763 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1764 if (r < 0)
1765 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1766
709f6e46
MS
1767 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1768 if (r < 0)
1769 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1770
709f6e46
MS
1771 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1772 if (r < 0)
1773 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1774
03cfe0d5 1775 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1776 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1777 if (r < 0)
1778 return r;
785890ac 1779
60e76d48
ZJS
1780 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1781 if (r < 0)
1782 return r;
785890ac 1783
19caffac
AC
1784 /* machined will MS_MOVE into that directory, and that's only
1785 * supported for non-shared mounts. */
60e76d48 1786 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1787}
1788
1b9e5b12
LP
1789static int setup_image(char **device_path, int *loop_nr) {
1790 struct loop_info64 info = {
1791 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1792 };
1793 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1794 _cleanup_free_ char* loopdev = NULL;
1795 struct stat st;
1796 int r, nr;
1797
1798 assert(device_path);
1799 assert(loop_nr);
ec16945e 1800 assert(arg_image);
1b9e5b12
LP
1801
1802 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1803 if (fd < 0)
1804 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1805
4a62c710
MS
1806 if (fstat(fd, &st) < 0)
1807 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1808
1809 if (S_ISBLK(st.st_mode)) {
1810 char *p;
1811
1812 p = strdup(arg_image);
1813 if (!p)
1814 return log_oom();
1815
1816 *device_path = p;
1817
1818 *loop_nr = -1;
1819
1820 r = fd;
1821 fd = -1;
1822
1823 return r;
1824 }
1825
1826 if (!S_ISREG(st.st_mode)) {
070edd97 1827 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1828 return -EINVAL;
1829 }
1830
1831 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1832 if (control < 0)
1833 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1834
1835 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1836 if (nr < 0)
1837 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1838
1839 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1840 return log_oom();
1841
1842 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1843 if (loop < 0)
1844 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1845
4a62c710
MS
1846 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1847 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1848
1849 if (arg_read_only)
1850 info.lo_flags |= LO_FLAGS_READ_ONLY;
1851
4a62c710
MS
1852 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1853 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1854
1855 *device_path = loopdev;
1856 loopdev = NULL;
1857
1858 *loop_nr = nr;
1859
1860 r = loop;
1861 loop = -1;
1862
1863 return r;
1864}
1865
ada4799a
LP
/* Explanatory text appended to the error messages logged when no usable partition can be identified in a
 * disk image. */
#define PARTITION_TABLE_BLURB                                                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n"            \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                \
        "to be bootable with systemd-nspawn."
1872
1b9e5b12
LP
/* Probes the partition table of the block device referred to by fd and selects the partitions to use.
 *
 * Only GPT and MBR ("dos") partition tables are accepted. Partition selection works as follows:
 *
 *   GPT: partitions carrying GPT_FLAG_NO_AUTO are skipped. For the home, srv, ESP, native root and
 *        secondary root partition types the matching partition with the lowest partition number is
 *        picked. Generic Linux partitions are remembered too, but are only usable if there is exactly one.
 *
 *   MBR: only partitions whose flags equal 0x80 (bootable) and whose type is 0x83 (Linux) are considered,
 *        and they are treated as generic partitions, i.e. there must be exactly one of them.
 *
 * The root device is chosen in this order: native root, secondary root (in which case *secondary is set
 * to true), the single generic partition. If none is found, or multiple generic partitions compete,
 * -EINVAL is returned.
 *
 * On success 0 is returned, *root_device/*root_device_rw/*secondary are set, and the home, srv and ESP
 * outputs are set only if such a partition was found. The returned strings are newly allocated device
 * node paths owned by the caller. On failure a negative errno-style error code is returned. When compiled
 * without blkid support -EOPNOTSUPP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                char **esp_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(esp_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Pass errno to the logging call, so that the value returned is the one reported by
                 * blkid rather than whatever logging may have left in errno. */
                return log_error_errno(errno, "Failed to list partitions of %s", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partitions as blkid found, retrying a limited number of
         * times. The enumeration is expected to list the whole device plus one entry per partition. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of multiple candidates, the one with the lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        } else if (sd_id128_equal(type_id, GPT_ESP)) {

                                if (esp && nr >= esp_nr)
                                        continue;

                                esp_nr = nr;

                                r = free_and_strdup(&esp, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Track the partition as the generic candidate (not as root), so that a
                                 * second bootable Linux partition is detected by the check above instead
                                 * of silently replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        if (esp) {
                *esp_device = esp;
                esp = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2259
/* Mounts the block device 'what' below 'where'.
 *
 * what:      path of the block device node to mount
 * where:     base directory of the mount
 * directory: optional path that is appended to 'where' to form the mount point, or NULL to mount on
 *            'where' itself
 * rw:        whether to mount writable; overridden to read-only when arg_read_only is set
 *
 * The file system type is probed with blkid. LUKS volumes are refused with -EOPNOTSUPP. For btrfs, ext4,
 * vfat and xfs on a /dev/loop* device the "discard" mount option is used.
 *
 * Returns the result of mount_verbose() on success, or a negative errno-style error code. When compiled
 * without blkid support -EOPNOTSUPP is returned. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        /* 'options' must start out as NULL: it is only assigned for loop devices with selected file systems */
        const char *fstype, *p, *options = NULL;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Same handling as in dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;

                /* Pass errno to the logging call so the returned code cannot be clobbered by logging */
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
         * sparse when possible. */
        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
                const char *l;

                l = path_startswith(what, "/dev");
                if (l && startswith(l, "loop"))
                        options = "discard";
        }

        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2328
317feb4d 2329static int setup_machine_id(const char *directory) {
691675ba
LP
2330 const char *etc_machine_id;
2331 sd_id128_t id;
3bbaff3e 2332 int r;
e01ff70a 2333
317feb4d
LP
2334 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2335 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2336 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2337 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2338 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2339 * container behaves nicely). */
2340
e01ff70a
MS
2341 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2342
691675ba 2343 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2344 if (r < 0) {
2345 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2346 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2347
317feb4d
LP
2348 if (sd_id128_is_null(arg_uuid)) {
2349 r = sd_id128_randomize(&arg_uuid);
2350 if (r < 0)
2351 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2352 }
2353 } else {
2354 if (sd_id128_is_null(id)) {
2355 log_error("Machine ID in container image is zero, refusing.");
2356 return -EINVAL;
2357 }
e01ff70a 2358
317feb4d
LP
2359 arg_uuid = id;
2360 }
691675ba 2361
e01ff70a
MS
2362 return 0;
2363}
2364
7336138e
LP
2365static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2366 int r;
2367
2368 assert(directory);
2369
0de7acce 2370 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2371 return 0;
2372
2373 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2374 if (r == -EOPNOTSUPP)
2375 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2376 if (r == -EBADE)
2377 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2380 if (r == 0)
2381 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2382 else
2383 log_debug("Patched directory tree to match UID/GID range.");
2384
2385 return r;
2386}
2387
727fd4fd
LP
2388static int mount_devices(
2389 const char *where,
2390 const char *root_device, bool root_device_rw,
2391 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2392 const char *srv_device, bool srv_device_rw,
2393 const char *esp_device) {
1b9e5b12
LP
2394 int r;
2395
2396 assert(where);
2397
2398 if (root_device) {
727fd4fd 2399 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2400 if (r < 0)
2401 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2402 }
2403
2404 if (home_device) {
727fd4fd 2405 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2408 }
2409
2410 if (srv_device) {
727fd4fd 2411 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2412 if (r < 0)
2413 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2414 }
2415
a6bc7db9
LP
2416 if (esp_device) {
2417 const char *mp, *x;
2418
2419 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2420
2421 mp = "/efi";
2422 x = strjoina(arg_directory, mp);
2423 r = dir_is_empty(x);
2424 if (r == -ENOENT) {
2425 mp = "/boot";
2426 x = strjoina(arg_directory, mp);
2427 r = dir_is_empty(x);
2428 }
2429
2430 if (r > 0) {
2431 r = mount_device(esp_device, arg_directory, mp, true);
2432 if (r < 0)
2433 return log_error_errno(r, "Failed to mount ESP: %m");
2434 }
2435 }
2436
1b9e5b12
LP
2437 return 0;
2438}
2439
2440static void loop_remove(int nr, int *image_fd) {
2441 _cleanup_close_ int control = -1;
e8c8ddcc 2442 int r;
1b9e5b12
LP
2443
2444 if (nr < 0)
2445 return;
2446
2447 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2448 r = ioctl(*image_fd, LOOP_CLR_FD);
2449 if (r < 0)
5e4074aa 2450 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2451 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2452 }
2453
2454 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2455 if (control < 0) {
56f64d95 2456 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2457 return;
e8c8ddcc 2458 }
1b9e5b12 2459
e8c8ddcc
TG
2460 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2461 if (r < 0)
5e4074aa 2462 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2463}
2464
113cea80 2465/*
6d416b9c
LS
2466 * Return values:
2467 * < 0 : wait_for_terminate() failed to get the state of the
2468 * container, the container was terminated by a signal, or
2469 * failed for an unknown reason. No change is made to the
2470 * container argument.
2471 * > 0 : The program executed in the container terminated with an
2472 * error. The exit code of the program executed in the
919699ec
LP
2473 * container is returned. The container argument has been set
2474 * to CONTAINER_TERMINATED.
6d416b9c
LS
2475 * 0 : The container is being rebooted, has been shut down or exited
2476 * successfully. The container argument has been set to either
2477 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2478 *
6d416b9c
LS
2479 * That is, success is indicated by a return value of zero, and an
2480 * error is indicated by a non-zero value.
113cea80
DH
2481 */
2482static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2483 siginfo_t status;
919699ec 2484 int r;
113cea80
DH
2485
2486 r = wait_for_terminate(pid, &status);
f647962d
MS
2487 if (r < 0)
2488 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2489
2490 switch (status.si_code) {
fddbb89c 2491
113cea80 2492 case CLD_EXITED:
b5a2179b 2493 if (status.si_status == 0)
919699ec 2494 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2495 else
919699ec 2496 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2497
919699ec
LP
2498 *container = CONTAINER_TERMINATED;
2499 return status.si_status;
113cea80
DH
2500
2501 case CLD_KILLED:
2502 if (status.si_status == SIGINT) {
919699ec 2503 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2504 *container = CONTAINER_TERMINATED;
919699ec
LP
2505 return 0;
2506
113cea80 2507 } else if (status.si_status == SIGHUP) {
919699ec 2508 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2509 *container = CONTAINER_REBOOTED;
919699ec 2510 return 0;
113cea80 2511 }
919699ec 2512
113cea80
DH
2513 /* CLD_KILLED fallthrough */
2514
2515 case CLD_DUMPED:
fddbb89c 2516 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2517 return -EIO;
113cea80
DH
2518
2519 default:
fddbb89c 2520 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2521 return -EIO;
113cea80 2522 }
113cea80
DH
2523}
2524
023fb90b
LP
2525static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2526 pid_t pid;
2527
4a0b58c4 2528 pid = PTR_TO_PID(userdata);
023fb90b 2529 if (pid > 0) {
c6c8f6e2 2530 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2531 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2532 sd_event_source_set_userdata(s, NULL);
2533 return 0;
2534 }
2535 }
2536
2537 sd_event_exit(sd_event_source_get_event(s), 0);
2538 return 0;
2539}
2540
ec16945e 2541static int determine_names(void) {
1b9cebf6 2542 int r;
ec16945e 2543
c1521918
LP
2544 if (arg_template && !arg_directory && arg_machine) {
2545
2546 /* If --template= was specified then we should not
2547 * search for a machine, but instead create a new one
2548 * in /var/lib/machine. */
2549
605405c6 2550 arg_directory = strjoin("/var/lib/machines/", arg_machine);
c1521918
LP
2551 if (!arg_directory)
2552 return log_oom();
2553 }
2554
ec16945e 2555 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2556 if (arg_machine) {
2557 _cleanup_(image_unrefp) Image *i = NULL;
2558
2559 r = image_find(arg_machine, &i);
2560 if (r < 0)
2561 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2562 else if (r == 0) {
2563 log_error("No image for machine '%s': %m", arg_machine);
2564 return -ENOENT;
2565 }
2566
aceac2f0 2567 if (i->type == IMAGE_RAW)
0f03c2a4 2568 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2569 else
0f03c2a4 2570 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2571 if (r < 0)
2572 return log_error_errno(r, "Invalid image directory: %m");
2573
aee327b8
LP
2574 if (!arg_ephemeral)
2575 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2576 } else
ec16945e
LP
2577 arg_directory = get_current_dir_name();
2578
1b9cebf6
LP
2579 if (!arg_directory && !arg_machine) {
2580 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2581 return -EINVAL;
2582 }
2583 }
2584
2585 if (!arg_machine) {
b9ba4dab
LP
2586 if (arg_directory && path_equal(arg_directory, "/"))
2587 arg_machine = gethostname_malloc();
2588 else
2589 arg_machine = strdup(basename(arg_image ?: arg_directory));
2590
ec16945e
LP
2591 if (!arg_machine)
2592 return log_oom();
2593
ae691c1d 2594 hostname_cleanup(arg_machine);
ec16945e
LP
2595 if (!machine_name_is_valid(arg_machine)) {
2596 log_error("Failed to determine machine name automatically, please use -M.");
2597 return -EINVAL;
2598 }
b9ba4dab
LP
2599
2600 if (arg_ephemeral) {
2601 char *b;
2602
2603 /* Add a random suffix when this is an
2604 * ephemeral machine, so that we can run many
2605 * instances at once without manually having
2606 * to specify -M each time. */
2607
2608 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2609 return log_oom();
2610
2611 free(arg_machine);
2612 arg_machine = b;
2613 }
ec16945e
LP
2614 }
2615
2616 return 0;
2617}
2618
03cfe0d5 2619static int determine_uid_shift(const char *directory) {
6dac160c
LP
2620 int r;
2621
0de7acce 2622 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2623 arg_uid_shift = 0;
6dac160c 2624 return 0;
03cfe0d5 2625 }
6dac160c
LP
2626
2627 if (arg_uid_shift == UID_INVALID) {
2628 struct stat st;
2629
03cfe0d5 2630 r = stat(directory, &st);
6dac160c 2631 if (r < 0)
03cfe0d5 2632 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2633
2634 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2635
2636 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2637 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2638 return -EINVAL;
2639 }
2640
2641 arg_uid_range = UINT32_C(0x10000);
2642 }
2643
2644 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2645 log_error("UID base too high for UID range.");
2646 return -EINVAL;
2647 }
2648
6dac160c
LP
2649 return 0;
2650}
2651
03cfe0d5
LP
2652static int inner_child(
2653 Barrier *barrier,
2654 const char *directory,
2655 bool secondary,
2656 int kmsg_socket,
2657 int rtnl_socket,
f757855e 2658 FDSet *fds) {
69c79d3c 2659
03cfe0d5 2660 _cleanup_free_ char *home = NULL;
e01ff70a 2661 char as_uuid[37];
6aadfa4c 2662 unsigned n_env = 1;
03cfe0d5
LP
2663 const char *envp[] = {
2664 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2665 NULL, /* container */
03cfe0d5
LP
2666 NULL, /* TERM */
2667 NULL, /* HOME */
2668 NULL, /* USER */
2669 NULL, /* LOGNAME */
2670 NULL, /* container_uuid */
2671 NULL, /* LISTEN_FDS */
2672 NULL, /* LISTEN_PID */
9c1e04d0 2673 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2674 NULL
2675 };
88213476 2676
2371271c 2677 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2678 int r;
88213476 2679
03cfe0d5
LP
2680 assert(barrier);
2681 assert(directory);
2682 assert(kmsg_socket >= 0);
88213476 2683
efdb0237
LP
2684 cg_unified_flush();
2685
0de7acce 2686 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2687 /* Tell the parent, that it now can write the UID map. */
2688 (void) barrier_place(barrier); /* #1 */
7027ff61 2689
03cfe0d5
LP
2690 /* Wait until the parent wrote the UID map */
2691 if (!barrier_place_and_sync(barrier)) { /* #2 */
2692 log_error("Parent died too early");
2693 return -ESRCH;
2694 }
88213476
LP
2695 }
2696
6d66bd3b
EV
2697 r = reset_uid_gid();
2698 if (r < 0)
2699 return log_error_errno(r, "Couldn't become new root: %m");
2700
0de7acce
LP
2701 r = mount_all(NULL,
2702 arg_userns_mode != USER_NAMESPACE_NO,
2703 true,
2704 arg_private_network,
2705 arg_uid_shift,
2706 arg_uid_range,
2707 arg_selinux_apifs_context);
2708
03cfe0d5
LP
2709 if (r < 0)
2710 return r;
2711
d8fc6a00
LP
2712 r = mount_sysfs(NULL);
2713 if (r < 0)
2714 return r;
2715
03cfe0d5
LP
2716 /* Wait until we are cgroup-ified, so that we
2717 * can mount the right cgroup path writable */
2718 if (!barrier_place_and_sync(barrier)) { /* #3 */
2719 log_error("Parent died too early");
2720 return -ESRCH;
88213476
LP
2721 }
2722
5a8ff0e6 2723 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2724 r = unshare(CLONE_NEWCGROUP);
2725 if (r < 0)
2726 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2727 r = mount_cgroups(
2728 "",
2729 arg_unified_cgroup_hierarchy,
2730 arg_userns_mode != USER_NAMESPACE_NO,
2731 arg_uid_shift,
2732 arg_uid_range,
5a8ff0e6 2733 arg_selinux_apifs_context,
ada54120 2734 true);
0996ef00
CB
2735 if (r < 0)
2736 return r;
2737 } else {
2738 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2739 if (r < 0)
2740 return r;
2741 }
ec16945e 2742
03cfe0d5
LP
2743 r = setup_boot_id(NULL);
2744 if (r < 0)
2745 return r;
ec16945e 2746
03cfe0d5
LP
2747 r = setup_kmsg(NULL, kmsg_socket);
2748 if (r < 0)
2749 return r;
2750 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2751
03cfe0d5 2752 umask(0022);
30535c16 2753
03cfe0d5
LP
2754 if (setsid() < 0)
2755 return log_error_errno(errno, "setsid() failed: %m");
2756
2757 if (arg_private_network)
2758 loopback_setup();
2759
7a8f6325
LP
2760 if (arg_expose_ports) {
2761 r = expose_port_send_rtnl(rtnl_socket);
2762 if (r < 0)
2763 return r;
2764 rtnl_socket = safe_close(rtnl_socket);
2765 }
03cfe0d5 2766
709f6e46
MS
2767 r = drop_capabilities();
2768 if (r < 0)
2769 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2770
2771 setup_hostname();
2772
050f7277 2773 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2774 if (personality(arg_personality) < 0)
2775 return log_error_errno(errno, "personality() failed: %m");
2776 } else if (secondary) {
2777 if (personality(PER_LINUX32) < 0)
2778 return log_error_errno(errno, "personality() failed: %m");
2779 }
2780
2781#ifdef HAVE_SELINUX
2782 if (arg_selinux_context)
2ed96880 2783 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2784 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2785#endif
2786
ee645080 2787 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2788 if (r < 0)
2789 return r;
2790
6aadfa4c
ILG
2791 /* LXC sets container=lxc, so follow the scheme here */
2792 envp[n_env++] = strjoina("container=", arg_container_service_name);
2793
03cfe0d5
LP
2794 envp[n_env] = strv_find_prefix(environ, "TERM=");
2795 if (envp[n_env])
313cefa1 2796 n_env++;
03cfe0d5
LP
2797
2798 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2799 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2800 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2801 return log_oom();
2802
3bbaff3e 2803 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2804
691675ba 2805 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2806 return log_oom();
03cfe0d5
LP
2807
2808 if (fdset_size(fds) > 0) {
2809 r = fdset_cloexec(fds, false);
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2812
2813 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2814 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2815 return log_oom();
2816 }
9c1e04d0
AP
2817 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2818 return log_oom();
03cfe0d5 2819
2371271c
TG
2820 env_use = strv_env_merge(2, envp, arg_setenv);
2821 if (!env_use)
2822 return log_oom();
03cfe0d5
LP
2823
2824 /* Let the parent know that we are ready and
2825 * wait until the parent is ready with the
2826 * setup, too... */
2827 if (!barrier_place_and_sync(barrier)) { /* #4 */
2828 log_error("Parent died too early");
2829 return -ESRCH;
2830 }
2831
5f932eb9
LP
2832 if (arg_chdir)
2833 if (chdir(arg_chdir) < 0)
2834 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2835
7732f92b
LP
2836 if (arg_start_mode == START_PID2) {
2837 r = stub_pid1();
2838 if (r < 0)
2839 return r;
2840 }
2841
03cfe0d5
LP
2842 /* Now, explicitly close the log, so that we
2843 * then can close all remaining fds. Closing
2844 * the log explicitly first has the benefit
2845 * that the logging subsystem knows about it,
2846 * and is thus ready to be reopened should we
2847 * need it again. Note that the other fds
2848 * closed here are at least the locking and
2849 * barrier fds. */
2850 log_close();
2851 (void) fdset_close_others(fds);
2852
7732f92b 2853 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2854 char **a;
2855 size_t m;
2856
2857 /* Automatically search for the init system */
2858
75f32f04
ZJS
2859 m = strv_length(arg_parameters);
2860 a = newa(char*, m + 2);
2861 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2862 a[1 + m] = NULL;
03cfe0d5
LP
2863
2864 a[0] = (char*) "/usr/lib/systemd/systemd";
2865 execve(a[0], a, env_use);
2866
2867 a[0] = (char*) "/lib/systemd/systemd";
2868 execve(a[0], a, env_use);
2869
2870 a[0] = (char*) "/sbin/init";
2871 execve(a[0], a, env_use);
f757855e
LP
2872 } else if (!strv_isempty(arg_parameters))
2873 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2874 else {
5f932eb9 2875 if (!arg_chdir)
d929b0f9
ZJS
2876 /* If we cannot change the directory, we'll end up in /, that is expected. */
2877 (void) chdir(home ?: "/root");
5f932eb9 2878
03cfe0d5
LP
2879 execle("/bin/bash", "-bash", NULL, env_use);
2880 execle("/bin/sh", "-sh", NULL, env_use);
2881 }
2882
35607a8d 2883 r = -errno;
03cfe0d5 2884 (void) log_open();
35607a8d 2885 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2886}
2887
9c1e04d0
AP
2888static int setup_sd_notify_child(void) {
2889 static const int one = 1;
2890 int fd = -1;
2891 union sockaddr_union sa = {
2892 .sa.sa_family = AF_UNIX,
2893 };
2894 int r;
2895
2896 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2897 if (fd < 0)
2898 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2899
2900 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2901 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2902
2903 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2904 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2905 if (r < 0) {
2906 safe_close(fd);
2907 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2908 }
2909
2910 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2911 if (r < 0) {
2912 safe_close(fd);
2913 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2914 }
2915
2916 return fd;
2917}
2918
03cfe0d5
LP
2919static int outer_child(
2920 Barrier *barrier,
2921 const char *directory,
2922 const char *console,
2923 const char *root_device, bool root_device_rw,
2924 const char *home_device, bool home_device_rw,
2925 const char *srv_device, bool srv_device_rw,
a6bc7db9 2926 const char *esp_device,
03cfe0d5
LP
2927 bool interactive,
2928 bool secondary,
2929 int pid_socket,
e01ff70a 2930 int uuid_socket,
9c1e04d0 2931 int notify_socket,
03cfe0d5
LP
2932 int kmsg_socket,
2933 int rtnl_socket,
825d5287 2934 int uid_shift_socket,
f757855e 2935 FDSet *fds) {
03cfe0d5
LP
2936
2937 pid_t pid;
2938 ssize_t l;
2939 int r;
9c1e04d0 2940 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2941
2942 assert(barrier);
2943 assert(directory);
2944 assert(console);
2945 assert(pid_socket >= 0);
e01ff70a 2946 assert(uuid_socket >= 0);
9c1e04d0 2947 assert(notify_socket >= 0);
03cfe0d5
LP
2948 assert(kmsg_socket >= 0);
2949
efdb0237
LP
2950 cg_unified_flush();
2951
03cfe0d5
LP
2952 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2953 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2954
2955 if (interactive) {
2956 close_nointr(STDIN_FILENO);
2957 close_nointr(STDOUT_FILENO);
2958 close_nointr(STDERR_FILENO);
2959
2960 r = open_terminal(console, O_RDWR);
2961 if (r != STDIN_FILENO) {
2962 if (r >= 0) {
2963 safe_close(r);
2964 r = -EINVAL;
2965 }
2966
2967 return log_error_errno(r, "Failed to open console: %m");
2968 }
2969
2970 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2971 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2972 return log_error_errno(errno, "Failed to duplicate console: %m");
2973 }
2974
2975 r = reset_audit_loginuid();
2976 if (r < 0)
2977 return r;
2978
2979 /* Mark everything as slave, so that we still
2980 * receive mounts from the real root, but don't
2981 * propagate mounts to the real root. */
60e76d48
ZJS
2982 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2983 if (r < 0)
2984 return r;
03cfe0d5
LP
2985
2986 r = mount_devices(directory,
2987 root_device, root_device_rw,
2988 home_device, home_device_rw,
a6bc7db9
LP
2989 srv_device, srv_device_rw,
2990 esp_device);
03cfe0d5
LP
2991 if (r < 0)
2992 return r;
2993
391567f4
LP
2994 r = determine_uid_shift(directory);
2995 if (r < 0)
2996 return r;
2997
0fd9563f
ZJS
2998 r = detect_unified_cgroup_hierarchy(directory);
2999 if (r < 0)
3000 return r;
3001
0de7acce 3002 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 3003 /* Let the parent know which UID shift we read from the image */
825d5287
RM
3004 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3005 if (l < 0)
3006 return log_error_errno(errno, "Failed to send UID shift: %m");
3007 if (l != sizeof(arg_uid_shift)) {
3008 log_error("Short write while sending UID shift.");
3009 return -EIO;
3010 }
0e7ac751 3011
0de7acce 3012 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3013 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3014 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3015 * not it will pick a different one, and send it back to us. */
3016
3017 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3018 if (l < 0)
3019 return log_error_errno(errno, "Failed to recv UID shift: %m");
3020 if (l != sizeof(arg_uid_shift)) {
595bfe7d 3021 log_error("Short read while receiving UID shift.");
0e7ac751
LP
3022 return -EIO;
3023 }
3024 }
3025
3026 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3027 }
3028
03cfe0d5 3029 /* Turn directory into bind mount */
60e76d48
ZJS
3030 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3031 if (r < 0)
3032 return r;
03cfe0d5 3033
19caffac
AC
3034 /* Mark everything as shared so our mounts get propagated down. This is
3035 * required to make new bind mounts available in systemd services
3036 * inside the containter that create a new mount namespace.
3037 * See https://github.com/systemd/systemd/issues/3860
3038 * Further submounts (such as /dev) done after this will inherit the
3039 * shared propagation mode.*/
60e76d48
ZJS
3040 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3041 if (r < 0)
3042 return r;
19caffac 3043
7336138e 3044 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
3045 if (r < 0)
3046 return r;
3047
0de7acce
LP
3048 r = setup_volatile(
3049 directory,
3050 arg_volatile_mode,
3051 arg_userns_mode != USER_NAMESPACE_NO,
3052 arg_uid_shift,
3053 arg_uid_range,
3054 arg_selinux_context);
03cfe0d5
LP
3055 if (r < 0)
3056 return r;
3057
0de7acce
LP
3058 r = setup_volatile_state(
3059 directory,
3060 arg_volatile_mode,
3061 arg_userns_mode != USER_NAMESPACE_NO,
3062 arg_uid_shift,
3063 arg_uid_range,
3064 arg_selinux_context);
03cfe0d5
LP
3065 if (r < 0)
3066 return r;
3067
03cfe0d5
LP
3068 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3069 if (r < 0)
3070 return r;
3071
03cfe0d5 3072 if (arg_read_only) {
6b7c9f8b 3073 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3074 if (r < 0)
3075 return log_error_errno(r, "Failed to make tree read-only: %m");
3076 }
3077
0de7acce
LP
3078 r = mount_all(directory,
3079 arg_userns_mode != USER_NAMESPACE_NO,
3080 false,
3081 arg_private_network,
3082 arg_uid_shift,
3083 arg_uid_range,
3084 arg_selinux_apifs_context);
03cfe0d5
LP
3085 if (r < 0)
3086 return r;
3087
07fa00f9
LP
3088 r = copy_devnodes(directory);
3089 if (r < 0)
03cfe0d5
LP
3090 return r;
3091
3092 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3093
07fa00f9
LP
3094 r = setup_pts(directory);
3095 if (r < 0)
03cfe0d5
LP
3096 return r;
3097
3098 r = setup_propagate(directory);
3099 if (r < 0)
3100 return r;
3101
3102 r = setup_dev_console(directory, console);
3103 if (r < 0)
3104 return r;
3105
520e0d54 3106 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3107 if (r < 0)
3108 return r;
3109
3110 r = setup_timezone(directory);
3111 if (r < 0)
3112 return r;
3113
3114 r = setup_resolv_conf(directory);
3115 if (r < 0)
3116 return r;
3117
e01ff70a
MS
3118 r = setup_machine_id(directory);
3119 if (r < 0)
3120 return r;
3121
03cfe0d5
LP
3122 r = setup_journal(directory);
3123 if (r < 0)
3124 return r;
3125
0de7acce
LP
3126 r = mount_custom(
3127 directory,
3128 arg_custom_mounts,
3129 arg_n_custom_mounts,
3130 arg_userns_mode != USER_NAMESPACE_NO,
3131 arg_uid_shift,
3132 arg_uid_range,
3133 arg_selinux_apifs_context);
03cfe0d5
LP
3134 if (r < 0)
3135 return r;
3136
5a8ff0e6 3137 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3138 r = mount_cgroups(
3139 directory,
3140 arg_unified_cgroup_hierarchy,
3141 arg_userns_mode != USER_NAMESPACE_NO,
3142 arg_uid_shift,
3143 arg_uid_range,
5a8ff0e6 3144 arg_selinux_apifs_context,
ada54120 3145 false);
0996ef00
CB
3146 if (r < 0)
3147 return r;
3148 }
03cfe0d5
LP
3149
3150 r = mount_move_root(directory);
3151 if (r < 0)
3152 return log_error_errno(r, "Failed to move root directory: %m");
3153
9c1e04d0
AP
3154 fd = setup_sd_notify_child();
3155 if (fd < 0)
3156 return fd;
3157
03cfe0d5 3158 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3159 arg_clone_ns_flags |
03cfe0d5 3160 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3161 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3162 if (pid < 0)
3163 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3164 if (pid == 0) {
3165 pid_socket = safe_close(pid_socket);
e01ff70a 3166 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3167 notify_socket = safe_close(notify_socket);
825d5287 3168 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3169
3170 /* The inner child has all namespaces that are
3171 * requested, so that we all are owned by the user if
3172 * user namespaces are turned on. */
3173
f757855e 3174 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3175 if (r < 0)
3176 _exit(EXIT_FAILURE);
3177
3178 _exit(EXIT_SUCCESS);
3179 }
3180
3181 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3182 if (l < 0)
3183 return log_error_errno(errno, "Failed to send PID: %m");
3184 if (l != sizeof(pid)) {
3185 log_error("Short write while sending PID.");
3186 return -EIO;
3187 }
3188
e01ff70a
MS
3189 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3190 if (l < 0)
3191 return log_error_errno(errno, "Failed to send machine ID: %m");
3192 if (l != sizeof(arg_uuid)) {
3193 log_error("Short write while sending machine ID.");
3194 return -EIO;
3195 }
3196
9c1e04d0
AP
3197 l = send_one_fd(notify_socket, fd, 0);
3198 if (l < 0)
3199 return log_error_errno(errno, "Failed to send notify fd: %m");
3200
03cfe0d5 3201 pid_socket = safe_close(pid_socket);
e01ff70a 3202 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3203 notify_socket = safe_close(notify_socket);
327e26d6
KN
3204 kmsg_socket = safe_close(kmsg_socket);
3205 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3206
3207 return 0;
3208}
3209
0e7ac751
LP
3210static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3211 unsigned n_tries = 100;
3212 uid_t candidate;
3213 int r;
3214
3215 assert(shift);
3216 assert(ret_lock_file);
0de7acce 3217 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3218 assert(arg_uid_range == 0x10000U);
3219
3220 candidate = *shift;
3221
3222 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3223
3224 for (;;) {
3225 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3226 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3227
3228 if (--n_tries <= 0)
3229 return -EBUSY;
3230
3231 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3232 goto next;
3233 if ((candidate & UINT32_C(0xFFFF)) != 0)
3234 goto next;
3235
3236 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3237 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3238 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3239 goto next;
3240 if (r < 0)
3241 return r;
3242
3243 /* Make some superficial checks whether the range is currently known in the user database */
3244 if (getpwuid(candidate))
3245 goto next;
3246 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3247 goto next;
3248 if (getgrgid(candidate))
3249 goto next;
3250 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3251 goto next;
3252
3253 *ret_lock_file = lf;
3254 lf = (struct LockFile) LOCK_FILE_INIT;
3255 *shift = candidate;
3256 return 0;
3257
3258 next:
3259 random_bytes(&candidate, sizeof(candidate));
3260 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3261 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3262 }
3263}
3264
03cfe0d5
LP
3265static int setup_uid_map(pid_t pid) {
3266 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3267 int r;
3268
3269 assert(pid > 1);
3270
3271 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3272 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3273 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3274 if (r < 0)
3275 return log_error_errno(r, "Failed to write UID map: %m");
3276
3277 /* We always assign the same UID and GID ranges */
3278 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3279 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3280 if (r < 0)
3281 return log_error_errno(r, "Failed to write GID map: %m");
3282
3283 return 0;
3284}
3285
9c1e04d0 3286static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3287 char buf[NOTIFY_BUFFER_MAX+1];
3288 char *p = NULL;
3289 struct iovec iovec = {
3290 .iov_base = buf,
3291 .iov_len = sizeof(buf)-1,
3292 };
3293 union {
3294 struct cmsghdr cmsghdr;
3295 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3296 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3297 } control = {};
3298 struct msghdr msghdr = {
3299 .msg_iov = &iovec,
3300 .msg_iovlen = 1,
3301 .msg_control = &control,
3302 .msg_controllen = sizeof(control),
3303 };
3304 struct cmsghdr *cmsg;
3305 struct ucred *ucred = NULL;
3306 ssize_t n;
3307 pid_t inner_child_pid;
3308 _cleanup_strv_free_ char **tags = NULL;
3309
3310 assert(userdata);
3311
3312 inner_child_pid = PTR_TO_PID(userdata);
3313
3314 if (revents != EPOLLIN) {
3315 log_warning("Got unexpected poll event for notify fd.");
3316 return 0;
3317 }
3318
3319 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3320 if (n < 0) {
3321 if (errno == EAGAIN || errno == EINTR)
3322 return 0;
3323
3324 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3325 }
3326 cmsg_close_all(&msghdr);
3327
3328 CMSG_FOREACH(cmsg, &msghdr) {
3329 if (cmsg->cmsg_level == SOL_SOCKET &&
3330 cmsg->cmsg_type == SCM_CREDENTIALS &&
3331 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3332
3333 ucred = (struct ucred*) CMSG_DATA(cmsg);
3334 }
3335 }
3336
3337 if (!ucred || ucred->pid != inner_child_pid) {
3338 log_warning("Received notify message without valid credentials. Ignoring.");
3339 return 0;
3340 }
3341
3342 if ((size_t) n >= sizeof(buf)) {
3343 log_warning("Received notify message exceeded maximum size. Ignoring.");
3344 return 0;
3345 }
3346
3347 buf[n] = 0;
3348 tags = strv_split(buf, "\n\r");
3349 if (!tags)
3350 return log_oom();
3351
3352 if (strv_find(tags, "READY=1"))
3353 sd_notifyf(false, "READY=1\n");
3354
3355 p = strv_find_startswith(tags, "STATUS=");
3356 if (p)
3357 sd_notifyf(false, "STATUS=Container running: %s", p);
3358
3359 return 0;
3360}
3361
3362static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3363 int r;
3364 sd_event_source *notify_event_source;
3365
3366 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3367 if (r < 0)
3368 return log_error_errno(r, "Failed to allocate notify event source: %m");
3369
3370 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3371
3372 return 0;
3373}
3374
f757855e
LP
3375static int load_settings(void) {
3376 _cleanup_(settings_freep) Settings *settings = NULL;
3377 _cleanup_fclose_ FILE *f = NULL;
3378 _cleanup_free_ char *p = NULL;
3379 const char *fn, *i;
3380 int r;
3381
3382 /* If all settings are masked, there's no point in looking for
3383 * the settings file */
3384 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3385 return 0;
3386
3387 fn = strjoina(arg_machine, ".nspawn");
3388
3389 /* We first look in the admin's directories in /etc and /run */
3390 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3391 _cleanup_free_ char *j = NULL;
3392
605405c6 3393 j = strjoin(i, "/", fn);
f757855e
LP
3394 if (!j)
3395 return log_oom();
3396
3397 f = fopen(j, "re");
3398 if (f) {
3399 p = j;
3400 j = NULL;
3401
b938cb90 3402 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3403 if (arg_settings_trusted < 0)
3404 arg_settings_trusted = true;
3405
3406 break;
3407 }
3408
3409 if (errno != ENOENT)
3410 return log_error_errno(errno, "Failed to open %s: %m", j);
3411 }
3412
3413 if (!f) {
3414 /* After that, let's look for a file next to the
3415 * actual image we shall boot. */
3416
3417 if (arg_image) {
3418 p = file_in_same_dir(arg_image, fn);
3419 if (!p)
3420 return log_oom();
3421 } else if (arg_directory) {
3422 p = file_in_same_dir(arg_directory, fn);
3423 if (!p)
3424 return log_oom();
3425 }
3426
3427 if (p) {
3428 f = fopen(p, "re");
3429 if (!f && errno != ENOENT)
3430 return log_error_errno(errno, "Failed to open %s: %m", p);
3431
b938cb90 3432 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3433 if (arg_settings_trusted < 0)
3434 arg_settings_trusted = false;
3435 }
3436 }
3437
3438 if (!f)
3439 return 0;
3440
3441 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3442
3443 r = settings_load(f, p, &settings);
3444 if (r < 0)
3445 return r;
3446
3447 /* Copy over bits from the settings, unless they have been
3448 * explicitly masked by command line switches. */
3449
7732f92b
LP
3450 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3451 settings->start_mode >= 0) {
3452 arg_start_mode = settings->start_mode;
f757855e
LP
3453
3454 strv_free(arg_parameters);
3455 arg_parameters = settings->parameters;
3456 settings->parameters = NULL;
3457 }
3458
5f932eb9
LP
3459 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3460 settings->working_directory) {
3461 free(arg_chdir);
3462 arg_chdir = settings->working_directory;
3463 settings->working_directory = NULL;
3464 }
3465
f757855e
LP
3466 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3467 settings->environment) {
3468 strv_free(arg_setenv);
3469 arg_setenv = settings->environment;
3470 settings->environment = NULL;
3471 }
3472
3473 if ((arg_settings_mask & SETTING_USER) == 0 &&
3474 settings->user) {
3475 free(arg_user);
3476 arg_user = settings->user;
3477 settings->user = NULL;
3478 }
3479
3480 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3481 uint64_t plus;
f757855e 3482
0e265674
LP
3483 plus = settings->capability;
3484 if (settings_private_network(settings))
3485 plus |= (1ULL << CAP_NET_ADMIN);
3486
3487 if (!arg_settings_trusted && plus != 0) {
3488 if (settings->capability != 0)
3489 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3490 } else
520e0d54 3491 arg_caps_retain |= plus;
f757855e 3492
520e0d54 3493 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3494 }
3495
3496 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3497 settings->kill_signal > 0)
3498 arg_kill_signal = settings->kill_signal;
3499
3500 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3501 settings->personality != PERSONALITY_INVALID)
3502 arg_personality = settings->personality;
3503
3504 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3505 !sd_id128_is_null(settings->machine_id)) {
3506
3507 if (!arg_settings_trusted)
3508 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3509 else
3510 arg_uuid = settings->machine_id;
3511 }
3512
3513 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3514 settings->read_only >= 0)
3515 arg_read_only = settings->read_only;
3516
3517 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3518 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3519 arg_volatile_mode = settings->volatile_mode;
3520
3521 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3522 settings->n_custom_mounts > 0) {
3523
3524 if (!arg_settings_trusted)
3525 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3526 else {
3527 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3528 arg_custom_mounts = settings->custom_mounts;
3529 arg_n_custom_mounts = settings->n_custom_mounts;
3530
3531 settings->custom_mounts = NULL;
3532 settings->n_custom_mounts = 0;
3533 }
3534 }
3535
3536 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3537 (settings->private_network >= 0 ||
3538 settings->network_veth >= 0 ||
3539 settings->network_bridge ||
22b28dfd 3540 settings->network_zone ||
f757855e
LP
3541 settings->network_interfaces ||
3542 settings->network_macvlan ||
f6d6bad1
LP
3543 settings->network_ipvlan ||
3544 settings->network_veth_extra)) {
f757855e
LP
3545
3546 if (!arg_settings_trusted)
3547 log_warning("Ignoring network settings, file %s is not trusted.", p);
3548 else {
f6d6bad1 3549 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3550 arg_private_network = settings_private_network(settings);
3551
f757855e
LP
3552 strv_free(arg_network_interfaces);
3553 arg_network_interfaces = settings->network_interfaces;
3554 settings->network_interfaces = NULL;
3555
3556 strv_free(arg_network_macvlan);
3557 arg_network_macvlan = settings->network_macvlan;
3558 settings->network_macvlan = NULL;
3559
3560 strv_free(arg_network_ipvlan);
3561 arg_network_ipvlan = settings->network_ipvlan;
3562 settings->network_ipvlan = NULL;
3563
f6d6bad1
LP
3564 strv_free(arg_network_veth_extra);
3565 arg_network_veth_extra = settings->network_veth_extra;
3566 settings->network_veth_extra = NULL;
3567
f757855e
LP
3568 free(arg_network_bridge);
3569 arg_network_bridge = settings->network_bridge;
3570 settings->network_bridge = NULL;
22b28dfd
LP
3571
3572 free(arg_network_zone);
3573 arg_network_zone = settings->network_zone;
3574 settings->network_zone = NULL;
f757855e
LP
3575 }
3576 }
3577
3578 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3579 settings->expose_ports) {
3580
3581 if (!arg_settings_trusted)
3582 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3583 else {
3584 expose_port_free_all(arg_expose_ports);
3585 arg_expose_ports = settings->expose_ports;
3586 settings->expose_ports = NULL;
3587 }
3588 }
3589
0de7acce
LP
3590 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3591 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3592
3593 if (!arg_settings_trusted)
3594 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3595 else {
3596 arg_userns_mode = settings->userns_mode;
3597 arg_uid_shift = settings->uid_shift;
3598 arg_uid_range = settings->uid_range;
3599 arg_userns_chown = settings->userns_chown;
3600 }
3601 }
3602
9c1e04d0
AP
3603 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3604 arg_notify_ready = settings->notify_ready;
3605
f757855e
LP
3606 return 0;
3607}
3608
b0067625
ZJS
3609static int run(int master,
3610 const char* console,
3611 const char *root_device, bool root_device_rw,
3612 const char *home_device, bool home_device_rw,
3613 const char *srv_device, bool srv_device_rw,
3614 const char *esp_device,
3615 bool interactive,
3616 bool secondary,
3617 FDSet *fds,
3618 char veth_name[IFNAMSIZ], bool *veth_created,
3619 union in_addr_union *exposed,
3620 pid_t *pid, int *ret) {
3621
3622 static const struct sigaction sa = {
3623 .sa_handler = nop_signal_handler,
3624 .sa_flags = SA_NOCLDSTOP,
3625 };
3626
3627 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3628 _cleanup_close_ int etc_passwd_lock = -1;
3629 _cleanup_close_pair_ int
3630 kmsg_socket_pair[2] = { -1, -1 },
3631 rtnl_socket_pair[2] = { -1, -1 },
3632 pid_socket_pair[2] = { -1, -1 },
3633 uuid_socket_pair[2] = { -1, -1 },
3634 notify_socket_pair[2] = { -1, -1 },
3635 uid_shift_socket_pair[2] = { -1, -1 };
3636 _cleanup_close_ int notify_socket= -1;
3637 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3638 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3639 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3640 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3641 ContainerStatus container_status = 0;
3642 char last_char = 0;
3643 int ifi = 0, r;
3644 ssize_t l;
3645 sigset_t mask_chld;
3646
3647 assert_se(sigemptyset(&mask_chld) == 0);
3648 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3649
3650 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3651 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3652 * check with getpwuid() if the specific user already exists. Note that /etc might be
3653 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3654 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3655 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3656 * really ours. */
3657
3658 etc_passwd_lock = take_etc_passwd_lock(NULL);
3659 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3660 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3661 }
3662
3663 r = barrier_create(&barrier);
3664 if (r < 0)
3665 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3666
3667 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3668 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3669
3670 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3671 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3672
3673 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3674 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3675
3676 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3677 return log_error_errno(errno, "Failed to create id socket pair: %m");
3678
3679 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3680 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3681
3682 if (arg_userns_mode != USER_NAMESPACE_NO)
3683 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3684 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3685
3686 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3687 * parent's blocking calls and give it a chance to call wait() and terminate. */
3688 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3689 if (r < 0)
3690 return log_error_errno(errno, "Failed to change the signal mask: %m");
3691
3692 r = sigaction(SIGCHLD, &sa, NULL);
3693 if (r < 0)
3694 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3695
3696 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3697 if (*pid < 0)
3698 return log_error_errno(errno, "clone() failed%s: %m",
3699 errno == EINVAL ?
3700 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3701
3702 if (*pid == 0) {
3703 /* The outer child only has a file system namespace. */
3704 barrier_set_role(&barrier, BARRIER_CHILD);
3705
3706 master = safe_close(master);
3707
3708 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3709 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3710 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3711 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3712 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3713 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3714
3715 (void) reset_all_signal_handlers();
3716 (void) reset_signal_mask();
3717
3718 r = outer_child(&barrier,
3719 arg_directory,
3720 console,
3721 root_device, root_device_rw,
3722 home_device, home_device_rw,
3723 srv_device, srv_device_rw,
3724 esp_device,
3725 interactive,
3726 secondary,
3727 pid_socket_pair[1],
3728 uuid_socket_pair[1],
3729 notify_socket_pair[1],
3730 kmsg_socket_pair[1],
3731 rtnl_socket_pair[1],
3732 uid_shift_socket_pair[1],
3733 fds);
3734 if (r < 0)
3735 _exit(EXIT_FAILURE);
3736
3737 _exit(EXIT_SUCCESS);
3738 }
3739
3740 barrier_set_role(&barrier, BARRIER_PARENT);
3741
3742 fds = fdset_free(fds);
3743
3744 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3745 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3746 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3747 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3748 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3749 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3750
3751 if (arg_userns_mode != USER_NAMESPACE_NO) {
3752 /* The child just let us know the UID shift it might have read from the image. */
3753 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3754 if (l < 0)
3755 return log_error_errno(errno, "Failed to read UID shift: %m");
3756
3757 if (l != sizeof arg_uid_shift) {
3758 log_error("Short read while reading UID shift.");
3759 return -EIO;
3760 }
3761
3762 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3763 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3764 * image, but if that's already in use, pick a new one, and report back to the child,
3765 * which one we now picked. */
3766
3767 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3768 if (r < 0)
3769 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3770
3771 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3772 if (l < 0)
3773 return log_error_errno(errno, "Failed to send UID shift: %m");
3774 if (l != sizeof arg_uid_shift) {
3775 log_error("Short write while writing UID shift.");
3776 return -EIO;
3777 }
3778 }
3779 }
3780
3781 /* Wait for the outer child. */
3782 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3783 if (r != 0)
3784 return r < 0 ? r : -EIO;
3785
3786 /* And now retrieve the PID of the inner child. */
3787 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3788 if (l < 0)
3789 return log_error_errno(errno, "Failed to read inner child PID: %m");
3790 if (l != sizeof *pid) {
3791 log_error("Short read while reading inner child PID.");
3792 return -EIO;
3793 }
3794
3795 /* We also retrieve container UUID in case it was generated by outer child */
3796 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3797 if (l < 0)
3798 return log_error_errno(errno, "Failed to read container machine ID: %m");
3799 if (l != sizeof(arg_uuid)) {
3800 log_error("Short read while reading container machined ID.");
3801 return -EIO;
3802 }
3803
3804 /* We also retrieve the socket used for notifications generated by outer child */
3805 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3806 if (notify_socket < 0)
3807 return log_error_errno(notify_socket,
3808 "Failed to receive notification socket from the outer child: %m");
3809
3810 log_debug("Init process invoked as PID "PID_FMT, *pid);
3811
3812 if (arg_userns_mode != USER_NAMESPACE_NO) {
3813 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3814 log_error("Child died too early.");
3815 return -ESRCH;
3816 }
3817
3818 r = setup_uid_map(*pid);
3819 if (r < 0)
3820 return r;
3821
3822 (void) barrier_place(&barrier); /* #2 */
3823 }
3824
3825 if (arg_private_network) {
3826
3827 r = move_network_interfaces(*pid, arg_network_interfaces);
3828 if (r < 0)
3829 return r;
3830
3831 if (arg_network_veth) {
3832 r = setup_veth(arg_machine, *pid, veth_name,
3833 arg_network_bridge || arg_network_zone);
3834 if (r < 0)
3835 return r;
3836 else if (r > 0)
3837 ifi = r;
3838
3839 if (arg_network_bridge) {
3840 /* Add the interface to a bridge */
3841 r = setup_bridge(veth_name, arg_network_bridge, false);
3842 if (r < 0)
3843 return r;
3844 if (r > 0)
3845 ifi = r;
3846 } else if (arg_network_zone) {
3847 /* Add the interface to a bridge, possibly creating it */
3848 r = setup_bridge(veth_name, arg_network_zone, true);
3849 if (r < 0)
3850 return r;
3851 if (r > 0)
3852 ifi = r;
3853 }
3854 }
3855
3856 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3857 if (r < 0)
3858 return r;
3859
3860 /* We created the primary and extra veth links now; let's remember this, so that we know to
3861 remove them later on. Note that we don't bother with removing veth links that were created
3862 here when their setup failed half-way, because in that case the kernel should be able to
3863 remove them on its own, since they cannot be referenced by anything yet. */
3864 *veth_created = true;
3865
3866 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3867 if (r < 0)
3868 return r;
3869
3870 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3871 if (r < 0)
3872 return r;
3873 }
3874
3875 if (arg_register) {
3876 r = register_machine(
3877 arg_machine,
3878 *pid,
3879 arg_directory,
3880 arg_uuid,
3881 ifi,
3882 arg_slice,
3883 arg_custom_mounts, arg_n_custom_mounts,
3884 arg_kill_signal,
3885 arg_property,
3886 arg_keep_unit,
3887 arg_container_service_name);
3888 if (r < 0)
3889 return r;
3890 }
3891
f0bef277 3892 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3893 if (r < 0)
3894 return r;
3895
3896 if (arg_keep_unit) {
3897 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3898 if (r < 0)
3899 return r;
3900 }
3901
3902 r = chown_cgroup(*pid, arg_uid_shift);
3903 if (r < 0)
3904 return r;
3905
3906 /* Notify the child that the parent is ready with all
3907 * its setup (including cgroup-ification), and that
3908 * the child can now hand over control to the code to
3909 * run inside the container. */
3910 (void) barrier_place(&barrier); /* #3 */
3911
3912 /* Block SIGCHLD here, before notifying child.
3913 * process_pty() will handle it with the other signals. */
3914 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3915
3916 /* Reset signal to default */
3917 r = default_signals(SIGCHLD, -1);
3918 if (r < 0)
3919 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3920
3921 r = sd_event_new(&event);
3922 if (r < 0)
3923 return log_error_errno(r, "Failed to get default event source: %m");
3924
3925 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
3926 if (r < 0)
3927 return r;
3928
3929 /* Let the child know that we are ready and wait that the child is completely ready now. */
3930 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3931 log_error("Child died too early.");
3932 return -ESRCH;
3933 }
3934
3935 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3936 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3937 etc_passwd_lock = safe_close(etc_passwd_lock);
3938
3939 sd_notifyf(false,
3940 "STATUS=Container running.\n"
3941 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3942 if (!arg_notify_ready)
3943 sd_notify(false, "READY=1\n");
3944
3945 if (arg_kill_signal > 0) {
3946 /* Try to kill the init system on SIGINT or SIGTERM */
3947 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3948 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3949 } else {
3950 /* Immediately exit */
3951 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3952 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3953 }
3954
3955 /* simply exit on sigchld */
3956 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3957
3958 if (arg_expose_ports) {
3959 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3960 if (r < 0)
3961 return r;
3962
3963 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3964 }
3965
3966 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3967
3968 r = pty_forward_new(event, master,
3969 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3970 &forward);
3971 if (r < 0)
3972 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3973
3974 r = sd_event_loop(event);
3975 if (r < 0)
3976 return log_error_errno(r, "Failed to run event loop: %m");
3977
3978 pty_forward_get_last_char(forward, &last_char);
3979
3980 forward = pty_forward_free(forward);
3981
3982 if (!arg_quiet && last_char != '\n')
3983 putc('\n', stdout);
3984
3985 /* Kill if it is not dead yet anyway */
3986 if (arg_register && !arg_keep_unit)
3987 terminate_machine(*pid);
3988
3989 /* Normally redundant, but better safe than sorry */
3990 kill(*pid, SIGKILL);
3991
3992 r = wait_for_container(*pid, &container_status);
3993 *pid = 0;
3994
3995 if (r < 0)
3996 /* We failed to wait for the container, or the container exited abnormally. */
3997 return r;
3998 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3999 /* r > 0 → The container exited with a non-zero status.
4000 * As a special case, we need to replace 133 with a different value,
4001 * because 133 is special-cased in the service file to reboot the container.
4002 * otherwise → The container exited with zero status and a reboot was not requested.
4003 */
4004 if (r == 133)
4005 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 4006 *ret = r;
b0067625
ZJS
4007 return 0; /* finito */
4008 }
4009
4010 /* CONTAINER_REBOOTED, loop again */
4011
4012 if (arg_keep_unit) {
4013 /* Special handling if we are running as a service: instead of simply
4014 * restarting the machine we want to restart the entire service, so let's
4015 * inform systemd about this with the special exit code 133. The service
4016 * file uses RestartForceExitStatus=133 so that this results in a full
4017 * nspawn restart. This is necessary since we might have cgroup parameters
4018 * set we want to have flushed out. */
4019 *ret = 0;
4020 return 133;
4021 }
4022
4023 expose_port_flush(arg_expose_ports, exposed);
4024
4025 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4026 *veth_created = false;
4027 return 1; /* loop again */
4028}
4029
03cfe0d5
LP
4030int main(int argc, char *argv[]) {
4031
a6bc7db9 4032 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
4033 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4034 _cleanup_close_ int master = -1, image_fd = -1;
4035 _cleanup_fdset_free_ FDSet *fds = NULL;
b0067625 4036 int r, n_fd_passed, loop_nr = -1, ret = EXIT_FAILURE;
5aa3eba5 4037 char veth_name[IFNAMSIZ] = "";
03cfe0d5 4038 bool secondary = false, remove_subvol = false;
03cfe0d5 4039 pid_t pid = 0;
03cfe0d5
LP
4040 union in_addr_union exposed = {};
4041 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
7513c5b8 4042 bool interactive, veth_created = false;
03cfe0d5
LP
4043
4044 log_parse_environment();
4045 log_open();
4046
7732f92b
LP
4047 /* Make sure rename_process() in the stub init process can work */
4048 saved_argv = argv;
4049 saved_argc = argc;
4050
03cfe0d5
LP
4051 r = parse_argv(argc, argv);
4052 if (r <= 0)
4053 goto finish;
4054
03cfe0d5
LP
4055 if (geteuid() != 0) {
4056 log_error("Need to be root.");
4057 r = -EPERM;
4058 goto finish;
4059 }
f757855e
LP
4060 r = determine_names();
4061 if (r < 0)
4062 goto finish;
4063
4064 r = load_settings();
4065 if (r < 0)
4066 goto finish;
4067
4068 r = verify_arguments();
4069 if (r < 0)
4070 goto finish;
03cfe0d5
LP
4071
4072 n_fd_passed = sd_listen_fds(false);
4073 if (n_fd_passed > 0) {
4074 r = fdset_new_listen_fds(&fds, false);
4075 if (r < 0) {
4076 log_error_errno(r, "Failed to collect file descriptors: %m");
4077 goto finish;
4078 }
4079 }
4080
4081 if (arg_directory) {
4082 assert(!arg_image);
4083
4084 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4085 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4086 r = -EINVAL;
4087 goto finish;
4088 }
4089
4090 if (arg_ephemeral) {
4091 _cleanup_free_ char *np = NULL;
4092
4093 /* If the specified path is a mount point we
4094 * generate the new snapshot immediately
4095 * inside it under a random name. However if
4096 * the specified is not a mount point we
4097 * create the new snapshot in the parent
4098 * directory, just next to it. */
e26d6ce5 4099 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4100 if (r < 0) {
4101 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4102 goto finish;
4103 }
4104 if (r > 0)
770b5ce4 4105 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4106 else
770b5ce4 4107 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4108 if (r < 0) {
4109 log_error_errno(r, "Failed to generate name for snapshot: %m");
4110 goto finish;
4111 }
4112
4113 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4114 if (r < 0) {
4115 log_error_errno(r, "Failed to lock %s: %m", np);
4116 goto finish;
4117 }
4118
5bcd08db 4119 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4120 if (r < 0) {
4121 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4122 goto finish;
ec16945e
LP
4123 }
4124
4125 free(arg_directory);
4126 arg_directory = np;
8a16a7b4 4127 np = NULL;
ec16945e
LP
4128
4129 remove_subvol = true;
30535c16
LP
4130
4131 } else {
4132 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4133 if (r == -EBUSY) {
4134 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4135 goto finish;
4136 }
4137 if (r < 0) {
4138 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4139 goto finish;
30535c16
LP
4140 }
4141
4142 if (arg_template) {
5bcd08db 4143 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4144 if (r == -EEXIST) {
4145 if (!arg_quiet)
4146 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4147 } else if (r < 0) {
83521414 4148 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4149 goto finish;
4150 } else {
4151 if (!arg_quiet)
4152 log_info("Populated %s from template %s.", arg_directory, arg_template);
4153 }
4154 }
ec16945e
LP
4155 }
4156
7732f92b 4157 if (arg_start_mode == START_BOOT) {
1b9e5b12 4158 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4159 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4160 r = -EINVAL;
1b9e5b12
LP
4161 goto finish;
4162 }
4163 } else {
4164 const char *p;
4165
16fb773e
LP
4166 p = strjoina(arg_directory, "/usr/");
4167 if (laccess(p, F_OK) < 0) {
4168 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 4169 r = -EINVAL;
1b9e5b12 4170 goto finish;
1b9e5b12
LP
4171 }
4172 }
ec16945e 4173
6b9132a9 4174 } else {
1b9e5b12 4175 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4176
ec16945e
LP
4177 assert(arg_image);
4178 assert(!arg_template);
4179
30535c16
LP
4180 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4181 if (r == -EBUSY) {
4182 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4183 goto finish;
4184 }
4185 if (r < 0) {
4186 r = log_error_errno(r, "Failed to create image lock: %m");
4187 goto finish;
4188 }
4189
1b9e5b12 4190 if (!mkdtemp(template)) {
56f64d95 4191 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4192 r = -errno;
6b9132a9 4193 goto finish;
1b9e5b12 4194 }
6b9132a9 4195
1b9e5b12
LP
4196 arg_directory = strdup(template);
4197 if (!arg_directory) {
4198 r = log_oom();
4199 goto finish;
6b9132a9 4200 }
88213476 4201
1b9e5b12
LP
4202 image_fd = setup_image(&device_path, &loop_nr);
4203 if (image_fd < 0) {
4204 r = image_fd;
842f3b0f
LP
4205 goto finish;
4206 }
1b9e5b12 4207
4d9f07b4
LP
4208 r = dissect_image(image_fd,
4209 &root_device, &root_device_rw,
4210 &home_device, &home_device_rw,
4211 &srv_device, &srv_device_rw,
a6bc7db9 4212 &esp_device,
4d9f07b4 4213 &secondary);
1b9e5b12
LP
4214 if (r < 0)
4215 goto finish;
842f3b0f 4216 }
842f3b0f 4217
5a8af538
LP
4218 r = custom_mounts_prepare();
4219 if (r < 0)
4220 goto finish;
4221
03cfe0d5
LP
4222 interactive =
4223 isatty(STDIN_FILENO) > 0 &&
4224 isatty(STDOUT_FILENO) > 0;
9c857b9d 4225
db7feb7e
LP
4226 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4227 if (master < 0) {
ec16945e 4228 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4229 goto finish;
4230 }
4231
611b312b
LP
4232 r = ptsname_malloc(master, &console);
4233 if (r < 0) {
4234 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4235 goto finish;
68b02049
DW
4236 }
4237
4238 if (arg_selinux_apifs_context) {
4239 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4240 if (r < 0)
4241 goto finish;
a258bf26
LP
4242 }
4243
a258bf26 4244 if (unlockpt(master) < 0) {
ec16945e 4245 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4246 goto finish;
4247 }
4248
9c857b9d
LP
4249 if (!arg_quiet)
4250 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4251 arg_machine, arg_image ?: arg_directory);
4252
72c0a2c2 4253 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4254
03cfe0d5
LP
4255 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4256 r = log_error_errno(errno, "Failed to become subreaper: %m");
4257 goto finish;
4258 }
4259
d87be9b0 4260 for (;;) {
b0067625
ZJS
4261 r = run(master,
4262 console,
4263 root_device, root_device_rw,
4264 home_device, home_device_rw,
4265 srv_device, srv_device_rw,
4266 esp_device,
4267 interactive, secondary,
4268 fds,
4269 veth_name, &veth_created,
4270 &exposed,
4271 &pid, &ret);
4272 if (r <= 0)
d87be9b0 4273 break;
d87be9b0 4274 }
88213476
LP
4275
4276finish:
af4ec430
LP
4277 sd_notify(false,
4278 "STOPPING=1\n"
4279 "STATUS=Terminating...");
4280
9444b1f2
LP
4281 if (pid > 0)
4282 kill(pid, SIGKILL);
88213476 4283
503546da
LP
4284 /* Try to flush whatever is still queued in the pty */
4285 if (master >= 0)
59f448cf 4286 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 4287
03cfe0d5
LP
4288 loop_remove(loop_nr, &image_fd);
4289
ec16945e
LP
4290 if (remove_subvol && arg_directory) {
4291 int k;
4292
5bcd08db 4293 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
4294 if (k < 0)
4295 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4296 }
4297
785890ac
LP
4298 if (arg_machine) {
4299 const char *p;
4300
63c372cb 4301 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4302 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4303 }
4304
7a8f6325 4305 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4306
4307 if (veth_created)
4308 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4309 (void) remove_bridge(arg_network_zone);
f757855e 4310
04d391da 4311 free(arg_directory);
ec16945e
LP
4312 free(arg_template);
4313 free(arg_image);
7027ff61 4314 free(arg_machine);
c74e630d 4315 free(arg_user);
5f932eb9 4316 free(arg_chdir);
c74e630d 4317 strv_free(arg_setenv);
f757855e 4318 free(arg_network_bridge);
c74e630d
LP
4319 strv_free(arg_network_interfaces);
4320 strv_free(arg_network_macvlan);
4bbfe7ad 4321 strv_free(arg_network_ipvlan);
f6d6bad1 4322 strv_free(arg_network_veth_extra);
f757855e
LP
4323 strv_free(arg_parameters);
4324 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4325 expose_port_free_all(arg_expose_ports);
6d0b55c2 4326
ec16945e 4327 return r < 0 ? EXIT_FAILURE : ret;
88213476 4328}