]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
systemctl: tweak the "systemctl list-units" output a bit
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
8fe0087e 60#include "formats-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
 * may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))

/* nspawn listens on a socket at the path given by NSPAWN_NOTIFY_SOCKET_PATH. The path is relative to the container.
 * The init process inside the container can send messages to nspawn through it, following the sd_notify(3)
 * protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
113cea80
DH
/* Final state of a container run. NOTE(review): the users of this enum are outside the visible part of the
 * file; judging by the names it distinguishes a plain termination from a reboot request -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
118
57fb9fb5
LP
/* Journal linking mode selected with --link-journal=MODE (or -j). The "try-" variants of the option map to
 * LINK_GUEST/LINK_HOST too and are told apart by a separate boolean flag. */
typedef enum LinkJournal {
        LINK_NO,    /* --link-journal=no */
        LINK_AUTO,  /* --link-journal=auto, the default */
        LINK_HOST,  /* --link-journal=host or try-host */
        LINK_GUEST  /* --link-journal=guest or try-guest, also -j */
} LinkJournal;
88213476
LP
125
126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
687d0825 129static char *arg_user = NULL;
9444b1f2 130static sd_id128_t arg_uuid = {};
7027ff61 131static char *arg_machine = NULL;
c74e630d
LP
132static const char *arg_selinux_context = NULL;
133static const char *arg_selinux_apifs_context = NULL;
9444b1f2 134static const char *arg_slice = NULL;
ff01d048 135static bool arg_private_network = false;
bc2f673e 136static bool arg_read_only = false;
7732f92b 137static StartMode arg_start_mode = START_PID1;
ec16945e 138static bool arg_ephemeral = false;
57fb9fb5 139static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 140static bool arg_link_journal_try = false;
520e0d54 141static uint64_t arg_caps_retain =
50b52222
LP
142 (1ULL << CAP_AUDIT_CONTROL) |
143 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
144 (1ULL << CAP_CHOWN) |
145 (1ULL << CAP_DAC_OVERRIDE) |
146 (1ULL << CAP_DAC_READ_SEARCH) |
147 (1ULL << CAP_FOWNER) |
148 (1ULL << CAP_FSETID) |
149 (1ULL << CAP_IPC_OWNER) |
150 (1ULL << CAP_KILL) |
151 (1ULL << CAP_LEASE) |
152 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 153 (1ULL << CAP_MKNOD) |
5076f0cc
LP
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
5076f0cc 157 (1ULL << CAP_SETFCAP) |
50b52222 158 (1ULL << CAP_SETGID) |
5076f0cc
LP
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
50b52222 162 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
163 (1ULL << CAP_SYS_CHROOT) |
164 (1ULL << CAP_SYS_NICE) |
165 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 166 (1ULL << CAP_SYS_RESOURCE) |
50b52222 167 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
168static CustomMount *arg_custom_mounts = NULL;
169static unsigned arg_n_custom_mounts = 0;
f4889f65 170static char **arg_setenv = NULL;
284c0b91 171static bool arg_quiet = false;
eb91eb18 172static bool arg_register = true;
89f7c846 173static bool arg_keep_unit = false;
aa28aefe 174static char **arg_network_interfaces = NULL;
c74e630d 175static char **arg_network_macvlan = NULL;
4bbfe7ad 176static char **arg_network_ipvlan = NULL;
69c79d3c 177static bool arg_network_veth = false;
f6d6bad1 178static char **arg_network_veth_extra = NULL;
f757855e 179static char *arg_network_bridge = NULL;
22b28dfd 180static char *arg_network_zone = NULL;
050f7277 181static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 182static char *arg_image = NULL;
f757855e 183static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 184static ExposePort *arg_expose_ports = NULL;
f36933fe 185static char **arg_property = NULL;
0de7acce 186static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 187static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 188static bool arg_userns_chown = false;
c6c8f6e2 189static int arg_kill_signal = 0;
5da38d07 190static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
f757855e
LP
191static SettingsMask arg_settings_mask = 0;
192static int arg_settings_trusted = -1;
193static char **arg_parameters = NULL;
6aadfa4c 194static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 195static bool arg_notify_ready = false;
5a8ff0e6 196static bool arg_use_cgns = true;
0c582db0 197static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
88213476 198
601185b4 199static void help(void) {
88213476
LP
200 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
201 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
202 " -h --help Show this help\n"
203 " --version Print version string\n"
69c79d3c 204 " -q --quiet Do not show status information\n"
1b9e5b12 205 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
206 " --template=PATH Initialize root directory from template directory,\n"
207 " if missing\n"
208 " -x --ephemeral Run container with snapshot of root directory, and\n"
209 " remove it after exit\n"
210 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 211 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 212 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 213 " --chdir=PATH Set working directory in the container\n"
a8828ed9 214 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 215 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 216 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 217 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 218 " --property=NAME=VALUE Set scope unit property\n"
90b4a64d 219 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
03cfe0d5 220 " --private-users[=UIDBASE[:NUIDS]]\n"
90b4a64d 221 " Similar, but with user configured UID/GID range\n"
24597ee0 222 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
69c79d3c
LP
223 " --private-network Disable network in container\n"
224 " --network-interface=INTERFACE\n"
225 " Assign an existing network interface to the\n"
226 " container\n"
c74e630d
LP
227 " --network-macvlan=INTERFACE\n"
228 " Create a macvlan network interface based on an\n"
229 " existing network interface to the container\n"
4bbfe7ad
TG
230 " --network-ipvlan=INTERFACE\n"
231 " Create a ipvlan network interface based on an\n"
232 " existing network interface to the container\n"
a8eaaee7 233 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 234 " and container\n"
f6d6bad1
LP
235 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
236 " Add an additional virtual Ethernet link between\n"
237 " host and container\n"
ab046dde 238 " --network-bridge=INTERFACE\n"
90b4a64d
ZJS
239 " Add a virtual Ethernet connection to the container\n"
240 " and attach it to an existing bridge on the host\n"
241 " --network-zone=NAME Similar, but attach the new interface to an\n"
242 " an automatically managed bridge interface\n"
6d0b55c2 243 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 244 " Expose a container IP port on the host\n"
82adf6af
LP
245 " -Z --selinux-context=SECLABEL\n"
246 " Set the SELinux security context to be used by\n"
247 " processes in the container\n"
248 " -L --selinux-apifs-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " API/tmpfs file systems in the container\n"
a8828ed9
DW
251 " --capability=CAP In addition to the default, retain specified\n"
252 " capability\n"
253 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 254 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
255 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
256 " host, try-guest, try-host\n"
574edc90 257 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 258 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
259 " --bind=PATH[:PATH[:OPTIONS]]\n"
260 " Bind mount a file or directory from the host into\n"
a8828ed9 261 " the container\n"
5e5bfa6e
EY
262 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
263 " Similar, but creates a read-only bind mount\n"
06c17c39 264 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
265 " --overlay=PATH[:PATH...]:PATH\n"
266 " Create an overlay mount from the host to \n"
267 " the container\n"
268 " --overlay-ro=PATH[:PATH...]:PATH\n"
269 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 270 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
eb91eb18 271 " --register=BOOLEAN Register container as machine\n"
89f7c846 272 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 273 " the service unit nspawn is running in\n"
6d0b55c2 274 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 275 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
90b4a64d 276 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
6d0b55c2 277 , program_invocation_short_name);
88213476
LP
278}
279
5a8af538
LP
280static int custom_mounts_prepare(void) {
281 unsigned i;
282 int r;
283
284 /* Ensure the mounts are applied prefix first. */
285 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
286
287 /* Allocate working directories for the overlay file systems that need it */
288 for (i = 0; i < arg_n_custom_mounts; i++) {
289 CustomMount *m = &arg_custom_mounts[i];
290
0de7acce 291 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
292
293 if (arg_userns_chown) {
294 log_error("--private-users-chown may not be combined with custom root mounts.");
295 return -EINVAL;
296 } else if (arg_uid_shift == UID_INVALID) {
297 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
298 return -EINVAL;
299 }
825d5287
RM
300 }
301
5a8af538
LP
302 if (m->type != CUSTOM_MOUNT_OVERLAY)
303 continue;
304
305 if (m->work_dir)
306 continue;
307
308 if (m->read_only)
309 continue;
310
14bcf25c 311 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
312 if (r < 0)
313 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
314 }
315
316 return 0;
317}
318
0fd9563f 319static int detect_unified_cgroup_hierarchy(const char *directory) {
efdb0237 320 const char *e;
5da38d07
TH
321 int r, all_unified, systemd_unified;
322
efdb0237
LP
323 /* Allow the user to control whether the unified hierarchy is used */
324 e = getenv("UNIFIED_CGROUP_HIERARCHY");
325 if (e) {
326 r = parse_boolean(e);
327 if (r < 0)
328 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
5da38d07
TH
329 if (r > 0)
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
331 else
332 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 333
efdb0237
LP
334 return 0;
335 }
336
98afd6af
ZJS
337 all_unified = cg_all_unified();
338 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
339
340 if (all_unified < 0 || systemd_unified < 0)
341 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
342 "Failed to determine whether the unified cgroups hierarchy is used: %m");
343
efdb0237 344 /* Otherwise inherit the default from the host system */
a8725a06
ZJS
345 if (all_unified > 0) {
346 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
347 * routine only detects 231, so we'll have a false negative here for 230. */
348 r = systemd_installation_has_version(directory, 230);
349 if (r < 0)
350 return log_error_errno(r, "Failed to determine systemd version in container: %m");
351 if (r > 0)
352 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
353 else
354 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
355 } else if (systemd_unified > 0) {
356 /* Mixed cgroup hierarchy support was added in 232 */
0fd9563f
ZJS
357 r = systemd_installation_has_version(directory, 232);
358 if (r < 0)
359 return log_error_errno(r, "Failed to determine systemd version in container: %m");
360 if (r > 0)
361 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
362 else
363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
364 } else
5da38d07 365 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
efdb0237 366
efdb0237
LP
367 return 0;
368}
369
0c582db0
LB
370static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
371 int r;
372
373 r = getenv_bool(name);
374 if (r == -ENXIO)
375 return;
376 if (r < 0)
377 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
378 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
379}
380
88213476
LP
381static int parse_argv(int argc, char *argv[]) {
382
a41fe3a2 383 enum {
acbeb427
ZJS
384 ARG_VERSION = 0x100,
385 ARG_PRIVATE_NETWORK,
bc2f673e 386 ARG_UUID,
5076f0cc 387 ARG_READ_ONLY,
57fb9fb5 388 ARG_CAPABILITY,
420c7379 389 ARG_DROP_CAPABILITY,
17fe0523
LP
390 ARG_LINK_JOURNAL,
391 ARG_BIND,
f4889f65 392 ARG_BIND_RO,
06c17c39 393 ARG_TMPFS,
5a8af538
LP
394 ARG_OVERLAY,
395 ARG_OVERLAY_RO,
eb91eb18 396 ARG_SHARE_SYSTEM,
89f7c846 397 ARG_REGISTER,
aa28aefe 398 ARG_KEEP_UNIT,
69c79d3c 399 ARG_NETWORK_INTERFACE,
c74e630d 400 ARG_NETWORK_MACVLAN,
4bbfe7ad 401 ARG_NETWORK_IPVLAN,
ab046dde 402 ARG_NETWORK_BRIDGE,
22b28dfd 403 ARG_NETWORK_ZONE,
f6d6bad1 404 ARG_NETWORK_VETH_EXTRA,
6afc95b7 405 ARG_PERSONALITY,
4d9f07b4 406 ARG_VOLATILE,
ec16945e 407 ARG_TEMPLATE,
f36933fe 408 ARG_PROPERTY,
6dac160c 409 ARG_PRIVATE_USERS,
c6c8f6e2 410 ARG_KILL_SIGNAL,
f757855e 411 ARG_SETTINGS,
5f932eb9 412 ARG_CHDIR,
7336138e 413 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 414 ARG_NOTIFY_READY,
a41fe3a2
LP
415 };
416
88213476 417 static const struct option options[] = {
27eb8e90
ZJS
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "as-pid2", no_argument, NULL, 'a' },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, 'E' },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
449 { "network-veth", no_argument, NULL, 'n' },
450 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
451 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
452 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
453 { "personality", required_argument, NULL, ARG_PERSONALITY },
454 { "image", required_argument, NULL, 'i' },
455 { "volatile", optional_argument, NULL, ARG_VOLATILE },
456 { "port", required_argument, NULL, 'p' },
457 { "property", required_argument, NULL, ARG_PROPERTY },
458 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
459 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
460 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
461 { "settings", required_argument, NULL, ARG_SETTINGS },
462 { "chdir", required_argument, NULL, ARG_CHDIR },
463 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 464 {}
88213476
LP
465 };
466
9444b1f2 467 int c, r;
6aadfa4c 468 const char *p, *e;
a42c8b54 469 uint64_t plus = 0, minus = 0;
f757855e 470 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
471
472 assert(argc >= 0);
473 assert(argv);
474
19aac838 475 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
476
477 switch (c) {
478
479 case 'h':
601185b4
ZJS
480 help();
481 return 0;
88213476 482
acbeb427 483 case ARG_VERSION:
3f6fd1ba 484 return version();
acbeb427 485
88213476 486 case 'D':
0f03c2a4 487 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 488 if (r < 0)
0f03c2a4 489 return r;
ec16945e
LP
490 break;
491
492 case ARG_TEMPLATE:
0f03c2a4 493 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 494 if (r < 0)
0f03c2a4 495 return r;
88213476
LP
496 break;
497
1b9e5b12 498 case 'i':
0f03c2a4 499 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 500 if (r < 0)
0f03c2a4 501 return r;
ec16945e
LP
502 break;
503
504 case 'x':
505 arg_ephemeral = true;
1b9e5b12
LP
506 break;
507
687d0825 508 case 'u':
2fc09a9c
DM
509 r = free_and_strdup(&arg_user, optarg);
510 if (r < 0)
7027ff61 511 return log_oom();
687d0825 512
f757855e 513 arg_settings_mask |= SETTING_USER;
687d0825
MV
514 break;
515
22b28dfd
LP
516 case ARG_NETWORK_ZONE: {
517 char *j;
518
519 j = strappend("vz-", optarg);
520 if (!j)
521 return log_oom();
522
523 if (!ifname_valid(j)) {
524 log_error("Network zone name not valid: %s", j);
525 free(j);
526 return -EINVAL;
527 }
528
529 free(arg_network_zone);
530 arg_network_zone = j;
531
532 arg_network_veth = true;
533 arg_private_network = true;
534 arg_settings_mask |= SETTING_NETWORK;
535 break;
536 }
537
ab046dde 538 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
539
540 if (!ifname_valid(optarg)) {
541 log_error("Bridge interface name not valid: %s", optarg);
542 return -EINVAL;
543 }
544
f757855e
LP
545 r = free_and_strdup(&arg_network_bridge, optarg);
546 if (r < 0)
547 return log_oom();
ab046dde
TG
548
549 /* fall through */
550
0dfaa006 551 case 'n':
69c79d3c
LP
552 arg_network_veth = true;
553 arg_private_network = true;
f757855e 554 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
555 break;
556
f6d6bad1
LP
557 case ARG_NETWORK_VETH_EXTRA:
558 r = veth_extra_parse(&arg_network_veth_extra, optarg);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
561
562 arg_private_network = true;
563 arg_settings_mask |= SETTING_NETWORK;
564 break;
565
aa28aefe 566 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
567
568 if (!ifname_valid(optarg)) {
569 log_error("Network interface name not valid: %s", optarg);
570 return -EINVAL;
571 }
572
c74e630d
LP
573 if (strv_extend(&arg_network_interfaces, optarg) < 0)
574 return log_oom();
575
576 arg_private_network = true;
f757855e 577 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
578 break;
579
580 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
581
582 if (!ifname_valid(optarg)) {
583 log_error("MACVLAN network interface name not valid: %s", optarg);
584 return -EINVAL;
585 }
586
c74e630d 587 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
588 return log_oom();
589
4bbfe7ad 590 arg_private_network = true;
f757855e 591 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
592 break;
593
594 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
595
596 if (!ifname_valid(optarg)) {
597 log_error("IPVLAN network interface name not valid: %s", optarg);
598 return -EINVAL;
599 }
600
4bbfe7ad
TG
601 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
602 return log_oom();
603
aa28aefe
LP
604 /* fall through */
605
ff01d048
LP
606 case ARG_PRIVATE_NETWORK:
607 arg_private_network = true;
f757855e 608 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
609 break;
610
0f0dbc46 611 case 'b':
7732f92b
LP
612 if (arg_start_mode == START_PID2) {
613 log_error("--boot and --as-pid2 may not be combined.");
614 return -EINVAL;
615 }
616
617 arg_start_mode = START_BOOT;
618 arg_settings_mask |= SETTING_START_MODE;
619 break;
620
621 case 'a':
622 if (arg_start_mode == START_BOOT) {
623 log_error("--boot and --as-pid2 may not be combined.");
624 return -EINVAL;
625 }
626
627 arg_start_mode = START_PID2;
628 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
629 break;
630
144f0fc0 631 case ARG_UUID:
9444b1f2 632 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
633 if (r < 0)
634 return log_error_errno(r, "Invalid UUID: %s", optarg);
635
636 if (sd_id128_is_null(arg_uuid)) {
637 log_error("Machine UUID may not be all zeroes.");
638 return -EINVAL;
aa96c6cb 639 }
f757855e
LP
640
641 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 642 break;
aa96c6cb 643
9444b1f2 644 case 'S':
c74e630d 645 arg_slice = optarg;
144f0fc0
LP
646 break;
647
7027ff61 648 case 'M':
c1521918 649 if (isempty(optarg))
97b11eed 650 arg_machine = mfree(arg_machine);
c1521918 651 else {
0c3c4284 652 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
653 log_error("Invalid machine name: %s", optarg);
654 return -EINVAL;
655 }
7027ff61 656
0c3c4284
LP
657 r = free_and_strdup(&arg_machine, optarg);
658 if (r < 0)
eb91eb18
LP
659 return log_oom();
660
661 break;
662 }
7027ff61 663
82adf6af
LP
664 case 'Z':
665 arg_selinux_context = optarg;
a8828ed9
DW
666 break;
667
82adf6af
LP
668 case 'L':
669 arg_selinux_apifs_context = optarg;
a8828ed9
DW
670 break;
671
bc2f673e
LP
672 case ARG_READ_ONLY:
673 arg_read_only = true;
f757855e 674 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
675 break;
676
420c7379
LP
677 case ARG_CAPABILITY:
678 case ARG_DROP_CAPABILITY: {
6cbe4ed1 679 p = optarg;
9ed794a3 680 for (;;) {
6cbe4ed1 681 _cleanup_free_ char *t = NULL;
5076f0cc 682
6cbe4ed1
SS
683 r = extract_first_word(&p, &t, ",", 0);
684 if (r < 0)
685 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 686
6cbe4ed1
SS
687 if (r == 0)
688 break;
5076f0cc 689
39ed67d1
LP
690 if (streq(t, "all")) {
691 if (c == ARG_CAPABILITY)
a42c8b54 692 plus = (uint64_t) -1;
39ed67d1 693 else
a42c8b54 694 minus = (uint64_t) -1;
39ed67d1 695 } else {
2822da4f
LP
696 int cap;
697
698 cap = capability_from_name(t);
699 if (cap < 0) {
39ed67d1
LP
700 log_error("Failed to parse capability %s.", t);
701 return -EINVAL;
702 }
703
704 if (c == ARG_CAPABILITY)
a42c8b54 705 plus |= 1ULL << (uint64_t) cap;
39ed67d1 706 else
a42c8b54 707 minus |= 1ULL << (uint64_t) cap;
5076f0cc 708 }
5076f0cc
LP
709 }
710
f757855e 711 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
712 break;
713 }
714
57fb9fb5
LP
715 case 'j':
716 arg_link_journal = LINK_GUEST;
574edc90 717 arg_link_journal_try = true;
57fb9fb5
LP
718 break;
719
720 case ARG_LINK_JOURNAL:
53e438e3 721 if (streq(optarg, "auto")) {
57fb9fb5 722 arg_link_journal = LINK_AUTO;
53e438e3
LP
723 arg_link_journal_try = false;
724 } else if (streq(optarg, "no")) {
57fb9fb5 725 arg_link_journal = LINK_NO;
53e438e3
LP
726 arg_link_journal_try = false;
727 } else if (streq(optarg, "guest")) {
57fb9fb5 728 arg_link_journal = LINK_GUEST;
53e438e3
LP
729 arg_link_journal_try = false;
730 } else if (streq(optarg, "host")) {
57fb9fb5 731 arg_link_journal = LINK_HOST;
53e438e3
LP
732 arg_link_journal_try = false;
733 } else if (streq(optarg, "try-guest")) {
574edc90
MP
734 arg_link_journal = LINK_GUEST;
735 arg_link_journal_try = true;
736 } else if (streq(optarg, "try-host")) {
737 arg_link_journal = LINK_HOST;
738 arg_link_journal_try = true;
739 } else {
57fb9fb5
LP
740 log_error("Failed to parse link journal mode %s", optarg);
741 return -EINVAL;
742 }
743
744 break;
745
17fe0523 746 case ARG_BIND:
f757855e
LP
747 case ARG_BIND_RO:
748 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
749 if (r < 0)
750 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 751
f757855e 752 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 753 break;
06c17c39 754
f757855e
LP
755 case ARG_TMPFS:
756 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
757 if (r < 0)
758 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 759
f757855e 760 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 761 break;
5a8af538
LP
762
763 case ARG_OVERLAY:
764 case ARG_OVERLAY_RO: {
765 _cleanup_free_ char *upper = NULL, *destination = NULL;
766 _cleanup_strv_free_ char **lower = NULL;
767 CustomMount *m;
768 unsigned n = 0;
769 char **i;
770
62f9f39a
RM
771 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
772 if (r == -ENOMEM)
06c17c39 773 return log_oom();
62f9f39a
RM
774 else if (r < 0) {
775 log_error("Invalid overlay specification: %s", optarg);
776 return r;
777 }
06c17c39 778
5a8af538
LP
779 STRV_FOREACH(i, lower) {
780 if (!path_is_absolute(*i)) {
781 log_error("Overlay path %s is not absolute.", *i);
782 return -EINVAL;
783 }
784
785 n++;
786 }
787
788 if (n < 2) {
789 log_error("--overlay= needs at least two colon-separated directories specified.");
790 return -EINVAL;
791 }
792
793 if (n == 2) {
794 /* If two parameters are specified,
795 * the first one is the lower, the
796 * second one the upper directory. And
af86c440
ZJS
797 * we'll also define the destination
798 * mount point the same as the upper. */
5a8af538
LP
799 upper = lower[1];
800 lower[1] = NULL;
801
802 destination = strdup(upper);
803 if (!destination)
804 return log_oom();
805
806 } else {
807 upper = lower[n - 2];
808 destination = lower[n - 1];
809 lower[n - 2] = NULL;
810 }
811
f757855e 812 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
813 if (!m)
814 return log_oom();
815
816 m->destination = destination;
817 m->source = upper;
818 m->lower = lower;
819 m->read_only = c == ARG_OVERLAY_RO;
820
821 upper = destination = NULL;
822 lower = NULL;
06c17c39 823
f757855e 824 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
825 break;
826 }
827
a5f1cb3b 828 case 'E': {
f4889f65
LP
829 char **n;
830
831 if (!env_assignment_is_valid(optarg)) {
832 log_error("Environment variable assignment '%s' is not valid.", optarg);
833 return -EINVAL;
834 }
835
836 n = strv_env_set(arg_setenv, optarg);
837 if (!n)
838 return log_oom();
839
840 strv_free(arg_setenv);
841 arg_setenv = n;
f757855e
LP
842
843 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
844 break;
845 }
846
284c0b91
LP
847 case 'q':
848 arg_quiet = true;
849 break;
850
8a96d94e 851 case ARG_SHARE_SYSTEM:
a6b5216c 852 /* We don't officially support this anymore, except for compat reasons. People should use the
0c582db0
LB
853 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
854 arg_clone_ns_flags = 0;
8a96d94e
LP
855 break;
856
eb91eb18
LP
857 case ARG_REGISTER:
858 r = parse_boolean(optarg);
859 if (r < 0) {
860 log_error("Failed to parse --register= argument: %s", optarg);
861 return r;
862 }
863
864 arg_register = r;
865 break;
866
89f7c846
LP
867 case ARG_KEEP_UNIT:
868 arg_keep_unit = true;
869 break;
870
6afc95b7
LP
871 case ARG_PERSONALITY:
872
ac45f971 873 arg_personality = personality_from_string(optarg);
050f7277 874 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
875 log_error("Unknown or unsupported personality '%s'.", optarg);
876 return -EINVAL;
877 }
878
f757855e 879 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
880 break;
881
4d9f07b4
LP
882 case ARG_VOLATILE:
883
884 if (!optarg)
f757855e 885 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 886 else {
f757855e 887 VolatileMode m;
4d9f07b4 888
f757855e
LP
889 m = volatile_mode_from_string(optarg);
890 if (m < 0) {
891 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 892 return -EINVAL;
f757855e
LP
893 } else
894 arg_volatile_mode = m;
6d0b55c2
LP
895 }
896
f757855e
LP
897 arg_settings_mask |= SETTING_VOLATILE_MODE;
898 break;
6d0b55c2 899
f757855e
LP
900 case 'p':
901 r = expose_port_parse(&arg_expose_ports, optarg);
902 if (r == -EEXIST)
903 return log_error_errno(r, "Duplicate port specification: %s", optarg);
904 if (r < 0)
905 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 906
f757855e 907 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 908 break;
6d0b55c2 909
f36933fe
LP
910 case ARG_PROPERTY:
911 if (strv_extend(&arg_property, optarg) < 0)
912 return log_oom();
913
914 break;
915
ae209204
ZJS
916 case ARG_PRIVATE_USERS: {
917 int boolean = -1;
0de7acce 918
ae209204
ZJS
919 if (!optarg)
920 boolean = true;
921 else if (!in_charset(optarg, DIGITS))
922 /* do *not* parse numbers as booleans */
923 boolean = parse_boolean(optarg);
924
925 if (boolean == false) {
0de7acce
LP
926 /* no: User namespacing off */
927 arg_userns_mode = USER_NAMESPACE_NO;
928 arg_uid_shift = UID_INVALID;
929 arg_uid_range = UINT32_C(0x10000);
ae209204 930 } else if (boolean == true) {
0de7acce
LP
931 /* yes: User namespacing on, UID range is read from root dir */
932 arg_userns_mode = USER_NAMESPACE_FIXED;
933 arg_uid_shift = UID_INVALID;
934 arg_uid_range = UINT32_C(0x10000);
935 } else if (streq(optarg, "pick")) {
936 /* pick: User namespacing on, UID range is picked randomly */
937 arg_userns_mode = USER_NAMESPACE_PICK;
938 arg_uid_shift = UID_INVALID;
939 arg_uid_range = UINT32_C(0x10000);
940 } else {
6c2058b3 941 _cleanup_free_ char *buffer = NULL;
6dac160c
LP
942 const char *range, *shift;
943
0de7acce
LP
944 /* anything else: User namespacing on, UID range is explicitly configured */
945
6dac160c
LP
946 range = strchr(optarg, ':');
947 if (range) {
6c2058b3
ZJS
948 buffer = strndup(optarg, range - optarg);
949 if (!buffer)
950 return log_oom();
951 shift = buffer;
6dac160c
LP
952
953 range++;
bfd292ec
ZJS
954 r = safe_atou32(range, &arg_uid_range);
955 if (r < 0)
be715731 956 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
6dac160c
LP
957 } else
958 shift = optarg;
959
be715731
ZJS
960 r = parse_uid(shift, &arg_uid_shift);
961 if (r < 0)
962 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
0de7acce
LP
963
964 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
965 }
966
be715731
ZJS
967 if (arg_uid_range <= 0) {
968 log_error("UID range cannot be 0.");
969 return -EINVAL;
970 }
971
0de7acce 972 arg_settings_mask |= SETTING_USERNS;
6dac160c 973 break;
ae209204 974 }
6dac160c 975
0de7acce 976 case 'U':
ccabee0d
LP
977 if (userns_supported()) {
978 arg_userns_mode = USER_NAMESPACE_PICK;
979 arg_uid_shift = UID_INVALID;
980 arg_uid_range = UINT32_C(0x10000);
981
982 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
983 }
984
7336138e
LP
985 break;
986
0de7acce 987 case ARG_PRIVATE_USERS_CHOWN:
19aac838 988 arg_userns_chown = true;
0de7acce
LP
989
990 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
991 break;
992
c6c8f6e2
LP
993 case ARG_KILL_SIGNAL:
994 arg_kill_signal = signal_from_string_try_harder(optarg);
995 if (arg_kill_signal < 0) {
996 log_error("Cannot parse signal: %s", optarg);
997 return -EINVAL;
998 }
999
f757855e
LP
1000 arg_settings_mask |= SETTING_KILL_SIGNAL;
1001 break;
1002
1003 case ARG_SETTINGS:
1004
1005 /* no → do not read files
1006 * yes → read files, do not override cmdline, trust only subset
1007 * override → read files, override cmdline, trust only subset
1008 * trusted → read files, do not override cmdline, trust all
1009 */
1010
1011 r = parse_boolean(optarg);
1012 if (r < 0) {
1013 if (streq(optarg, "trusted")) {
1014 mask_all_settings = false;
1015 mask_no_settings = false;
1016 arg_settings_trusted = true;
1017
1018 } else if (streq(optarg, "override")) {
1019 mask_all_settings = false;
1020 mask_no_settings = true;
1021 arg_settings_trusted = -1;
1022 } else
1023 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1024 } else if (r > 0) {
1025 /* yes */
1026 mask_all_settings = false;
1027 mask_no_settings = false;
1028 arg_settings_trusted = -1;
1029 } else {
1030 /* no */
1031 mask_all_settings = true;
1032 mask_no_settings = false;
1033 arg_settings_trusted = false;
1034 }
1035
c6c8f6e2
LP
1036 break;
1037
5f932eb9
LP
1038 case ARG_CHDIR:
1039 if (!path_is_absolute(optarg)) {
1040 log_error("Working directory %s is not an absolute path.", optarg);
1041 return -EINVAL;
1042 }
1043
1044 r = free_and_strdup(&arg_chdir, optarg);
1045 if (r < 0)
1046 return log_oom();
1047
1048 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1049 break;
1050
9c1e04d0
AP
1051 case ARG_NOTIFY_READY:
1052 r = parse_boolean(optarg);
1053 if (r < 0) {
1054 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1055 return -EINVAL;
1056 }
1057 arg_notify_ready = r;
1058 arg_settings_mask |= SETTING_NOTIFY_READY;
1059 break;
1060
88213476
LP
1061 case '?':
1062 return -EINVAL;
1063
1064 default:
eb9da376 1065 assert_not_reached("Unhandled option");
88213476 1066 }
88213476 1067
0c582db0
LB
1068 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1069 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1070 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1071 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
a6b5216c 1072
48a8d337
LB
1073 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1074 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
eb91eb18 1075 arg_register = false;
0c582db0
LB
1076 if (arg_start_mode != START_PID1) {
1077 log_error("--boot cannot be used without namespacing.");
1078 return -EINVAL;
1079 }
1080 }
eb91eb18 1081
0de7acce 1082 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1083 arg_userns_chown = true;
1084
89f7c846
LP
1085 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1086 log_error("--keep-unit may not be used when invoked from a user session.");
1087 return -EINVAL;
1088 }
1089
1b9e5b12
LP
1090 if (arg_directory && arg_image) {
1091 log_error("--directory= and --image= may not be combined.");
1092 return -EINVAL;
1093 }
1094
ec16945e
LP
1095 if (arg_template && arg_image) {
1096 log_error("--template= and --image= may not be combined.");
1097 return -EINVAL;
1098 }
1099
1100 if (arg_template && !(arg_directory || arg_machine)) {
1101 log_error("--template= needs --directory= or --machine=.");
1102 return -EINVAL;
1103 }
1104
1105 if (arg_ephemeral && arg_template) {
1106 log_error("--ephemeral and --template= may not be combined.");
1107 return -EINVAL;
1108 }
1109
1110 if (arg_ephemeral && arg_image) {
1111 log_error("--ephemeral and --image= may not be combined.");
1112 return -EINVAL;
1113 }
1114
df9a75e4
LP
1115 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1116 log_error("--ephemeral and --link-journal= may not be combined.");
1117 return -EINVAL;
1118 }
1119
ccabee0d 1120 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1121 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1122 return -EOPNOTSUPP;
1123 }
1124
1125 if (arg_userns_chown && arg_read_only) {
1126 log_error("--read-only and --private-users-chown may not be combined.");
1127 return -EINVAL;
1128 }
f757855e 1129
22b28dfd
LP
1130 if (arg_network_bridge && arg_network_zone) {
1131 log_error("--network-bridge= and --network-zone= may not be combined.");
1132 return -EINVAL;
1133 }
1134
f757855e
LP
1135 if (argc > optind) {
1136 arg_parameters = strv_copy(argv + optind);
1137 if (!arg_parameters)
1138 return log_oom();
1139
7732f92b 1140 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1141 }
1142
1143 /* Load all settings from .nspawn files */
1144 if (mask_no_settings)
1145 arg_settings_mask = 0;
1146
1147 /* Don't load any settings from .nspawn files */
1148 if (mask_all_settings)
1149 arg_settings_mask = _SETTINGS_MASK_ALL;
1150
520e0d54 1151 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e 1152
6aadfa4c
ILG
1153 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1154 if (e)
1155 arg_container_service_name = e;
1156
5a8ff0e6
CB
1157 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1158 if (r < 0)
1159 arg_use_cgns = cg_ns_supported();
1160 else
1161 arg_use_cgns = r;
1162
f757855e
LP
1163 return 1;
1164}
1165
1166static int verify_arguments(void) {
1167
1168 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1169 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1170 return -EINVAL;
1171 }
1172
6d0b55c2
LP
1173 if (arg_expose_ports && !arg_private_network) {
1174 log_error("Cannot use --port= without private networking.");
1175 return -EINVAL;
1176 }
1177
1c1ea217
EV
1178#ifndef HAVE_LIBIPTC
1179 if (arg_expose_ports) {
1180 log_error("--port= is not supported, compiled without libiptc support.");
1181 return -EOPNOTSUPP;
1182 }
1183#endif
1184
7732f92b 1185 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1186 arg_kill_signal = SIGRTMIN+3;
1187
f757855e 1188 return 0;
88213476
LP
1189}
1190
03cfe0d5
LP
1191static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1192 assert(p);
1193
0de7acce 1194 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1195 return 0;
1196
1197 if (uid == UID_INVALID && gid == GID_INVALID)
1198 return 0;
1199
1200 if (uid != UID_INVALID) {
1201 uid += arg_uid_shift;
1202
1203 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1204 return -EOVERFLOW;
1205 }
1206
1207 if (gid != GID_INVALID) {
1208 gid += (gid_t) arg_uid_shift;
1209
1210 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1211 return -EOVERFLOW;
1212 }
1213
1214 if (lchown(p, uid, gid) < 0)
1215 return -errno;
b12afc8c
LP
1216
1217 return 0;
1218}
1219
03cfe0d5
LP
/* Creates 'path' below 'root' with the given mode and hands it over to the
 * (shifted) uid/gid via userns_lchown(). A directory that already exists is
 * left untouched, ownership included, and reported as success. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1232
e58a1277 1233static int setup_timezone(const char *dest) {
03cfe0d5
LP
1234 _cleanup_free_ char *p = NULL, *q = NULL;
1235 const char *where, *check, *what;
d4036145
LP
1236 char *z, *y;
1237 int r;
f8440af5 1238
e58a1277
LP
1239 assert(dest);
1240
1241 /* Fix the timezone, if possible */
d4036145
LP
1242 r = readlink_malloc("/etc/localtime", &p);
1243 if (r < 0) {
0b493a02
MP
1244 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1245 /* to handle warning, delete /etc/localtime and replace it
d23a0044 1246 * with a symbolic link to a time zone data file.
0b493a02
MP
1247 *
1248 * Example:
21dc0227 1249 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
0b493a02 1250 */
d4036145
LP
1251 return 0;
1252 }
1253
1254 z = path_startswith(p, "../usr/share/zoneinfo/");
1255 if (!z)
1256 z = path_startswith(p, "/usr/share/zoneinfo/");
1257 if (!z) {
1258 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1259 return 0;
1260 }
1261
03cfe0d5 1262 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1263 r = readlink_malloc(where, &q);
1264 if (r >= 0) {
1265 y = path_startswith(q, "../usr/share/zoneinfo/");
1266 if (!y)
1267 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1268
d4036145
LP
1269 /* Already pointing to the right place? Then do nothing .. */
1270 if (y && streq(y, z))
1271 return 0;
1272 }
1273
03cfe0d5 1274 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1275 check = prefix_roota(dest, check);
03cfe0d5 1276 if (laccess(check, F_OK) < 0) {
d4036145
LP
1277 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1278 return 0;
1279 }
68fb0892 1280
79d80fc1
TG
1281 r = unlink(where);
1282 if (r < 0 && errno != ENOENT) {
56f64d95 1283 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1284 return 0;
1285 }
4d9f07b4 1286
03cfe0d5 1287 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1288 if (symlink(what, where) < 0) {
56f64d95 1289 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1290 return 0;
1291 }
e58a1277 1292
03cfe0d5
LP
1293 r = userns_lchown(where, 0, 0);
1294 if (r < 0)
1295 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1296
e58a1277 1297 return 0;
88213476
LP
1298}
1299
2547bb41 1300static int setup_resolv_conf(const char *dest) {
03cfe0d5 1301 const char *where = NULL;
79d80fc1 1302 int r;
2547bb41
LP
1303
1304 assert(dest);
1305
1306 if (arg_private_network)
1307 return 0;
1308
1309 /* Fix resolv.conf, if possible */
03cfe0d5 1310 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1311
3539724c
LP
1312 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
1313 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1314 * container, so that the container can use the host's resolver. Given that network namespacing is
1315 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1316 * advantage that the container will be able to follow the host's DNS server configuration changes
1317 * transparently. */
1318
60e76d48
ZJS
1319 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1320 if (r >= 0)
1321 return mount_verbose(LOG_ERR, NULL, where, NULL,
1322 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
3539724c
LP
1323 }
1324
1325 /* If that didn't work, let's copy the file */
f2068bcc 1326 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1327 if (r < 0) {
3539724c
LP
1328 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1329 * resolved or something similar runs inside and the symlink points there.
68a313c5 1330 *
3539724c 1331 * If the disk image is read-only, there's also no point in complaining.
68a313c5
LP
1332 */
1333 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
3539724c 1334 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
79d80fc1
TG
1335 return 0;
1336 }
2547bb41 1337
03cfe0d5
LP
1338 r = userns_lchown(where, 0, 0);
1339 if (r < 0)
3539724c 1340 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
03cfe0d5 1341
2547bb41
LP
1342 return 0;
1343}
1344
04bc4a3f 1345static int setup_boot_id(const char *dest) {
3bbaff3e 1346 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1347 const char *from, *to;
04bc4a3f
LP
1348 int r;
1349
04bc4a3f
LP
1350 /* Generate a new randomized boot ID, so that each boot-up of
1351 * the container gets a new one */
1352
03cfe0d5
LP
1353 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1354 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1355
1356 r = sd_id128_randomize(&rnd);
f647962d
MS
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1359
15b1248a 1360 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1363
60e76d48
ZJS
1364 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1365 if (r >= 0)
1366 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1367 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
04bc4a3f 1368
3bbaff3e 1369 (void) unlink(from);
04bc4a3f
LP
1370 return r;
1371}
1372
e58a1277 1373static int copy_devnodes(const char *dest) {
88213476
LP
1374
1375 static const char devnodes[] =
1376 "null\0"
1377 "zero\0"
1378 "full\0"
1379 "random\0"
1380 "urandom\0"
85614d66
TG
1381 "tty\0"
1382 "net/tun\0";
88213476
LP
1383
1384 const char *d;
e58a1277 1385 int r = 0;
7fd1b19b 1386 _cleanup_umask_ mode_t u;
a258bf26
LP
1387
1388 assert(dest);
124640f1
LP
1389
1390 u = umask(0000);
88213476 1391
03cfe0d5
LP
1392 /* Create /dev/net, so that we can create /dev/net/tun in it */
1393 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1394 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1395
88213476 1396 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1397 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1398 struct stat st;
88213476 1399
7f112f50 1400 from = strappend("/dev/", d);
03cfe0d5 1401 to = prefix_root(dest, from);
88213476
LP
1402
1403 if (stat(from, &st) < 0) {
1404
4a62c710
MS
1405 if (errno != ENOENT)
1406 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1407
a258bf26 1408 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1409
03cfe0d5 1410 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1411 return -EIO;
a258bf26 1412
85614d66 1413 } else {
81f5049b 1414 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
41eb4362
DH
1415 /*
1416 * This is some sort of protection too against
1417 * recursive userns chown on shared /dev/
1418 */
1419 if (errno == EEXIST)
1420 log_notice("%s/dev/ should be an empty directory", dest);
81f5049b
AC
1421 if (errno != EPERM)
1422 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1423
1424 /* Some systems abusively restrict mknod but
1425 * allow bind mounts. */
1426 r = touch(to);
1427 if (r < 0)
1428 return log_error_errno(r, "touch (%s) failed: %m", to);
60e76d48
ZJS
1429 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1430 if (r < 0)
1431 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
81f5049b 1432 }
6278cf60 1433
03cfe0d5
LP
1434 r = userns_lchown(to, 0, 0);
1435 if (r < 0)
1436 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1437 }
88213476
LP
1438 }
1439
e58a1277
LP
1440 return r;
1441}
88213476 1442
03cfe0d5
LP
1443static int setup_pts(const char *dest) {
1444 _cleanup_free_ char *options = NULL;
1445 const char *p;
709f6e46 1446 int r;
03cfe0d5
LP
1447
1448#ifdef HAVE_SELINUX
1449 if (arg_selinux_apifs_context)
1450 (void) asprintf(&options,
3dce8915 1451 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1452 arg_uid_shift + TTY_GID,
1453 arg_selinux_apifs_context);
1454 else
1455#endif
1456 (void) asprintf(&options,
3dce8915 1457 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1458 arg_uid_shift + TTY_GID);
f2d88580 1459
03cfe0d5 1460 if (!options)
f2d88580
LP
1461 return log_oom();
1462
03cfe0d5 1463 /* Mount /dev/pts itself */
cc9fce65 1464 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1465 if (mkdir(p, 0755) < 0)
1466 return log_error_errno(errno, "Failed to create /dev/pts: %m");
60e76d48
ZJS
1467 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1468 if (r < 0)
1469 return r;
709f6e46
MS
1470 r = userns_lchown(p, 0, 0);
1471 if (r < 0)
1472 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1473
1474 /* Create /dev/ptmx symlink */
1475 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1476 if (symlink("pts/ptmx", p) < 0)
1477 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1478 r = userns_lchown(p, 0, 0);
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1481
03cfe0d5
LP
1482 /* And fix /dev/pts/ptmx ownership */
1483 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1484 r = userns_lchown(p, 0, 0);
1485 if (r < 0)
1486 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1487
f2d88580
LP
1488 return 0;
1489}
1490
e58a1277 1491static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1492 _cleanup_umask_ mode_t u;
1493 const char *to;
e58a1277 1494 int r;
e58a1277
LP
1495
1496 assert(dest);
1497 assert(console);
1498
1499 u = umask(0000);
1500
03cfe0d5 1501 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1502 if (r < 0)
1503 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1504
a258bf26
LP
1505 /* We need to bind mount the right tty to /dev/console since
1506 * ptys can only exist on pts file systems. To have something
81f5049b 1507 * to bind mount things on we create a empty regular file. */
a258bf26 1508
03cfe0d5 1509 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1510 r = touch(to);
1511 if (r < 0)
1512 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1513
60e76d48 1514 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
e58a1277
LP
1515}
1516
1517static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1518 const char *from, *to;
7fd1b19b 1519 _cleanup_umask_ mode_t u;
d9603714 1520 int fd, r;
e58a1277 1521
e58a1277 1522 assert(kmsg_socket >= 0);
a258bf26 1523
e58a1277 1524 u = umask(0000);
a258bf26 1525
03cfe0d5 1526 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1527 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1528 * on the reading side behave very similar to /proc/kmsg,
1529 * their writing side behaves differently from /dev/kmsg in
1530 * that writing blocks when nothing is reading. In order to
1531 * avoid any problems with containers deadlocking due to this
1532 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1533 from = prefix_roota(dest, "/run/kmsg");
1534 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1535
4a62c710 1536 if (mkfifo(from, 0600) < 0)
03cfe0d5 1537 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
60e76d48
ZJS
1538 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1539 if (r < 0)
1540 return r;
e58a1277
LP
1541
1542 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1543 if (fd < 0)
1544 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1545
e58a1277
LP
1546 /* Store away the fd in the socket, so that it stays open as
1547 * long as we run the child */
3ee897d6 1548 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1549 safe_close(fd);
e58a1277 1550
d9603714
DH
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1553
03cfe0d5
LP
1554 /* And now make the FIFO unavailable as /run/kmsg... */
1555 (void) unlink(from);
1556
25ea79fe 1557 return 0;
88213476
LP
1558}
1559
1c4baffc 1560static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1561 union in_addr_union *exposed = userdata;
1562
1563 assert(rtnl);
1564 assert(m);
1565 assert(exposed);
1566
7a8f6325 1567 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1568 return 0;
1569}
1570
3a74cea5 1571static int setup_hostname(void) {
3a74cea5 1572
0c582db0 1573 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
eb91eb18
LP
1574 return 0;
1575
605f81a8 1576 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1577 return -errno;
3a74cea5 1578
7027ff61 1579 return 0;
3a74cea5
LP
1580}
1581
57fb9fb5 1582static int setup_journal(const char *directory) {
e01ff70a 1583 sd_id128_t this_id;
0f5e1382 1584 _cleanup_free_ char *d = NULL;
e01ff70a 1585 const char *p, *q;
8054d749 1586 bool try;
e01ff70a 1587 char id[33];
57fb9fb5
LP
1588 int r;
1589
df9a75e4
LP
1590 /* Don't link journals in ephemeral mode */
1591 if (arg_ephemeral)
1592 return 0;
1593
8054d749
LP
1594 if (arg_link_journal == LINK_NO)
1595 return 0;
1596
1597 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1598
4d680aee 1599 r = sd_id128_get_machine(&this_id);
f647962d
MS
1600 if (r < 0)
1601 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1602
e01ff70a 1603 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1604 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1605 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1606 if (try)
4d680aee 1607 return 0;
df9a75e4 1608 return -EEXIST;
4d680aee
ZJS
1609 }
1610
03cfe0d5
LP
1611 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1612 if (r < 0)
1613 return log_error_errno(r, "Failed to create /var: %m");
1614
1615 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to create /var/log: %m");
1618
1619 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1620 if (r < 0)
1621 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1622
e01ff70a
MS
1623 (void) sd_id128_to_string(arg_uuid, id);
1624
03cfe0d5
LP
1625 p = strjoina("/var/log/journal/", id);
1626 q = prefix_roota(directory, p);
27407a01 1627
e26d6ce5 1628 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1629 if (try)
1630 return 0;
27407a01 1631
8054d749
LP
1632 log_error("%s: already a mount point, refusing to use for journal", p);
1633 return -EEXIST;
57fb9fb5
LP
1634 }
1635
e26d6ce5 1636 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1637 if (try)
1638 return 0;
57fb9fb5 1639
8054d749
LP
1640 log_error("%s: already a mount point, refusing to use for journal", q);
1641 return -EEXIST;
57fb9fb5
LP
1642 }
1643
1644 r = readlink_and_make_absolute(p, &d);
1645 if (r >= 0) {
1646 if ((arg_link_journal == LINK_GUEST ||
1647 arg_link_journal == LINK_AUTO) &&
1648 path_equal(d, q)) {
1649
03cfe0d5 1650 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1651 if (r < 0)
709f6e46 1652 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1653 return 0;
57fb9fb5
LP
1654 }
1655
4a62c710
MS
1656 if (unlink(p) < 0)
1657 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1658 } else if (r == -EINVAL) {
1659
1660 if (arg_link_journal == LINK_GUEST &&
1661 rmdir(p) < 0) {
1662
27407a01
ZJS
1663 if (errno == ENOTDIR) {
1664 log_error("%s already exists and is neither a symlink nor a directory", p);
1665 return r;
4314d33f
MS
1666 } else
1667 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1668 }
4314d33f
MS
1669 } else if (r != -ENOENT)
1670 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1671
1672 if (arg_link_journal == LINK_GUEST) {
1673
1674 if (symlink(q, p) < 0) {
8054d749 1675 if (try) {
56f64d95 1676 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1677 return 0;
4314d33f
MS
1678 } else
1679 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1680 }
1681
03cfe0d5 1682 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1683 if (r < 0)
709f6e46 1684 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1685 return 0;
57fb9fb5
LP
1686 }
1687
1688 if (arg_link_journal == LINK_HOST) {
ccddd104 1689 /* don't create parents here — if the host doesn't have
574edc90 1690 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1691
1692 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1693 if (try) {
56f64d95 1694 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1695 return 0;
4314d33f
MS
1696 } else
1697 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1698 }
1699
27407a01
ZJS
1700 } else if (access(p, F_OK) < 0)
1701 return 0;
57fb9fb5 1702
cdb2b9d0
LP
1703 if (dir_is_empty(q) == 0)
1704 log_warning("%s is not empty, proceeding anyway.", q);
1705
03cfe0d5 1706 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1709
60e76d48
ZJS
1710 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1711 if (r < 0)
4a62c710 1712 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1713
27407a01 1714 return 0;
57fb9fb5
LP
1715}
1716
88213476 1717static int drop_capabilities(void) {
520e0d54 1718 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1719}
1720
db999e0f
LP
1721static int reset_audit_loginuid(void) {
1722 _cleanup_free_ char *p = NULL;
1723 int r;
1724
0c582db0 1725 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
db999e0f
LP
1726 return 0;
1727
1728 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1729 if (r == -ENOENT)
db999e0f 1730 return 0;
f647962d
MS
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1733
1734 /* Already reset? */
1735 if (streq(p, "4294967295"))
1736 return 0;
1737
ad118bda 1738 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1739 if (r < 0) {
10a87006
LP
1740 log_error_errno(r,
1741 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1742 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1743 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1744 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1745 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1746
db999e0f 1747 sleep(5);
77b6e194 1748 }
db999e0f
LP
1749
1750 return 0;
77b6e194
LP
1751}
1752
24fb1112 1753
785890ac
LP
1754static int setup_propagate(const char *root) {
1755 const char *p, *q;
709f6e46 1756 int r;
785890ac
LP
1757
1758 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1759 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1760 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1761 (void) mkdir_p(p, 0600);
1762
709f6e46
MS
1763 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1764 if (r < 0)
1765 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1766
709f6e46
MS
1767 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1768 if (r < 0)
1769 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1770
709f6e46
MS
1771 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1772 if (r < 0)
1773 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1774
03cfe0d5 1775 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
60e76d48
ZJS
1776 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1777 if (r < 0)
1778 return r;
785890ac 1779
60e76d48
ZJS
1780 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1781 if (r < 0)
1782 return r;
785890ac 1783
19caffac
AC
1784 /* machined will MS_MOVE into that directory, and that's only
1785 * supported for non-shared mounts. */
60e76d48 1786 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
785890ac
LP
1787}
1788
1b9e5b12
LP
1789static int setup_image(char **device_path, int *loop_nr) {
1790 struct loop_info64 info = {
1791 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1792 };
1793 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1794 _cleanup_free_ char* loopdev = NULL;
1795 struct stat st;
1796 int r, nr;
1797
1798 assert(device_path);
1799 assert(loop_nr);
ec16945e 1800 assert(arg_image);
1b9e5b12
LP
1801
1802 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1803 if (fd < 0)
1804 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1805
4a62c710
MS
1806 if (fstat(fd, &st) < 0)
1807 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1808
1809 if (S_ISBLK(st.st_mode)) {
1810 char *p;
1811
1812 p = strdup(arg_image);
1813 if (!p)
1814 return log_oom();
1815
1816 *device_path = p;
1817
1818 *loop_nr = -1;
1819
1820 r = fd;
1821 fd = -1;
1822
1823 return r;
1824 }
1825
1826 if (!S_ISREG(st.st_mode)) {
070edd97 1827 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1828 return -EINVAL;
1829 }
1830
1831 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1832 if (control < 0)
1833 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1834
1835 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1836 if (nr < 0)
1837 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1838
1839 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1840 return log_oom();
1841
1842 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1843 if (loop < 0)
1844 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1845
4a62c710
MS
1846 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1847 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1848
1849 if (arg_read_only)
1850 info.lo_flags |= LO_FLAGS_READ_ONLY;
1851
4a62c710
MS
1852 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1853 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1854
1855 *device_path = loopdev;
1856 loopdev = NULL;
1857
1858 *loop_nr = nr;
1859
1860 r = loop;
1861 loop = -1;
1862
1863 return r;
1864}
1865
ada4799a
LP
/* Explanatory text appended to the error messages emitted when a disk
 * image's partition table cannot be used. One string literal per output
 * line. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1872
1b9e5b12
LP
/*
 * Probes the partition table of the block device open on fd and picks the
 * partitions to mount for the container.
 *
 * GPT images: partitions are classified by type UUID (home, srv, ESP,
 * native root, secondary-architecture root, generic Linux). For each
 * class except generic, the partition with the lowest partition number
 * wins. Partitions carrying GPT_FLAG_NO_AUTO are ignored.
 *
 * MBR images: only partitions of type 0x83 whose flags equal 0x80
 * (bootable) are considered; they count as generic partitions.
 *
 * Root selection order: native root, then secondary root, then a generic
 * partition, but only if exactly one generic partition was seen.
 *
 * On success returns 0 and stores newly allocated device node paths in
 * *root_device and, if found, *home_device, *srv_device and *esp_device.
 * The matching *_rw flags report whether the partition may be mounted
 * writable. *secondary is set to true if the secondary-architecture root
 * was chosen. Outputs for partitions that were not found are left
 * untouched. On failure a negative errno-style value is returned after
 * logging.
 *
 * Without HAVE_BLKID the function always fails with -EOPNOTSUPP.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                char **esp_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(esp_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Log and return in one step, so that errno cannot be
                 * overwritten between logging and returning it. */
                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has enumerated as many partitions as blkid
         * found. Gives up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        } else if (sd_id128_equal(type_id, GPT_ESP)) {

                                if (esp && nr >= esp_nr)
                                        continue;

                                esp_nr = nr;

                                r = free_and_strdup(&esp, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate, so that a second bootable
                                 * Linux partition is detected above and
                                 * rejected below, rather than silently
                                 * replacing this one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        if (esp) {
                *esp_device = esp;
                esp = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2259
/*
 * Mounts the block device "what" below "where".
 *
 * If directory is non-NULL, it is appended to where to form the mount
 * point; otherwise where itself is the mount point. The file system type
 * is probed with libblkid. The mount is read-only if rw is false or if
 * arg_read_only is set; MS_NODEV is always applied.
 *
 * Returns the result of mount_verbose() on success or mount failure, or a
 * negative errno-style value if the file system type cannot be determined
 * (-EINVAL), probing fails, or the device holds a LUKS volume
 * (-EOPNOTSUPP). Without HAVE_BLKID the function always fails with
 * -EOPNOTSUPP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A hard
                 * error (-1) is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                /* Log and return in one step, so that errno cannot be
                 * overwritten between logging and returning it. */
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL);
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2318
317feb4d 2319static int setup_machine_id(const char *directory) {
691675ba
LP
2320 const char *etc_machine_id;
2321 sd_id128_t id;
3bbaff3e 2322 int r;
e01ff70a 2323
317feb4d
LP
2324 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2325 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2326 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2327 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2328 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2329 * container behaves nicely). */
2330
e01ff70a
MS
2331 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2332
691675ba 2333 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2334 if (r < 0) {
2335 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2336 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2337
317feb4d
LP
2338 if (sd_id128_is_null(arg_uuid)) {
2339 r = sd_id128_randomize(&arg_uuid);
2340 if (r < 0)
2341 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2342 }
2343 } else {
2344 if (sd_id128_is_null(id)) {
2345 log_error("Machine ID in container image is zero, refusing.");
2346 return -EINVAL;
2347 }
e01ff70a 2348
317feb4d
LP
2349 arg_uuid = id;
2350 }
691675ba 2351
e01ff70a
MS
2352 return 0;
2353}
2354
7336138e
LP
2355static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2356 int r;
2357
2358 assert(directory);
2359
0de7acce 2360 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2361 return 0;
2362
2363 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2364 if (r == -EOPNOTSUPP)
2365 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2366 if (r == -EBADE)
2367 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2370 if (r == 0)
2371 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2372 else
2373 log_debug("Patched directory tree to match UID/GID range.");
2374
2375 return r;
2376}
2377
727fd4fd
LP
2378static int mount_devices(
2379 const char *where,
2380 const char *root_device, bool root_device_rw,
2381 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2382 const char *srv_device, bool srv_device_rw,
2383 const char *esp_device) {
1b9e5b12
LP
2384 int r;
2385
2386 assert(where);
2387
2388 if (root_device) {
727fd4fd 2389 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2392 }
2393
2394 if (home_device) {
727fd4fd 2395 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2396 if (r < 0)
2397 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2398 }
2399
2400 if (srv_device) {
727fd4fd 2401 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2402 if (r < 0)
2403 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2404 }
2405
a6bc7db9
LP
2406 if (esp_device) {
2407 const char *mp, *x;
2408
2409 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2410
2411 mp = "/efi";
2412 x = strjoina(arg_directory, mp);
2413 r = dir_is_empty(x);
2414 if (r == -ENOENT) {
2415 mp = "/boot";
2416 x = strjoina(arg_directory, mp);
2417 r = dir_is_empty(x);
2418 }
2419
2420 if (r > 0) {
2421 r = mount_device(esp_device, arg_directory, mp, true);
2422 if (r < 0)
2423 return log_error_errno(r, "Failed to mount ESP: %m");
2424 }
2425 }
2426
1b9e5b12
LP
2427 return 0;
2428}
2429
2430static void loop_remove(int nr, int *image_fd) {
2431 _cleanup_close_ int control = -1;
e8c8ddcc 2432 int r;
1b9e5b12
LP
2433
2434 if (nr < 0)
2435 return;
2436
2437 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2438 r = ioctl(*image_fd, LOOP_CLR_FD);
2439 if (r < 0)
5e4074aa 2440 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2441 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2442 }
2443
2444 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2445 if (control < 0) {
56f64d95 2446 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2447 return;
e8c8ddcc 2448 }
1b9e5b12 2449
e8c8ddcc
TG
2450 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2451 if (r < 0)
5e4074aa 2452 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2453}
2454
113cea80 2455/*
6d416b9c
LS
2456 * Return values:
2457 * < 0 : wait_for_terminate() failed to get the state of the
2458 * container, the container was terminated by a signal, or
2459 * failed for an unknown reason. No change is made to the
2460 * container argument.
2461 * > 0 : The program executed in the container terminated with an
2462 * error. The exit code of the program executed in the
919699ec
LP
2463 * container is returned. The container argument has been set
2464 * to CONTAINER_TERMINATED.
6d416b9c
LS
2465 * 0 : The container is being rebooted, has been shut down or exited
2466 * successfully. The container argument has been set to either
2467 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2468 *
6d416b9c
LS
2469 * That is, success is indicated by a return value of zero, and an
2470 * error is indicated by a non-zero value.
113cea80
DH
2471 */
2472static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2473 siginfo_t status;
919699ec 2474 int r;
113cea80
DH
2475
2476 r = wait_for_terminate(pid, &status);
f647962d
MS
2477 if (r < 0)
2478 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2479
2480 switch (status.si_code) {
fddbb89c 2481
113cea80 2482 case CLD_EXITED:
b5a2179b 2483 if (status.si_status == 0)
919699ec 2484 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2485 else
919699ec 2486 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2487
919699ec
LP
2488 *container = CONTAINER_TERMINATED;
2489 return status.si_status;
113cea80
DH
2490
2491 case CLD_KILLED:
2492 if (status.si_status == SIGINT) {
919699ec 2493 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2494 *container = CONTAINER_TERMINATED;
919699ec
LP
2495 return 0;
2496
113cea80 2497 } else if (status.si_status == SIGHUP) {
919699ec 2498 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2499 *container = CONTAINER_REBOOTED;
919699ec 2500 return 0;
113cea80 2501 }
919699ec 2502
113cea80
DH
2503 /* CLD_KILLED fallthrough */
2504
2505 case CLD_DUMPED:
fddbb89c 2506 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2507 return -EIO;
113cea80
DH
2508
2509 default:
fddbb89c 2510 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2511 return -EIO;
113cea80 2512 }
113cea80
DH
2513}
2514
023fb90b
LP
2515static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2516 pid_t pid;
2517
4a0b58c4 2518 pid = PTR_TO_PID(userdata);
023fb90b 2519 if (pid > 0) {
c6c8f6e2 2520 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2521 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2522 sd_event_source_set_userdata(s, NULL);
2523 return 0;
2524 }
2525 }
2526
2527 sd_event_exit(sd_event_source_get_event(s), 0);
2528 return 0;
2529}
2530
ec16945e 2531static int determine_names(void) {
1b9cebf6 2532 int r;
ec16945e 2533
c1521918
LP
2534 if (arg_template && !arg_directory && arg_machine) {
2535
2536 /* If --template= was specified then we should not
2537 * search for a machine, but instead create a new one
2538 * in /var/lib/machine. */
2539
2540 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2541 if (!arg_directory)
2542 return log_oom();
2543 }
2544
ec16945e 2545 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2546 if (arg_machine) {
2547 _cleanup_(image_unrefp) Image *i = NULL;
2548
2549 r = image_find(arg_machine, &i);
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2552 else if (r == 0) {
2553 log_error("No image for machine '%s': %m", arg_machine);
2554 return -ENOENT;
2555 }
2556
aceac2f0 2557 if (i->type == IMAGE_RAW)
0f03c2a4 2558 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2559 else
0f03c2a4 2560 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2561 if (r < 0)
2562 return log_error_errno(r, "Invalid image directory: %m");
2563
aee327b8
LP
2564 if (!arg_ephemeral)
2565 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2566 } else
ec16945e
LP
2567 arg_directory = get_current_dir_name();
2568
1b9cebf6
LP
2569 if (!arg_directory && !arg_machine) {
2570 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2571 return -EINVAL;
2572 }
2573 }
2574
2575 if (!arg_machine) {
b9ba4dab
LP
2576 if (arg_directory && path_equal(arg_directory, "/"))
2577 arg_machine = gethostname_malloc();
2578 else
2579 arg_machine = strdup(basename(arg_image ?: arg_directory));
2580
ec16945e
LP
2581 if (!arg_machine)
2582 return log_oom();
2583
ae691c1d 2584 hostname_cleanup(arg_machine);
ec16945e
LP
2585 if (!machine_name_is_valid(arg_machine)) {
2586 log_error("Failed to determine machine name automatically, please use -M.");
2587 return -EINVAL;
2588 }
b9ba4dab
LP
2589
2590 if (arg_ephemeral) {
2591 char *b;
2592
2593 /* Add a random suffix when this is an
2594 * ephemeral machine, so that we can run many
2595 * instances at once without manually having
2596 * to specify -M each time. */
2597
2598 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2599 return log_oom();
2600
2601 free(arg_machine);
2602 arg_machine = b;
2603 }
ec16945e
LP
2604 }
2605
2606 return 0;
2607}
2608
03cfe0d5 2609static int determine_uid_shift(const char *directory) {
6dac160c
LP
2610 int r;
2611
0de7acce 2612 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2613 arg_uid_shift = 0;
6dac160c 2614 return 0;
03cfe0d5 2615 }
6dac160c
LP
2616
2617 if (arg_uid_shift == UID_INVALID) {
2618 struct stat st;
2619
03cfe0d5 2620 r = stat(directory, &st);
6dac160c 2621 if (r < 0)
03cfe0d5 2622 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2623
2624 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2625
2626 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2627 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2628 return -EINVAL;
2629 }
2630
2631 arg_uid_range = UINT32_C(0x10000);
2632 }
2633
2634 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2635 log_error("UID base too high for UID range.");
2636 return -EINVAL;
2637 }
2638
6dac160c
LP
2639 return 0;
2640}
2641
03cfe0d5
LP
2642static int inner_child(
2643 Barrier *barrier,
2644 const char *directory,
2645 bool secondary,
2646 int kmsg_socket,
2647 int rtnl_socket,
f757855e 2648 FDSet *fds) {
69c79d3c 2649
03cfe0d5 2650 _cleanup_free_ char *home = NULL;
e01ff70a 2651 char as_uuid[37];
6aadfa4c 2652 unsigned n_env = 1;
03cfe0d5
LP
2653 const char *envp[] = {
2654 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2655 NULL, /* container */
03cfe0d5
LP
2656 NULL, /* TERM */
2657 NULL, /* HOME */
2658 NULL, /* USER */
2659 NULL, /* LOGNAME */
2660 NULL, /* container_uuid */
2661 NULL, /* LISTEN_FDS */
2662 NULL, /* LISTEN_PID */
9c1e04d0 2663 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2664 NULL
2665 };
88213476 2666
2371271c 2667 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2668 int r;
88213476 2669
03cfe0d5
LP
2670 assert(barrier);
2671 assert(directory);
2672 assert(kmsg_socket >= 0);
88213476 2673
efdb0237
LP
2674 cg_unified_flush();
2675
0de7acce 2676 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2677 /* Tell the parent, that it now can write the UID map. */
2678 (void) barrier_place(barrier); /* #1 */
7027ff61 2679
03cfe0d5
LP
2680 /* Wait until the parent wrote the UID map */
2681 if (!barrier_place_and_sync(barrier)) { /* #2 */
2682 log_error("Parent died too early");
2683 return -ESRCH;
2684 }
88213476
LP
2685 }
2686
6d66bd3b
EV
2687 r = reset_uid_gid();
2688 if (r < 0)
2689 return log_error_errno(r, "Couldn't become new root: %m");
2690
0de7acce
LP
2691 r = mount_all(NULL,
2692 arg_userns_mode != USER_NAMESPACE_NO,
2693 true,
2694 arg_private_network,
2695 arg_uid_shift,
2696 arg_uid_range,
2697 arg_selinux_apifs_context);
2698
03cfe0d5
LP
2699 if (r < 0)
2700 return r;
2701
d8fc6a00
LP
2702 r = mount_sysfs(NULL);
2703 if (r < 0)
2704 return r;
2705
03cfe0d5
LP
2706 /* Wait until we are cgroup-ified, so that we
2707 * can mount the right cgroup path writable */
2708 if (!barrier_place_and_sync(barrier)) { /* #3 */
2709 log_error("Parent died too early");
2710 return -ESRCH;
88213476
LP
2711 }
2712
5a8ff0e6 2713 if (arg_use_cgns && cg_ns_supported()) {
0996ef00
CB
2714 r = unshare(CLONE_NEWCGROUP);
2715 if (r < 0)
2716 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2717 r = mount_cgroups(
2718 "",
2719 arg_unified_cgroup_hierarchy,
2720 arg_userns_mode != USER_NAMESPACE_NO,
2721 arg_uid_shift,
2722 arg_uid_range,
5a8ff0e6 2723 arg_selinux_apifs_context,
ada54120 2724 true);
0996ef00
CB
2725 if (r < 0)
2726 return r;
2727 } else {
2728 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2729 if (r < 0)
2730 return r;
2731 }
ec16945e 2732
03cfe0d5
LP
2733 r = setup_boot_id(NULL);
2734 if (r < 0)
2735 return r;
ec16945e 2736
03cfe0d5
LP
2737 r = setup_kmsg(NULL, kmsg_socket);
2738 if (r < 0)
2739 return r;
2740 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2741
03cfe0d5 2742 umask(0022);
30535c16 2743
03cfe0d5
LP
2744 if (setsid() < 0)
2745 return log_error_errno(errno, "setsid() failed: %m");
2746
2747 if (arg_private_network)
2748 loopback_setup();
2749
7a8f6325
LP
2750 if (arg_expose_ports) {
2751 r = expose_port_send_rtnl(rtnl_socket);
2752 if (r < 0)
2753 return r;
2754 rtnl_socket = safe_close(rtnl_socket);
2755 }
03cfe0d5 2756
709f6e46
MS
2757 r = drop_capabilities();
2758 if (r < 0)
2759 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2760
2761 setup_hostname();
2762
050f7277 2763 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2764 if (personality(arg_personality) < 0)
2765 return log_error_errno(errno, "personality() failed: %m");
2766 } else if (secondary) {
2767 if (personality(PER_LINUX32) < 0)
2768 return log_error_errno(errno, "personality() failed: %m");
2769 }
2770
2771#ifdef HAVE_SELINUX
2772 if (arg_selinux_context)
2ed96880 2773 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2774 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2775#endif
2776
ee645080 2777 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2778 if (r < 0)
2779 return r;
2780
6aadfa4c
ILG
2781 /* LXC sets container=lxc, so follow the scheme here */
2782 envp[n_env++] = strjoina("container=", arg_container_service_name);
2783
03cfe0d5
LP
2784 envp[n_env] = strv_find_prefix(environ, "TERM=");
2785 if (envp[n_env])
313cefa1 2786 n_env++;
03cfe0d5
LP
2787
2788 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2789 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2790 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2791 return log_oom();
2792
3bbaff3e 2793 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2794
691675ba 2795 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2796 return log_oom();
03cfe0d5
LP
2797
2798 if (fdset_size(fds) > 0) {
2799 r = fdset_cloexec(fds, false);
2800 if (r < 0)
2801 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2802
2803 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2804 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2805 return log_oom();
2806 }
9c1e04d0
AP
2807 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2808 return log_oom();
03cfe0d5 2809
2371271c
TG
2810 env_use = strv_env_merge(2, envp, arg_setenv);
2811 if (!env_use)
2812 return log_oom();
03cfe0d5
LP
2813
2814 /* Let the parent know that we are ready and
2815 * wait until the parent is ready with the
2816 * setup, too... */
2817 if (!barrier_place_and_sync(barrier)) { /* #4 */
2818 log_error("Parent died too early");
2819 return -ESRCH;
2820 }
2821
5f932eb9
LP
2822 if (arg_chdir)
2823 if (chdir(arg_chdir) < 0)
2824 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2825
7732f92b
LP
2826 if (arg_start_mode == START_PID2) {
2827 r = stub_pid1();
2828 if (r < 0)
2829 return r;
2830 }
2831
03cfe0d5
LP
2832 /* Now, explicitly close the log, so that we
2833 * then can close all remaining fds. Closing
2834 * the log explicitly first has the benefit
2835 * that the logging subsystem knows about it,
2836 * and is thus ready to be reopened should we
2837 * need it again. Note that the other fds
2838 * closed here are at least the locking and
2839 * barrier fds. */
2840 log_close();
2841 (void) fdset_close_others(fds);
2842
7732f92b 2843 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2844 char **a;
2845 size_t m;
2846
2847 /* Automatically search for the init system */
2848
75f32f04
ZJS
2849 m = strv_length(arg_parameters);
2850 a = newa(char*, m + 2);
2851 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2852 a[1 + m] = NULL;
03cfe0d5
LP
2853
2854 a[0] = (char*) "/usr/lib/systemd/systemd";
2855 execve(a[0], a, env_use);
2856
2857 a[0] = (char*) "/lib/systemd/systemd";
2858 execve(a[0], a, env_use);
2859
2860 a[0] = (char*) "/sbin/init";
2861 execve(a[0], a, env_use);
f757855e
LP
2862 } else if (!strv_isempty(arg_parameters))
2863 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2864 else {
5f932eb9 2865 if (!arg_chdir)
d929b0f9
ZJS
2866 /* If we cannot change the directory, we'll end up in /, that is expected. */
2867 (void) chdir(home ?: "/root");
5f932eb9 2868
03cfe0d5
LP
2869 execle("/bin/bash", "-bash", NULL, env_use);
2870 execle("/bin/sh", "-sh", NULL, env_use);
2871 }
2872
35607a8d 2873 r = -errno;
03cfe0d5 2874 (void) log_open();
35607a8d 2875 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2876}
2877
9c1e04d0
AP
2878static int setup_sd_notify_child(void) {
2879 static const int one = 1;
2880 int fd = -1;
2881 union sockaddr_union sa = {
2882 .sa.sa_family = AF_UNIX,
2883 };
2884 int r;
2885
2886 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2887 if (fd < 0)
2888 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2889
2890 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2891 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2892
2893 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2894 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2895 if (r < 0) {
2896 safe_close(fd);
2897 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2898 }
2899
2900 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2901 if (r < 0) {
2902 safe_close(fd);
2903 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2904 }
2905
2906 return fd;
2907}
2908
03cfe0d5
LP
2909static int outer_child(
2910 Barrier *barrier,
2911 const char *directory,
2912 const char *console,
2913 const char *root_device, bool root_device_rw,
2914 const char *home_device, bool home_device_rw,
2915 const char *srv_device, bool srv_device_rw,
a6bc7db9 2916 const char *esp_device,
03cfe0d5
LP
2917 bool interactive,
2918 bool secondary,
2919 int pid_socket,
e01ff70a 2920 int uuid_socket,
9c1e04d0 2921 int notify_socket,
03cfe0d5
LP
2922 int kmsg_socket,
2923 int rtnl_socket,
825d5287 2924 int uid_shift_socket,
f757855e 2925 FDSet *fds) {
03cfe0d5
LP
2926
2927 pid_t pid;
2928 ssize_t l;
2929 int r;
9c1e04d0 2930 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2931
2932 assert(barrier);
2933 assert(directory);
2934 assert(console);
2935 assert(pid_socket >= 0);
e01ff70a 2936 assert(uuid_socket >= 0);
9c1e04d0 2937 assert(notify_socket >= 0);
03cfe0d5
LP
2938 assert(kmsg_socket >= 0);
2939
efdb0237
LP
2940 cg_unified_flush();
2941
03cfe0d5
LP
2942 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2943 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2944
2945 if (interactive) {
2946 close_nointr(STDIN_FILENO);
2947 close_nointr(STDOUT_FILENO);
2948 close_nointr(STDERR_FILENO);
2949
2950 r = open_terminal(console, O_RDWR);
2951 if (r != STDIN_FILENO) {
2952 if (r >= 0) {
2953 safe_close(r);
2954 r = -EINVAL;
2955 }
2956
2957 return log_error_errno(r, "Failed to open console: %m");
2958 }
2959
2960 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2961 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2962 return log_error_errno(errno, "Failed to duplicate console: %m");
2963 }
2964
2965 r = reset_audit_loginuid();
2966 if (r < 0)
2967 return r;
2968
2969 /* Mark everything as slave, so that we still
2970 * receive mounts from the real root, but don't
2971 * propagate mounts to the real root. */
60e76d48
ZJS
2972 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2973 if (r < 0)
2974 return r;
03cfe0d5
LP
2975
2976 r = mount_devices(directory,
2977 root_device, root_device_rw,
2978 home_device, home_device_rw,
a6bc7db9
LP
2979 srv_device, srv_device_rw,
2980 esp_device);
03cfe0d5
LP
2981 if (r < 0)
2982 return r;
2983
391567f4
LP
2984 r = determine_uid_shift(directory);
2985 if (r < 0)
2986 return r;
2987
0fd9563f
ZJS
2988 r = detect_unified_cgroup_hierarchy(directory);
2989 if (r < 0)
2990 return r;
2991
0de7acce 2992 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2993 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2994 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2995 if (l < 0)
2996 return log_error_errno(errno, "Failed to send UID shift: %m");
2997 if (l != sizeof(arg_uid_shift)) {
2998 log_error("Short write while sending UID shift.");
2999 return -EIO;
3000 }
0e7ac751 3001
0de7acce 3002 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3003 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3004 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3005 * not it will pick a different one, and send it back to us. */
3006
3007 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3008 if (l < 0)
3009 return log_error_errno(errno, "Failed to recv UID shift: %m");
3010 if (l != sizeof(arg_uid_shift)) {
595bfe7d 3011 log_error("Short read while receiving UID shift.");
0e7ac751
LP
3012 return -EIO;
3013 }
3014 }
3015
3016 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
3017 }
3018
03cfe0d5 3019 /* Turn directory into bind mount */
60e76d48
ZJS
3020 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3021 if (r < 0)
3022 return r;
03cfe0d5 3023
19caffac
AC
3024 /* Mark everything as shared so our mounts get propagated down. This is
3025 * required to make new bind mounts available in systemd services
3026 * inside the containter that create a new mount namespace.
3027 * See https://github.com/systemd/systemd/issues/3860
3028 * Further submounts (such as /dev) done after this will inherit the
3029 * shared propagation mode.*/
60e76d48
ZJS
3030 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3031 if (r < 0)
3032 return r;
19caffac 3033
7336138e 3034 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
3035 if (r < 0)
3036 return r;
3037
0de7acce
LP
3038 r = setup_volatile(
3039 directory,
3040 arg_volatile_mode,
3041 arg_userns_mode != USER_NAMESPACE_NO,
3042 arg_uid_shift,
3043 arg_uid_range,
3044 arg_selinux_context);
03cfe0d5
LP
3045 if (r < 0)
3046 return r;
3047
0de7acce
LP
3048 r = setup_volatile_state(
3049 directory,
3050 arg_volatile_mode,
3051 arg_userns_mode != USER_NAMESPACE_NO,
3052 arg_uid_shift,
3053 arg_uid_range,
3054 arg_selinux_context);
03cfe0d5
LP
3055 if (r < 0)
3056 return r;
3057
03cfe0d5
LP
3058 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3059 if (r < 0)
3060 return r;
3061
03cfe0d5 3062 if (arg_read_only) {
6b7c9f8b 3063 r = bind_remount_recursive(directory, true, NULL);
03cfe0d5
LP
3064 if (r < 0)
3065 return log_error_errno(r, "Failed to make tree read-only: %m");
3066 }
3067
0de7acce
LP
3068 r = mount_all(directory,
3069 arg_userns_mode != USER_NAMESPACE_NO,
3070 false,
3071 arg_private_network,
3072 arg_uid_shift,
3073 arg_uid_range,
3074 arg_selinux_apifs_context);
03cfe0d5
LP
3075 if (r < 0)
3076 return r;
3077
07fa00f9
LP
3078 r = copy_devnodes(directory);
3079 if (r < 0)
03cfe0d5
LP
3080 return r;
3081
3082 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3083
07fa00f9
LP
3084 r = setup_pts(directory);
3085 if (r < 0)
03cfe0d5
LP
3086 return r;
3087
3088 r = setup_propagate(directory);
3089 if (r < 0)
3090 return r;
3091
3092 r = setup_dev_console(directory, console);
3093 if (r < 0)
3094 return r;
3095
520e0d54 3096 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3097 if (r < 0)
3098 return r;
3099
3100 r = setup_timezone(directory);
3101 if (r < 0)
3102 return r;
3103
3104 r = setup_resolv_conf(directory);
3105 if (r < 0)
3106 return r;
3107
e01ff70a
MS
3108 r = setup_machine_id(directory);
3109 if (r < 0)
3110 return r;
3111
03cfe0d5
LP
3112 r = setup_journal(directory);
3113 if (r < 0)
3114 return r;
3115
0de7acce
LP
3116 r = mount_custom(
3117 directory,
3118 arg_custom_mounts,
3119 arg_n_custom_mounts,
3120 arg_userns_mode != USER_NAMESPACE_NO,
3121 arg_uid_shift,
3122 arg_uid_range,
3123 arg_selinux_apifs_context);
03cfe0d5
LP
3124 if (r < 0)
3125 return r;
3126
5a8ff0e6 3127 if (!arg_use_cgns || !cg_ns_supported()) {
0996ef00
CB
3128 r = mount_cgroups(
3129 directory,
3130 arg_unified_cgroup_hierarchy,
3131 arg_userns_mode != USER_NAMESPACE_NO,
3132 arg_uid_shift,
3133 arg_uid_range,
5a8ff0e6 3134 arg_selinux_apifs_context,
ada54120 3135 false);
0996ef00
CB
3136 if (r < 0)
3137 return r;
3138 }
03cfe0d5
LP
3139
3140 r = mount_move_root(directory);
3141 if (r < 0)
3142 return log_error_errno(r, "Failed to move root directory: %m");
3143
9c1e04d0
AP
3144 fd = setup_sd_notify_child();
3145 if (fd < 0)
3146 return fd;
3147
03cfe0d5 3148 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
0c582db0 3149 arg_clone_ns_flags |
03cfe0d5 3150 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3151 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3152 if (pid < 0)
3153 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3154 if (pid == 0) {
3155 pid_socket = safe_close(pid_socket);
e01ff70a 3156 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3157 notify_socket = safe_close(notify_socket);
825d5287 3158 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3159
3160 /* The inner child has all namespaces that are
3161 * requested, so that we all are owned by the user if
3162 * user namespaces are turned on. */
3163
f757855e 3164 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3165 if (r < 0)
3166 _exit(EXIT_FAILURE);
3167
3168 _exit(EXIT_SUCCESS);
3169 }
3170
3171 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3172 if (l < 0)
3173 return log_error_errno(errno, "Failed to send PID: %m");
3174 if (l != sizeof(pid)) {
3175 log_error("Short write while sending PID.");
3176 return -EIO;
3177 }
3178
e01ff70a
MS
3179 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3180 if (l < 0)
3181 return log_error_errno(errno, "Failed to send machine ID: %m");
3182 if (l != sizeof(arg_uuid)) {
3183 log_error("Short write while sending machine ID.");
3184 return -EIO;
3185 }
3186
9c1e04d0
AP
3187 l = send_one_fd(notify_socket, fd, 0);
3188 if (l < 0)
3189 return log_error_errno(errno, "Failed to send notify fd: %m");
3190
03cfe0d5 3191 pid_socket = safe_close(pid_socket);
e01ff70a 3192 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3193 notify_socket = safe_close(notify_socket);
327e26d6
KN
3194 kmsg_socket = safe_close(kmsg_socket);
3195 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3196
3197 return 0;
3198}
3199
0e7ac751
LP
3200static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3201 unsigned n_tries = 100;
3202 uid_t candidate;
3203 int r;
3204
3205 assert(shift);
3206 assert(ret_lock_file);
0de7acce 3207 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3208 assert(arg_uid_range == 0x10000U);
3209
3210 candidate = *shift;
3211
3212 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3213
3214 for (;;) {
3215 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3216 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3217
3218 if (--n_tries <= 0)
3219 return -EBUSY;
3220
3221 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3222 goto next;
3223 if ((candidate & UINT32_C(0xFFFF)) != 0)
3224 goto next;
3225
3226 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3227 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3228 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3229 goto next;
3230 if (r < 0)
3231 return r;
3232
3233 /* Make some superficial checks whether the range is currently known in the user database */
3234 if (getpwuid(candidate))
3235 goto next;
3236 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3237 goto next;
3238 if (getgrgid(candidate))
3239 goto next;
3240 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3241 goto next;
3242
3243 *ret_lock_file = lf;
3244 lf = (struct LockFile) LOCK_FILE_INIT;
3245 *shift = candidate;
3246 return 0;
3247
3248 next:
3249 random_bytes(&candidate, sizeof(candidate));
3250 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3251 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3252 }
3253}
3254
03cfe0d5
LP
3255static int setup_uid_map(pid_t pid) {
3256 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3257 int r;
3258
3259 assert(pid > 1);
3260
3261 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3262 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3263 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3264 if (r < 0)
3265 return log_error_errno(r, "Failed to write UID map: %m");
3266
3267 /* We always assign the same UID and GID ranges */
3268 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3269 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3270 if (r < 0)
3271 return log_error_errno(r, "Failed to write GID map: %m");
3272
3273 return 0;
3274}
3275
9c1e04d0 3276static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3277 char buf[NOTIFY_BUFFER_MAX+1];
3278 char *p = NULL;
3279 struct iovec iovec = {
3280 .iov_base = buf,
3281 .iov_len = sizeof(buf)-1,
3282 };
3283 union {
3284 struct cmsghdr cmsghdr;
3285 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3286 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3287 } control = {};
3288 struct msghdr msghdr = {
3289 .msg_iov = &iovec,
3290 .msg_iovlen = 1,
3291 .msg_control = &control,
3292 .msg_controllen = sizeof(control),
3293 };
3294 struct cmsghdr *cmsg;
3295 struct ucred *ucred = NULL;
3296 ssize_t n;
3297 pid_t inner_child_pid;
3298 _cleanup_strv_free_ char **tags = NULL;
3299
3300 assert(userdata);
3301
3302 inner_child_pid = PTR_TO_PID(userdata);
3303
3304 if (revents != EPOLLIN) {
3305 log_warning("Got unexpected poll event for notify fd.");
3306 return 0;
3307 }
3308
3309 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3310 if (n < 0) {
3311 if (errno == EAGAIN || errno == EINTR)
3312 return 0;
3313
3314 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3315 }
3316 cmsg_close_all(&msghdr);
3317
3318 CMSG_FOREACH(cmsg, &msghdr) {
3319 if (cmsg->cmsg_level == SOL_SOCKET &&
3320 cmsg->cmsg_type == SCM_CREDENTIALS &&
3321 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3322
3323 ucred = (struct ucred*) CMSG_DATA(cmsg);
3324 }
3325 }
3326
3327 if (!ucred || ucred->pid != inner_child_pid) {
3328 log_warning("Received notify message without valid credentials. Ignoring.");
3329 return 0;
3330 }
3331
3332 if ((size_t) n >= sizeof(buf)) {
3333 log_warning("Received notify message exceeded maximum size. Ignoring.");
3334 return 0;
3335 }
3336
3337 buf[n] = 0;
3338 tags = strv_split(buf, "\n\r");
3339 if (!tags)
3340 return log_oom();
3341
3342 if (strv_find(tags, "READY=1"))
3343 sd_notifyf(false, "READY=1\n");
3344
3345 p = strv_find_startswith(tags, "STATUS=");
3346 if (p)
3347 sd_notifyf(false, "STATUS=Container running: %s", p);
3348
3349 return 0;
3350}
3351
3352static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3353 int r;
3354 sd_event_source *notify_event_source;
3355
3356 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3357 if (r < 0)
3358 return log_error_errno(r, "Failed to allocate notify event source: %m");
3359
3360 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3361
3362 return 0;
3363}
3364
f757855e
LP
3365static int load_settings(void) {
3366 _cleanup_(settings_freep) Settings *settings = NULL;
3367 _cleanup_fclose_ FILE *f = NULL;
3368 _cleanup_free_ char *p = NULL;
3369 const char *fn, *i;
3370 int r;
3371
3372 /* If all settings are masked, there's no point in looking for
3373 * the settings file */
3374 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3375 return 0;
3376
3377 fn = strjoina(arg_machine, ".nspawn");
3378
3379 /* We first look in the admin's directories in /etc and /run */
3380 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3381 _cleanup_free_ char *j = NULL;
3382
3383 j = strjoin(i, "/", fn, NULL);
3384 if (!j)
3385 return log_oom();
3386
3387 f = fopen(j, "re");
3388 if (f) {
3389 p = j;
3390 j = NULL;
3391
b938cb90 3392 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3393 if (arg_settings_trusted < 0)
3394 arg_settings_trusted = true;
3395
3396 break;
3397 }
3398
3399 if (errno != ENOENT)
3400 return log_error_errno(errno, "Failed to open %s: %m", j);
3401 }
3402
3403 if (!f) {
3404 /* After that, let's look for a file next to the
3405 * actual image we shall boot. */
3406
3407 if (arg_image) {
3408 p = file_in_same_dir(arg_image, fn);
3409 if (!p)
3410 return log_oom();
3411 } else if (arg_directory) {
3412 p = file_in_same_dir(arg_directory, fn);
3413 if (!p)
3414 return log_oom();
3415 }
3416
3417 if (p) {
3418 f = fopen(p, "re");
3419 if (!f && errno != ENOENT)
3420 return log_error_errno(errno, "Failed to open %s: %m", p);
3421
b938cb90 3422 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3423 if (arg_settings_trusted < 0)
3424 arg_settings_trusted = false;
3425 }
3426 }
3427
3428 if (!f)
3429 return 0;
3430
3431 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3432
3433 r = settings_load(f, p, &settings);
3434 if (r < 0)
3435 return r;
3436
3437 /* Copy over bits from the settings, unless they have been
3438 * explicitly masked by command line switches. */
3439
7732f92b
LP
3440 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3441 settings->start_mode >= 0) {
3442 arg_start_mode = settings->start_mode;
f757855e
LP
3443
3444 strv_free(arg_parameters);
3445 arg_parameters = settings->parameters;
3446 settings->parameters = NULL;
3447 }
3448
5f932eb9
LP
3449 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3450 settings->working_directory) {
3451 free(arg_chdir);
3452 arg_chdir = settings->working_directory;
3453 settings->working_directory = NULL;
3454 }
3455
f757855e
LP
3456 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3457 settings->environment) {
3458 strv_free(arg_setenv);
3459 arg_setenv = settings->environment;
3460 settings->environment = NULL;
3461 }
3462
3463 if ((arg_settings_mask & SETTING_USER) == 0 &&
3464 settings->user) {
3465 free(arg_user);
3466 arg_user = settings->user;
3467 settings->user = NULL;
3468 }
3469
3470 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3471 uint64_t plus;
f757855e 3472
0e265674
LP
3473 plus = settings->capability;
3474 if (settings_private_network(settings))
3475 plus |= (1ULL << CAP_NET_ADMIN);
3476
3477 if (!arg_settings_trusted && plus != 0) {
3478 if (settings->capability != 0)
3479 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3480 } else
520e0d54 3481 arg_caps_retain |= plus;
f757855e 3482
520e0d54 3483 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3484 }
3485
3486 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3487 settings->kill_signal > 0)
3488 arg_kill_signal = settings->kill_signal;
3489
3490 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3491 settings->personality != PERSONALITY_INVALID)
3492 arg_personality = settings->personality;
3493
3494 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3495 !sd_id128_is_null(settings->machine_id)) {
3496
3497 if (!arg_settings_trusted)
3498 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3499 else
3500 arg_uuid = settings->machine_id;
3501 }
3502
3503 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3504 settings->read_only >= 0)
3505 arg_read_only = settings->read_only;
3506
3507 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3508 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3509 arg_volatile_mode = settings->volatile_mode;
3510
3511 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3512 settings->n_custom_mounts > 0) {
3513
3514 if (!arg_settings_trusted)
3515 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3516 else {
3517 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3518 arg_custom_mounts = settings->custom_mounts;
3519 arg_n_custom_mounts = settings->n_custom_mounts;
3520
3521 settings->custom_mounts = NULL;
3522 settings->n_custom_mounts = 0;
3523 }
3524 }
3525
3526 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3527 (settings->private_network >= 0 ||
3528 settings->network_veth >= 0 ||
3529 settings->network_bridge ||
22b28dfd 3530 settings->network_zone ||
f757855e
LP
3531 settings->network_interfaces ||
3532 settings->network_macvlan ||
f6d6bad1
LP
3533 settings->network_ipvlan ||
3534 settings->network_veth_extra)) {
f757855e
LP
3535
3536 if (!arg_settings_trusted)
3537 log_warning("Ignoring network settings, file %s is not trusted.", p);
3538 else {
f6d6bad1 3539 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3540 arg_private_network = settings_private_network(settings);
3541
f757855e
LP
3542 strv_free(arg_network_interfaces);
3543 arg_network_interfaces = settings->network_interfaces;
3544 settings->network_interfaces = NULL;
3545
3546 strv_free(arg_network_macvlan);
3547 arg_network_macvlan = settings->network_macvlan;
3548 settings->network_macvlan = NULL;
3549
3550 strv_free(arg_network_ipvlan);
3551 arg_network_ipvlan = settings->network_ipvlan;
3552 settings->network_ipvlan = NULL;
3553
f6d6bad1
LP
3554 strv_free(arg_network_veth_extra);
3555 arg_network_veth_extra = settings->network_veth_extra;
3556 settings->network_veth_extra = NULL;
3557
f757855e
LP
3558 free(arg_network_bridge);
3559 arg_network_bridge = settings->network_bridge;
3560 settings->network_bridge = NULL;
22b28dfd
LP
3561
3562 free(arg_network_zone);
3563 arg_network_zone = settings->network_zone;
3564 settings->network_zone = NULL;
f757855e
LP
3565 }
3566 }
3567
3568 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3569 settings->expose_ports) {
3570
3571 if (!arg_settings_trusted)
3572 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3573 else {
3574 expose_port_free_all(arg_expose_ports);
3575 arg_expose_ports = settings->expose_ports;
3576 settings->expose_ports = NULL;
3577 }
3578 }
3579
0de7acce
LP
3580 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3581 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3582
3583 if (!arg_settings_trusted)
3584 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3585 else {
3586 arg_userns_mode = settings->userns_mode;
3587 arg_uid_shift = settings->uid_shift;
3588 arg_uid_range = settings->uid_range;
3589 arg_userns_chown = settings->userns_chown;
3590 }
3591 }
3592
9c1e04d0
AP
3593 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3594 arg_notify_ready = settings->notify_ready;
3595
f757855e
LP
3596 return 0;
3597}
3598
b0067625
ZJS
3599static int run(int master,
3600 const char* console,
3601 const char *root_device, bool root_device_rw,
3602 const char *home_device, bool home_device_rw,
3603 const char *srv_device, bool srv_device_rw,
3604 const char *esp_device,
3605 bool interactive,
3606 bool secondary,
3607 FDSet *fds,
3608 char veth_name[IFNAMSIZ], bool *veth_created,
3609 union in_addr_union *exposed,
3610 pid_t *pid, int *ret) {
3611
3612 static const struct sigaction sa = {
3613 .sa_handler = nop_signal_handler,
3614 .sa_flags = SA_NOCLDSTOP,
3615 };
3616
3617 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3618 _cleanup_close_ int etc_passwd_lock = -1;
3619 _cleanup_close_pair_ int
3620 kmsg_socket_pair[2] = { -1, -1 },
3621 rtnl_socket_pair[2] = { -1, -1 },
3622 pid_socket_pair[2] = { -1, -1 },
3623 uuid_socket_pair[2] = { -1, -1 },
3624 notify_socket_pair[2] = { -1, -1 },
3625 uid_shift_socket_pair[2] = { -1, -1 };
3626 _cleanup_close_ int notify_socket= -1;
3627 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3628 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3629 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3630 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3631 ContainerStatus container_status = 0;
3632 char last_char = 0;
3633 int ifi = 0, r;
3634 ssize_t l;
3635 sigset_t mask_chld;
3636
3637 assert_se(sigemptyset(&mask_chld) == 0);
3638 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3639
3640 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3641 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3642 * check with getpwuid() if the specific user already exists. Note that /etc might be
3643 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3644 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3645 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3646 * really ours. */
3647
3648 etc_passwd_lock = take_etc_passwd_lock(NULL);
3649 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3650 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3651 }
3652
3653 r = barrier_create(&barrier);
3654 if (r < 0)
3655 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3656
3657 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3658 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3659
3660 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3661 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3662
3663 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3664 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3665
3666 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3667 return log_error_errno(errno, "Failed to create id socket pair: %m");
3668
3669 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3670 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3671
3672 if (arg_userns_mode != USER_NAMESPACE_NO)
3673 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3674 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3675
3676 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3677 * parent's blocking calls and give it a chance to call wait() and terminate. */
3678 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3679 if (r < 0)
3680 return log_error_errno(errno, "Failed to change the signal mask: %m");
3681
3682 r = sigaction(SIGCHLD, &sa, NULL);
3683 if (r < 0)
3684 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3685
3686 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3687 if (*pid < 0)
3688 return log_error_errno(errno, "clone() failed%s: %m",
3689 errno == EINVAL ?
3690 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3691
3692 if (*pid == 0) {
3693 /* The outer child only has a file system namespace. */
3694 barrier_set_role(&barrier, BARRIER_CHILD);
3695
3696 master = safe_close(master);
3697
3698 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3699 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3700 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3701 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3702 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3703 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3704
3705 (void) reset_all_signal_handlers();
3706 (void) reset_signal_mask();
3707
3708 r = outer_child(&barrier,
3709 arg_directory,
3710 console,
3711 root_device, root_device_rw,
3712 home_device, home_device_rw,
3713 srv_device, srv_device_rw,
3714 esp_device,
3715 interactive,
3716 secondary,
3717 pid_socket_pair[1],
3718 uuid_socket_pair[1],
3719 notify_socket_pair[1],
3720 kmsg_socket_pair[1],
3721 rtnl_socket_pair[1],
3722 uid_shift_socket_pair[1],
3723 fds);
3724 if (r < 0)
3725 _exit(EXIT_FAILURE);
3726
3727 _exit(EXIT_SUCCESS);
3728 }
3729
3730 barrier_set_role(&barrier, BARRIER_PARENT);
3731
3732 fds = fdset_free(fds);
3733
3734 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3735 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3736 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3737 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3738 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3739 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3740
3741 if (arg_userns_mode != USER_NAMESPACE_NO) {
3742 /* The child just let us know the UID shift it might have read from the image. */
3743 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3744 if (l < 0)
3745 return log_error_errno(errno, "Failed to read UID shift: %m");
3746
3747 if (l != sizeof arg_uid_shift) {
3748 log_error("Short read while reading UID shift.");
3749 return -EIO;
3750 }
3751
3752 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3753 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3754 * image, but if that's already in use, pick a new one, and report back to the child,
3755 * which one we now picked. */
3756
3757 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3758 if (r < 0)
3759 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3760
3761 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3762 if (l < 0)
3763 return log_error_errno(errno, "Failed to send UID shift: %m");
3764 if (l != sizeof arg_uid_shift) {
3765 log_error("Short write while writing UID shift.");
3766 return -EIO;
3767 }
3768 }
3769 }
3770
3771 /* Wait for the outer child. */
3772 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3773 if (r != 0)
3774 return r < 0 ? r : -EIO;
3775
3776 /* And now retrieve the PID of the inner child. */
3777 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3778 if (l < 0)
3779 return log_error_errno(errno, "Failed to read inner child PID: %m");
3780 if (l != sizeof *pid) {
3781 log_error("Short read while reading inner child PID.");
3782 return -EIO;
3783 }
3784
3785 /* We also retrieve container UUID in case it was generated by outer child */
3786 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3787 if (l < 0)
3788 return log_error_errno(errno, "Failed to read container machine ID: %m");
3789 if (l != sizeof(arg_uuid)) {
3790 log_error("Short read while reading container machined ID.");
3791 return -EIO;
3792 }
3793
3794 /* We also retrieve the socket used for notifications generated by outer child */
3795 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3796 if (notify_socket < 0)
3797 return log_error_errno(notify_socket,
3798 "Failed to receive notification socket from the outer child: %m");
3799
3800 log_debug("Init process invoked as PID "PID_FMT, *pid);
3801
3802 if (arg_userns_mode != USER_NAMESPACE_NO) {
3803 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3804 log_error("Child died too early.");
3805 return -ESRCH;
3806 }
3807
3808 r = setup_uid_map(*pid);
3809 if (r < 0)
3810 return r;
3811
3812 (void) barrier_place(&barrier); /* #2 */
3813 }
3814
3815 if (arg_private_network) {
3816
3817 r = move_network_interfaces(*pid, arg_network_interfaces);
3818 if (r < 0)
3819 return r;
3820
3821 if (arg_network_veth) {
3822 r = setup_veth(arg_machine, *pid, veth_name,
3823 arg_network_bridge || arg_network_zone);
3824 if (r < 0)
3825 return r;
3826 else if (r > 0)
3827 ifi = r;
3828
3829 if (arg_network_bridge) {
3830 /* Add the interface to a bridge */
3831 r = setup_bridge(veth_name, arg_network_bridge, false);
3832 if (r < 0)
3833 return r;
3834 if (r > 0)
3835 ifi = r;
3836 } else if (arg_network_zone) {
3837 /* Add the interface to a bridge, possibly creating it */
3838 r = setup_bridge(veth_name, arg_network_zone, true);
3839 if (r < 0)
3840 return r;
3841 if (r > 0)
3842 ifi = r;
3843 }
3844 }
3845
3846 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3847 if (r < 0)
3848 return r;
3849
3850 /* We created the primary and extra veth links now; let's remember this, so that we know to
3851 remove them later on. Note that we don't bother with removing veth links that were created
3852 here when their setup failed half-way, because in that case the kernel should be able to
3853 remove them on its own, since they cannot be referenced by anything yet. */
3854 *veth_created = true;
3855
3856 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3857 if (r < 0)
3858 return r;
3859
3860 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3861 if (r < 0)
3862 return r;
3863 }
3864
3865 if (arg_register) {
3866 r = register_machine(
3867 arg_machine,
3868 *pid,
3869 arg_directory,
3870 arg_uuid,
3871 ifi,
3872 arg_slice,
3873 arg_custom_mounts, arg_n_custom_mounts,
3874 arg_kill_signal,
3875 arg_property,
3876 arg_keep_unit,
3877 arg_container_service_name);
3878 if (r < 0)
3879 return r;
3880 }
3881
f0bef277 3882 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
b0067625
ZJS
3883 if (r < 0)
3884 return r;
3885
3886 if (arg_keep_unit) {
3887 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3888 if (r < 0)
3889 return r;
3890 }
3891
3892 r = chown_cgroup(*pid, arg_uid_shift);
3893 if (r < 0)
3894 return r;
3895
3896 /* Notify the child that the parent is ready with all
3897 * its setup (including cgroup-ification), and that
3898 * the child can now hand over control to the code to
3899 * run inside the container. */
3900 (void) barrier_place(&barrier); /* #3 */
3901
3902 /* Block SIGCHLD here, before notifying child.
3903 * process_pty() will handle it with the other signals. */
3904 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3905
3906 /* Reset signal to default */
3907 r = default_signals(SIGCHLD, -1);
3908 if (r < 0)
3909 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3910
3911 r = sd_event_new(&event);
3912 if (r < 0)
3913 return log_error_errno(r, "Failed to get default event source: %m");
3914
3915 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
3916 if (r < 0)
3917 return r;
3918
3919 /* Let the child know that we are ready and wait that the child is completely ready now. */
3920 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3921 log_error("Child died too early.");
3922 return -ESRCH;
3923 }
3924
3925 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3926 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3927 etc_passwd_lock = safe_close(etc_passwd_lock);
3928
3929 sd_notifyf(false,
3930 "STATUS=Container running.\n"
3931 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3932 if (!arg_notify_ready)
3933 sd_notify(false, "READY=1\n");
3934
3935 if (arg_kill_signal > 0) {
3936 /* Try to kill the init system on SIGINT or SIGTERM */
3937 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3938 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3939 } else {
3940 /* Immediately exit */
3941 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3942 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3943 }
3944
3945 /* simply exit on sigchld */
3946 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3947
3948 if (arg_expose_ports) {
3949 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3950 if (r < 0)
3951 return r;
3952
3953 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3954 }
3955
3956 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3957
3958 r = pty_forward_new(event, master,
3959 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3960 &forward);
3961 if (r < 0)
3962 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3963
3964 r = sd_event_loop(event);
3965 if (r < 0)
3966 return log_error_errno(r, "Failed to run event loop: %m");
3967
3968 pty_forward_get_last_char(forward, &last_char);
3969
3970 forward = pty_forward_free(forward);
3971
3972 if (!arg_quiet && last_char != '\n')
3973 putc('\n', stdout);
3974
3975 /* Kill if it is not dead yet anyway */
3976 if (arg_register && !arg_keep_unit)
3977 terminate_machine(*pid);
3978
3979 /* Normally redundant, but better safe than sorry */
3980 kill(*pid, SIGKILL);
3981
3982 r = wait_for_container(*pid, &container_status);
3983 *pid = 0;
3984
3985 if (r < 0)
3986 /* We failed to wait for the container, or the container exited abnormally. */
3987 return r;
3988 if (r > 0 || container_status == CONTAINER_TERMINATED) {
27e29a1e
ZJS
3989 /* r > 0 → The container exited with a non-zero status.
3990 * As a special case, we need to replace 133 with a different value,
3991 * because 133 is special-cased in the service file to reboot the container.
3992 * otherwise → The container exited with zero status and a reboot was not requested.
3993 */
3994 if (r == 133)
3995 r = EXIT_FAILURE; /* replace 133 with the general failure code */
b0067625 3996 *ret = r;
b0067625
ZJS
3997 return 0; /* finito */
3998 }
3999
4000 /* CONTAINER_REBOOTED, loop again */
4001
4002 if (arg_keep_unit) {
4003 /* Special handling if we are running as a service: instead of simply
4004 * restarting the machine we want to restart the entire service, so let's
4005 * inform systemd about this with the special exit code 133. The service
4006 * file uses RestartForceExitStatus=133 so that this results in a full
4007 * nspawn restart. This is necessary since we might have cgroup parameters
4008 * set we want to have flushed out. */
4009 *ret = 0;
4010 return 133;
4011 }
4012
4013 expose_port_flush(arg_expose_ports, exposed);
4014
4015 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4016 *veth_created = false;
4017 return 1; /* loop again */
4018}
4019
03cfe0d5
LP
4020int main(int argc, char *argv[]) {
4021
a6bc7db9 4022 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
4023 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4024 _cleanup_close_ int master = -1, image_fd = -1;
4025 _cleanup_fdset_free_ FDSet *fds = NULL;
b0067625 4026 int r, n_fd_passed, loop_nr = -1, ret = EXIT_FAILURE;
5aa3eba5 4027 char veth_name[IFNAMSIZ] = "";
03cfe0d5 4028 bool secondary = false, remove_subvol = false;
03cfe0d5 4029 pid_t pid = 0;
03cfe0d5
LP
4030 union in_addr_union exposed = {};
4031 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
7513c5b8 4032 bool interactive, veth_created = false;
03cfe0d5
LP
4033
4034 log_parse_environment();
4035 log_open();
4036
7732f92b
LP
4037 /* Make sure rename_process() in the stub init process can work */
4038 saved_argv = argv;
4039 saved_argc = argc;
4040
03cfe0d5
LP
4041 r = parse_argv(argc, argv);
4042 if (r <= 0)
4043 goto finish;
4044
03cfe0d5
LP
4045 if (geteuid() != 0) {
4046 log_error("Need to be root.");
4047 r = -EPERM;
4048 goto finish;
4049 }
f757855e
LP
4050 r = determine_names();
4051 if (r < 0)
4052 goto finish;
4053
4054 r = load_settings();
4055 if (r < 0)
4056 goto finish;
4057
4058 r = verify_arguments();
4059 if (r < 0)
4060 goto finish;
03cfe0d5
LP
4061
4062 n_fd_passed = sd_listen_fds(false);
4063 if (n_fd_passed > 0) {
4064 r = fdset_new_listen_fds(&fds, false);
4065 if (r < 0) {
4066 log_error_errno(r, "Failed to collect file descriptors: %m");
4067 goto finish;
4068 }
4069 }
4070
4071 if (arg_directory) {
4072 assert(!arg_image);
4073
4074 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4075 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4076 r = -EINVAL;
4077 goto finish;
4078 }
4079
4080 if (arg_ephemeral) {
4081 _cleanup_free_ char *np = NULL;
4082
4083 /* If the specified path is a mount point we
4084 * generate the new snapshot immediately
4085 * inside it under a random name. However if
4086 * the specified is not a mount point we
4087 * create the new snapshot in the parent
4088 * directory, just next to it. */
e26d6ce5 4089 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4090 if (r < 0) {
4091 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4092 goto finish;
4093 }
4094 if (r > 0)
770b5ce4 4095 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4096 else
770b5ce4 4097 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4098 if (r < 0) {
4099 log_error_errno(r, "Failed to generate name for snapshot: %m");
4100 goto finish;
4101 }
4102
4103 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4104 if (r < 0) {
4105 log_error_errno(r, "Failed to lock %s: %m", np);
4106 goto finish;
4107 }
4108
5bcd08db 4109 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
4110 if (r < 0) {
4111 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4112 goto finish;
ec16945e
LP
4113 }
4114
4115 free(arg_directory);
4116 arg_directory = np;
8a16a7b4 4117 np = NULL;
ec16945e
LP
4118
4119 remove_subvol = true;
30535c16
LP
4120
4121 } else {
4122 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4123 if (r == -EBUSY) {
4124 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4125 goto finish;
4126 }
4127 if (r < 0) {
4128 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 4129 goto finish;
30535c16
LP
4130 }
4131
4132 if (arg_template) {
5bcd08db 4133 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
4134 if (r == -EEXIST) {
4135 if (!arg_quiet)
4136 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4137 } else if (r < 0) {
83521414 4138 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4139 goto finish;
4140 } else {
4141 if (!arg_quiet)
4142 log_info("Populated %s from template %s.", arg_directory, arg_template);
4143 }
4144 }
ec16945e
LP
4145 }
4146
7732f92b 4147 if (arg_start_mode == START_BOOT) {
1b9e5b12 4148 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4149 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4150 r = -EINVAL;
1b9e5b12
LP
4151 goto finish;
4152 }
4153 } else {
4154 const char *p;
4155
16fb773e
LP
4156 p = strjoina(arg_directory, "/usr/");
4157 if (laccess(p, F_OK) < 0) {
4158 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 4159 r = -EINVAL;
1b9e5b12 4160 goto finish;
1b9e5b12
LP
4161 }
4162 }
ec16945e 4163
6b9132a9 4164 } else {
1b9e5b12 4165 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4166
ec16945e
LP
4167 assert(arg_image);
4168 assert(!arg_template);
4169
30535c16
LP
4170 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4171 if (r == -EBUSY) {
4172 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4173 goto finish;
4174 }
4175 if (r < 0) {
4176 r = log_error_errno(r, "Failed to create image lock: %m");
4177 goto finish;
4178 }
4179
1b9e5b12 4180 if (!mkdtemp(template)) {
56f64d95 4181 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4182 r = -errno;
6b9132a9 4183 goto finish;
1b9e5b12 4184 }
6b9132a9 4185
1b9e5b12
LP
4186 arg_directory = strdup(template);
4187 if (!arg_directory) {
4188 r = log_oom();
4189 goto finish;
6b9132a9 4190 }
88213476 4191
1b9e5b12
LP
4192 image_fd = setup_image(&device_path, &loop_nr);
4193 if (image_fd < 0) {
4194 r = image_fd;
842f3b0f
LP
4195 goto finish;
4196 }
1b9e5b12 4197
4d9f07b4
LP
4198 r = dissect_image(image_fd,
4199 &root_device, &root_device_rw,
4200 &home_device, &home_device_rw,
4201 &srv_device, &srv_device_rw,
a6bc7db9 4202 &esp_device,
4d9f07b4 4203 &secondary);
1b9e5b12
LP
4204 if (r < 0)
4205 goto finish;
842f3b0f 4206 }
842f3b0f 4207
5a8af538
LP
4208 r = custom_mounts_prepare();
4209 if (r < 0)
4210 goto finish;
4211
03cfe0d5
LP
4212 interactive =
4213 isatty(STDIN_FILENO) > 0 &&
4214 isatty(STDOUT_FILENO) > 0;
9c857b9d 4215
db7feb7e
LP
4216 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4217 if (master < 0) {
ec16945e 4218 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4219 goto finish;
4220 }
4221
611b312b
LP
4222 r = ptsname_malloc(master, &console);
4223 if (r < 0) {
4224 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 4225 goto finish;
68b02049
DW
4226 }
4227
4228 if (arg_selinux_apifs_context) {
4229 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4230 if (r < 0)
4231 goto finish;
a258bf26
LP
4232 }
4233
a258bf26 4234 if (unlockpt(master) < 0) {
ec16945e 4235 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4236 goto finish;
4237 }
4238
9c857b9d
LP
4239 if (!arg_quiet)
4240 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4241 arg_machine, arg_image ?: arg_directory);
4242
72c0a2c2 4243 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4244
03cfe0d5
LP
4245 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4246 r = log_error_errno(errno, "Failed to become subreaper: %m");
4247 goto finish;
4248 }
4249
d87be9b0 4250 for (;;) {
b0067625
ZJS
4251 r = run(master,
4252 console,
4253 root_device, root_device_rw,
4254 home_device, home_device_rw,
4255 srv_device, srv_device_rw,
4256 esp_device,
4257 interactive, secondary,
4258 fds,
4259 veth_name, &veth_created,
4260 &exposed,
4261 &pid, &ret);
4262 if (r <= 0)
d87be9b0 4263 break;
d87be9b0 4264 }
88213476
LP
4265
4266finish:
af4ec430
LP
4267 sd_notify(false,
4268 "STOPPING=1\n"
4269 "STATUS=Terminating...");
4270
9444b1f2
LP
4271 if (pid > 0)
4272 kill(pid, SIGKILL);
88213476 4273
503546da
LP
4274 /* Try to flush whatever is still queued in the pty */
4275 if (master >= 0)
59f448cf 4276 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 4277
03cfe0d5
LP
4278 loop_remove(loop_nr, &image_fd);
4279
ec16945e
LP
4280 if (remove_subvol && arg_directory) {
4281 int k;
4282
5bcd08db 4283 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
4284 if (k < 0)
4285 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4286 }
4287
785890ac
LP
4288 if (arg_machine) {
4289 const char *p;
4290
63c372cb 4291 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4292 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4293 }
4294
7a8f6325 4295 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4296
4297 if (veth_created)
4298 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4299 (void) remove_bridge(arg_network_zone);
f757855e 4300
04d391da 4301 free(arg_directory);
ec16945e
LP
4302 free(arg_template);
4303 free(arg_image);
7027ff61 4304 free(arg_machine);
c74e630d 4305 free(arg_user);
5f932eb9 4306 free(arg_chdir);
c74e630d 4307 strv_free(arg_setenv);
f757855e 4308 free(arg_network_bridge);
c74e630d
LP
4309 strv_free(arg_network_interfaces);
4310 strv_free(arg_network_macvlan);
4bbfe7ad 4311 strv_free(arg_network_ipvlan);
f6d6bad1 4312 strv_free(arg_network_veth_extra);
f757855e
LP
4313 strv_free(arg_parameters);
4314 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4315 expose_port_free_all(arg_expose_ports);
6d0b55c2 4316
ec16945e 4317 return r < 0 ? EXIT_FAILURE : ret;
88213476 4318}