]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
systemctl: allow disable on the unit file path, but warn about it (#3806)
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
88213476 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
88213476
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
0e7ac751 25#include <grp.h>
1b9e5b12 26#include <linux/loop.h>
0e7ac751 27#include <pwd.h>
8fe0087e 28#include <sched.h>
8fe0087e
LP
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
1b9e5b12 31#endif
8fe0087e
LP
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
1b9e5b12 42
1f0cd86b 43#include "sd-daemon.h"
1f0cd86b 44#include "sd-id128.h"
8fe0087e 45
b5efdb8a 46#include "alloc-util.h"
8fe0087e
LP
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
8fe0087e 51#include "cap-list.h"
430f0182 52#include "capability-util.h"
04d391da 53#include "cgroup-util.h"
8fe0087e 54#include "copy.h"
4fc9982c 55#include "dev-setup.h"
8fe0087e 56#include "env-util.h"
3ffd4af2 57#include "fd-util.h"
842f3b0f 58#include "fdset.h"
a5c32cff 59#include "fileio.h"
8fe0087e 60#include "formats-util.h"
f4f15635 61#include "fs-util.h"
1b9e5b12 62#include "gpt.h"
8fe0087e 63#include "hostname-util.h"
910fd145 64#include "id128-util.h"
8fe0087e
LP
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
7336138e 77#include "nspawn-patch-uid.h"
07630cea 78#include "nspawn-register.h"
910fd145 79#include "nspawn-seccomp.h"
07630cea
LP
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
7732f92b 82#include "nspawn-stub-pid1.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
8869a0b4 88#include "raw-clone.h"
8fe0087e 89#include "rm-rf.h"
68b02049 90#include "selinux-util.h"
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
0e7ac751 103/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
065d31c3
LP
104 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
105 * may have their own allocation ranges too. */
0e7ac751
LP
106#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
107#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
065d31c3 108
9c1e04d0
AP
109/* nspawn listens on the socket whose path is given by the constant NSPAWN_NOTIFY_SOCKET_PATH.
110 * NSPAWN_NOTIFY_SOCKET_PATH is relative to the container.
111 * The init process in the container can send messages to nspawn following the sd_notify(3) protocol. */
112#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
0e7ac751 113
113cea80
DH
/* How a container run ended.
 * NOTE(review): the consumers of this enum are outside the visible part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
118
57fb9fb5
LP
/* Journal linking mode, as selected by --link-journal= (or -j). The "try-" variants of the
 * option map to LINK_GUEST/LINK_HOST as well and are distinguished by arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host, try-host */
        LINK_GUEST,     /* --link-journal=guest, try-guest, -j */
} LinkJournal;
88213476
LP
125
126static char *arg_directory = NULL;
ec16945e 127static char *arg_template = NULL;
5f932eb9 128static char *arg_chdir = NULL;
687d0825 129static char *arg_user = NULL;
9444b1f2 130static sd_id128_t arg_uuid = {};
7027ff61 131static char *arg_machine = NULL;
c74e630d
LP
132static const char *arg_selinux_context = NULL;
133static const char *arg_selinux_apifs_context = NULL;
9444b1f2 134static const char *arg_slice = NULL;
ff01d048 135static bool arg_private_network = false;
bc2f673e 136static bool arg_read_only = false;
7732f92b 137static StartMode arg_start_mode = START_PID1;
ec16945e 138static bool arg_ephemeral = false;
57fb9fb5 139static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 140static bool arg_link_journal_try = false;
520e0d54 141static uint64_t arg_caps_retain =
50b52222
LP
142 (1ULL << CAP_AUDIT_CONTROL) |
143 (1ULL << CAP_AUDIT_WRITE) |
5076f0cc
LP
144 (1ULL << CAP_CHOWN) |
145 (1ULL << CAP_DAC_OVERRIDE) |
146 (1ULL << CAP_DAC_READ_SEARCH) |
147 (1ULL << CAP_FOWNER) |
148 (1ULL << CAP_FSETID) |
149 (1ULL << CAP_IPC_OWNER) |
150 (1ULL << CAP_KILL) |
151 (1ULL << CAP_LEASE) |
152 (1ULL << CAP_LINUX_IMMUTABLE) |
50b52222 153 (1ULL << CAP_MKNOD) |
5076f0cc
LP
154 (1ULL << CAP_NET_BIND_SERVICE) |
155 (1ULL << CAP_NET_BROADCAST) |
156 (1ULL << CAP_NET_RAW) |
5076f0cc 157 (1ULL << CAP_SETFCAP) |
50b52222 158 (1ULL << CAP_SETGID) |
5076f0cc
LP
159 (1ULL << CAP_SETPCAP) |
160 (1ULL << CAP_SETUID) |
161 (1ULL << CAP_SYS_ADMIN) |
50b52222 162 (1ULL << CAP_SYS_BOOT) |
5076f0cc
LP
163 (1ULL << CAP_SYS_CHROOT) |
164 (1ULL << CAP_SYS_NICE) |
165 (1ULL << CAP_SYS_PTRACE) |
d87be9b0 166 (1ULL << CAP_SYS_RESOURCE) |
50b52222 167 (1ULL << CAP_SYS_TTY_CONFIG);
5a8af538
LP
168static CustomMount *arg_custom_mounts = NULL;
169static unsigned arg_n_custom_mounts = 0;
f4889f65 170static char **arg_setenv = NULL;
284c0b91 171static bool arg_quiet = false;
8a96d94e 172static bool arg_share_system = false;
eb91eb18 173static bool arg_register = true;
89f7c846 174static bool arg_keep_unit = false;
aa28aefe 175static char **arg_network_interfaces = NULL;
c74e630d 176static char **arg_network_macvlan = NULL;
4bbfe7ad 177static char **arg_network_ipvlan = NULL;
69c79d3c 178static bool arg_network_veth = false;
f6d6bad1 179static char **arg_network_veth_extra = NULL;
f757855e 180static char *arg_network_bridge = NULL;
22b28dfd 181static char *arg_network_zone = NULL;
050f7277 182static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 183static char *arg_image = NULL;
f757855e 184static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 185static ExposePort *arg_expose_ports = NULL;
f36933fe 186static char **arg_property = NULL;
0de7acce 187static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
6dac160c 188static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
0de7acce 189static bool arg_userns_chown = false;
c6c8f6e2 190static int arg_kill_signal = 0;
efdb0237 191static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
192static SettingsMask arg_settings_mask = 0;
193static int arg_settings_trusted = -1;
194static char **arg_parameters = NULL;
6aadfa4c 195static const char *arg_container_service_name = "systemd-nspawn";
9c1e04d0 196static bool arg_notify_ready = false;
88213476 197
601185b4 198static void help(void) {
88213476
LP
199 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
200 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
201 " -h --help Show this help\n"
202 " --version Print version string\n"
69c79d3c 203 " -q --quiet Do not show status information\n"
1b9e5b12 204 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
205 " --template=PATH Initialize root directory from template directory,\n"
206 " if missing\n"
207 " -x --ephemeral Run container with snapshot of root directory, and\n"
208 " remove it after exit\n"
209 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 210 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 211 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 212 " --chdir=PATH Set working directory in the container\n"
a8828ed9 213 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 214 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 215 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 216 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 217 " --property=NAME=VALUE Set scope unit property\n"
19aac838 218 " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
03cfe0d5 219 " --private-users[=UIDBASE[:NUIDS]]\n"
19aac838
LP
220 " Run within user namespace, user configured UID/GID range\n"
221 " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
69c79d3c
LP
222 " --private-network Disable network in container\n"
223 " --network-interface=INTERFACE\n"
224 " Assign an existing network interface to the\n"
225 " container\n"
c74e630d
LP
226 " --network-macvlan=INTERFACE\n"
227 " Create a macvlan network interface based on an\n"
228 " existing network interface to the container\n"
4bbfe7ad
TG
229 " --network-ipvlan=INTERFACE\n"
230 " Create a ipvlan network interface based on an\n"
231 " existing network interface to the container\n"
a8eaaee7 232 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 233 " and container\n"
f6d6bad1
LP
234 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
235 " Add an additional virtual Ethernet link between\n"
236 " host and container\n"
ab046dde 237 " --network-bridge=INTERFACE\n"
a8eaaee7 238 " Add a virtual Ethernet connection between host\n"
ab046dde
TG
239 " and container and add it to an existing bridge on\n"
240 " the host\n"
22b28dfd
LP
241 " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
242 " and add it to an automatically managed bridge interface\n"
6d0b55c2 243 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 244 " Expose a container IP port on the host\n"
82adf6af
LP
245 " -Z --selinux-context=SECLABEL\n"
246 " Set the SELinux security context to be used by\n"
247 " processes in the container\n"
248 " -L --selinux-apifs-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " API/tmpfs file systems in the container\n"
a8828ed9
DW
251 " --capability=CAP In addition to the default, retain specified\n"
252 " capability\n"
253 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 254 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
255 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
256 " host, try-guest, try-host\n"
574edc90 257 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 258 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
259 " --bind=PATH[:PATH[:OPTIONS]]\n"
260 " Bind mount a file or directory from the host into\n"
a8828ed9 261 " the container\n"
5e5bfa6e
EY
262 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
263 " Similar, but creates a read-only bind mount\n"
06c17c39 264 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
265 " --overlay=PATH[:PATH...]:PATH\n"
266 " Create an overlay mount from the host to \n"
267 " the container\n"
268 " --overlay-ro=PATH[:PATH...]:PATH\n"
269 " Similar, but creates a read-only overlay mount\n"
a5f1cb3b 270 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 271 " --share-system Share system namespaces with host\n"
eb91eb18 272 " --register=BOOLEAN Register container as machine\n"
89f7c846 273 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 274 " the service unit nspawn is running in\n"
6d0b55c2 275 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 276 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
9c1e04d0
AP
277 " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n"
278 " accepted values: yes and no\n"
6d0b55c2 279 , program_invocation_short_name);
88213476
LP
280}
281
5a8af538
LP
282static int custom_mounts_prepare(void) {
283 unsigned i;
284 int r;
285
286 /* Ensure the mounts are applied prefix first. */
287 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
288
289 /* Allocate working directories for the overlay file systems that need it */
290 for (i = 0; i < arg_n_custom_mounts; i++) {
291 CustomMount *m = &arg_custom_mounts[i];
292
0de7acce 293 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
294
295 if (arg_userns_chown) {
296 log_error("--private-users-chown may not be combined with custom root mounts.");
297 return -EINVAL;
298 } else if (arg_uid_shift == UID_INVALID) {
299 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
300 return -EINVAL;
301 }
825d5287
RM
302 }
303
5a8af538
LP
304 if (m->type != CUSTOM_MOUNT_OVERLAY)
305 continue;
306
307 if (m->work_dir)
308 continue;
309
310 if (m->read_only)
311 continue;
312
14bcf25c 313 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
314 if (r < 0)
315 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
316 }
317
318 return 0;
319}
320
efdb0237
LP
321static int detect_unified_cgroup_hierarchy(void) {
322 const char *e;
323 int r;
324
325 /* Allow the user to control whether the unified hierarchy is used */
326 e = getenv("UNIFIED_CGROUP_HIERARCHY");
327 if (e) {
328 r = parse_boolean(e);
329 if (r < 0)
330 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
331
332 arg_unified_cgroup_hierarchy = r;
333 return 0;
334 }
335
336 /* Otherwise inherit the default from the host system */
337 r = cg_unified();
338 if (r < 0)
339 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
340
341 arg_unified_cgroup_hierarchy = r;
342 return 0;
343}
344
88213476
LP
345static int parse_argv(int argc, char *argv[]) {
346
a41fe3a2 347 enum {
acbeb427
ZJS
348 ARG_VERSION = 0x100,
349 ARG_PRIVATE_NETWORK,
bc2f673e 350 ARG_UUID,
5076f0cc 351 ARG_READ_ONLY,
57fb9fb5 352 ARG_CAPABILITY,
420c7379 353 ARG_DROP_CAPABILITY,
17fe0523
LP
354 ARG_LINK_JOURNAL,
355 ARG_BIND,
f4889f65 356 ARG_BIND_RO,
06c17c39 357 ARG_TMPFS,
5a8af538
LP
358 ARG_OVERLAY,
359 ARG_OVERLAY_RO,
eb91eb18 360 ARG_SHARE_SYSTEM,
89f7c846 361 ARG_REGISTER,
aa28aefe 362 ARG_KEEP_UNIT,
69c79d3c 363 ARG_NETWORK_INTERFACE,
c74e630d 364 ARG_NETWORK_MACVLAN,
4bbfe7ad 365 ARG_NETWORK_IPVLAN,
ab046dde 366 ARG_NETWORK_BRIDGE,
22b28dfd 367 ARG_NETWORK_ZONE,
f6d6bad1 368 ARG_NETWORK_VETH_EXTRA,
6afc95b7 369 ARG_PERSONALITY,
4d9f07b4 370 ARG_VOLATILE,
ec16945e 371 ARG_TEMPLATE,
f36933fe 372 ARG_PROPERTY,
6dac160c 373 ARG_PRIVATE_USERS,
c6c8f6e2 374 ARG_KILL_SIGNAL,
f757855e 375 ARG_SETTINGS,
5f932eb9 376 ARG_CHDIR,
7336138e 377 ARG_PRIVATE_USERS_CHOWN,
9c1e04d0 378 ARG_NOTIFY_READY,
a41fe3a2
LP
379 };
380
88213476 381 static const struct option options[] = {
aa28aefe
LP
382 { "help", no_argument, NULL, 'h' },
383 { "version", no_argument, NULL, ARG_VERSION },
384 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
385 { "template", required_argument, NULL, ARG_TEMPLATE },
386 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
387 { "user", required_argument, NULL, 'u' },
388 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
7732f92b 389 { "as-pid2", no_argument, NULL, 'a' },
aa28aefe
LP
390 { "boot", no_argument, NULL, 'b' },
391 { "uuid", required_argument, NULL, ARG_UUID },
392 { "read-only", no_argument, NULL, ARG_READ_ONLY },
393 { "capability", required_argument, NULL, ARG_CAPABILITY },
394 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
395 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
396 { "bind", required_argument, NULL, ARG_BIND },
397 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 398 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
399 { "overlay", required_argument, NULL, ARG_OVERLAY },
400 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
401 { "machine", required_argument, NULL, 'M' },
402 { "slice", required_argument, NULL, 'S' },
a5f1cb3b 403 { "setenv", required_argument, NULL, 'E' },
aa28aefe
LP
404 { "selinux-context", required_argument, NULL, 'Z' },
405 { "selinux-apifs-context", required_argument, NULL, 'L' },
406 { "quiet", no_argument, NULL, 'q' },
407 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
408 { "register", required_argument, NULL, ARG_REGISTER },
409 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
410 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 411 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 412 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 413 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 414 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 415 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
22b28dfd 416 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
6afc95b7 417 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 418 { "image", required_argument, NULL, 'i' },
4d9f07b4 419 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 420 { "port", required_argument, NULL, 'p' },
f36933fe 421 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 422 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
7336138e 423 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
c6c8f6e2 424 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 425 { "settings", required_argument, NULL, ARG_SETTINGS },
5f932eb9 426 { "chdir", required_argument, NULL, ARG_CHDIR },
9c1e04d0 427 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
eb9da376 428 {}
88213476
LP
429 };
430
9444b1f2 431 int c, r;
6aadfa4c 432 const char *p, *e;
a42c8b54 433 uint64_t plus = 0, minus = 0;
f757855e 434 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
435
436 assert(argc >= 0);
437 assert(argv);
438
19aac838 439 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
88213476
LP
440
441 switch (c) {
442
443 case 'h':
601185b4
ZJS
444 help();
445 return 0;
88213476 446
acbeb427 447 case ARG_VERSION:
3f6fd1ba 448 return version();
acbeb427 449
88213476 450 case 'D':
0f03c2a4 451 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 452 if (r < 0)
0f03c2a4 453 return r;
ec16945e
LP
454 break;
455
456 case ARG_TEMPLATE:
0f03c2a4 457 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 458 if (r < 0)
0f03c2a4 459 return r;
88213476
LP
460 break;
461
1b9e5b12 462 case 'i':
0f03c2a4 463 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 464 if (r < 0)
0f03c2a4 465 return r;
ec16945e
LP
466 break;
467
468 case 'x':
469 arg_ephemeral = true;
1b9e5b12
LP
470 break;
471
687d0825 472 case 'u':
2fc09a9c
DM
473 r = free_and_strdup(&arg_user, optarg);
474 if (r < 0)
7027ff61 475 return log_oom();
687d0825 476
f757855e 477 arg_settings_mask |= SETTING_USER;
687d0825
MV
478 break;
479
22b28dfd
LP
480 case ARG_NETWORK_ZONE: {
481 char *j;
482
483 j = strappend("vz-", optarg);
484 if (!j)
485 return log_oom();
486
487 if (!ifname_valid(j)) {
488 log_error("Network zone name not valid: %s", j);
489 free(j);
490 return -EINVAL;
491 }
492
493 free(arg_network_zone);
494 arg_network_zone = j;
495
496 arg_network_veth = true;
497 arg_private_network = true;
498 arg_settings_mask |= SETTING_NETWORK;
499 break;
500 }
501
ab046dde 502 case ARG_NETWORK_BRIDGE:
ef76dff2
LP
503
504 if (!ifname_valid(optarg)) {
505 log_error("Bridge interface name not valid: %s", optarg);
506 return -EINVAL;
507 }
508
f757855e
LP
509 r = free_and_strdup(&arg_network_bridge, optarg);
510 if (r < 0)
511 return log_oom();
ab046dde
TG
512
513 /* fall through */
514
0dfaa006 515 case 'n':
69c79d3c
LP
516 arg_network_veth = true;
517 arg_private_network = true;
f757855e 518 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
519 break;
520
f6d6bad1
LP
521 case ARG_NETWORK_VETH_EXTRA:
522 r = veth_extra_parse(&arg_network_veth_extra, optarg);
523 if (r < 0)
524 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
525
526 arg_private_network = true;
527 arg_settings_mask |= SETTING_NETWORK;
528 break;
529
aa28aefe 530 case ARG_NETWORK_INTERFACE:
ef76dff2
LP
531
532 if (!ifname_valid(optarg)) {
533 log_error("Network interface name not valid: %s", optarg);
534 return -EINVAL;
535 }
536
c74e630d
LP
537 if (strv_extend(&arg_network_interfaces, optarg) < 0)
538 return log_oom();
539
540 arg_private_network = true;
f757855e 541 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
542 break;
543
544 case ARG_NETWORK_MACVLAN:
ef76dff2
LP
545
546 if (!ifname_valid(optarg)) {
547 log_error("MACVLAN network interface name not valid: %s", optarg);
548 return -EINVAL;
549 }
550
c74e630d 551 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
552 return log_oom();
553
4bbfe7ad 554 arg_private_network = true;
f757855e 555 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
556 break;
557
558 case ARG_NETWORK_IPVLAN:
ef76dff2
LP
559
560 if (!ifname_valid(optarg)) {
561 log_error("IPVLAN network interface name not valid: %s", optarg);
562 return -EINVAL;
563 }
564
4bbfe7ad
TG
565 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
566 return log_oom();
567
aa28aefe
LP
568 /* fall through */
569
ff01d048
LP
570 case ARG_PRIVATE_NETWORK:
571 arg_private_network = true;
f757855e 572 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
573 break;
574
0f0dbc46 575 case 'b':
7732f92b
LP
576 if (arg_start_mode == START_PID2) {
577 log_error("--boot and --as-pid2 may not be combined.");
578 return -EINVAL;
579 }
580
581 arg_start_mode = START_BOOT;
582 arg_settings_mask |= SETTING_START_MODE;
583 break;
584
585 case 'a':
586 if (arg_start_mode == START_BOOT) {
587 log_error("--boot and --as-pid2 may not be combined.");
588 return -EINVAL;
589 }
590
591 arg_start_mode = START_PID2;
592 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
593 break;
594
144f0fc0 595 case ARG_UUID:
9444b1f2 596 r = sd_id128_from_string(optarg, &arg_uuid);
317feb4d
LP
597 if (r < 0)
598 return log_error_errno(r, "Invalid UUID: %s", optarg);
599
600 if (sd_id128_is_null(arg_uuid)) {
601 log_error("Machine UUID may not be all zeroes.");
602 return -EINVAL;
aa96c6cb 603 }
f757855e
LP
604
605 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 606 break;
aa96c6cb 607
9444b1f2 608 case 'S':
c74e630d 609 arg_slice = optarg;
144f0fc0
LP
610 break;
611
7027ff61 612 case 'M':
c1521918 613 if (isempty(optarg))
97b11eed 614 arg_machine = mfree(arg_machine);
c1521918 615 else {
0c3c4284 616 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
617 log_error("Invalid machine name: %s", optarg);
618 return -EINVAL;
619 }
7027ff61 620
0c3c4284
LP
621 r = free_and_strdup(&arg_machine, optarg);
622 if (r < 0)
eb91eb18
LP
623 return log_oom();
624
625 break;
626 }
7027ff61 627
82adf6af
LP
628 case 'Z':
629 arg_selinux_context = optarg;
a8828ed9
DW
630 break;
631
82adf6af
LP
632 case 'L':
633 arg_selinux_apifs_context = optarg;
a8828ed9
DW
634 break;
635
bc2f673e
LP
636 case ARG_READ_ONLY:
637 arg_read_only = true;
f757855e 638 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
639 break;
640
420c7379
LP
641 case ARG_CAPABILITY:
642 case ARG_DROP_CAPABILITY: {
6cbe4ed1 643 p = optarg;
9ed794a3 644 for (;;) {
6cbe4ed1 645 _cleanup_free_ char *t = NULL;
5076f0cc 646
6cbe4ed1
SS
647 r = extract_first_word(&p, &t, ",", 0);
648 if (r < 0)
649 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 650
6cbe4ed1
SS
651 if (r == 0)
652 break;
5076f0cc 653
39ed67d1
LP
654 if (streq(t, "all")) {
655 if (c == ARG_CAPABILITY)
a42c8b54 656 plus = (uint64_t) -1;
39ed67d1 657 else
a42c8b54 658 minus = (uint64_t) -1;
39ed67d1 659 } else {
2822da4f
LP
660 int cap;
661
662 cap = capability_from_name(t);
663 if (cap < 0) {
39ed67d1
LP
664 log_error("Failed to parse capability %s.", t);
665 return -EINVAL;
666 }
667
668 if (c == ARG_CAPABILITY)
a42c8b54 669 plus |= 1ULL << (uint64_t) cap;
39ed67d1 670 else
a42c8b54 671 minus |= 1ULL << (uint64_t) cap;
5076f0cc 672 }
5076f0cc
LP
673 }
674
f757855e 675 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
676 break;
677 }
678
57fb9fb5
LP
679 case 'j':
680 arg_link_journal = LINK_GUEST;
574edc90 681 arg_link_journal_try = true;
57fb9fb5
LP
682 break;
683
684 case ARG_LINK_JOURNAL:
53e438e3 685 if (streq(optarg, "auto")) {
57fb9fb5 686 arg_link_journal = LINK_AUTO;
53e438e3
LP
687 arg_link_journal_try = false;
688 } else if (streq(optarg, "no")) {
57fb9fb5 689 arg_link_journal = LINK_NO;
53e438e3
LP
690 arg_link_journal_try = false;
691 } else if (streq(optarg, "guest")) {
57fb9fb5 692 arg_link_journal = LINK_GUEST;
53e438e3
LP
693 arg_link_journal_try = false;
694 } else if (streq(optarg, "host")) {
57fb9fb5 695 arg_link_journal = LINK_HOST;
53e438e3
LP
696 arg_link_journal_try = false;
697 } else if (streq(optarg, "try-guest")) {
574edc90
MP
698 arg_link_journal = LINK_GUEST;
699 arg_link_journal_try = true;
700 } else if (streq(optarg, "try-host")) {
701 arg_link_journal = LINK_HOST;
702 arg_link_journal_try = true;
703 } else {
57fb9fb5
LP
704 log_error("Failed to parse link journal mode %s", optarg);
705 return -EINVAL;
706 }
707
708 break;
709
17fe0523 710 case ARG_BIND:
f757855e
LP
711 case ARG_BIND_RO:
712 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
713 if (r < 0)
714 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 715
f757855e 716 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 717 break;
06c17c39 718
f757855e
LP
719 case ARG_TMPFS:
720 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
721 if (r < 0)
722 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 723
f757855e 724 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 725 break;
5a8af538
LP
726
727 case ARG_OVERLAY:
728 case ARG_OVERLAY_RO: {
729 _cleanup_free_ char *upper = NULL, *destination = NULL;
730 _cleanup_strv_free_ char **lower = NULL;
731 CustomMount *m;
732 unsigned n = 0;
733 char **i;
734
62f9f39a
RM
735 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
736 if (r == -ENOMEM)
06c17c39 737 return log_oom();
62f9f39a
RM
738 else if (r < 0) {
739 log_error("Invalid overlay specification: %s", optarg);
740 return r;
741 }
06c17c39 742
5a8af538
LP
743 STRV_FOREACH(i, lower) {
744 if (!path_is_absolute(*i)) {
745 log_error("Overlay path %s is not absolute.", *i);
746 return -EINVAL;
747 }
748
749 n++;
750 }
751
752 if (n < 2) {
753 log_error("--overlay= needs at least two colon-separated directories specified.");
754 return -EINVAL;
755 }
756
757 if (n == 2) {
758 /* If two parameters are specified,
759 * the first one is the lower, the
760 * second one the upper directory. And
af86c440
ZJS
761 * we'll also define the destination
762 * mount point the same as the upper. */
5a8af538
LP
763 upper = lower[1];
764 lower[1] = NULL;
765
766 destination = strdup(upper);
767 if (!destination)
768 return log_oom();
769
770 } else {
771 upper = lower[n - 2];
772 destination = lower[n - 1];
773 lower[n - 2] = NULL;
774 }
775
f757855e 776 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
777 if (!m)
778 return log_oom();
779
780 m->destination = destination;
781 m->source = upper;
782 m->lower = lower;
783 m->read_only = c == ARG_OVERLAY_RO;
784
785 upper = destination = NULL;
786 lower = NULL;
06c17c39 787
f757855e 788 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
789 break;
790 }
791
a5f1cb3b 792 case 'E': {
f4889f65
LP
793 char **n;
794
795 if (!env_assignment_is_valid(optarg)) {
796 log_error("Environment variable assignment '%s' is not valid.", optarg);
797 return -EINVAL;
798 }
799
800 n = strv_env_set(arg_setenv, optarg);
801 if (!n)
802 return log_oom();
803
804 strv_free(arg_setenv);
805 arg_setenv = n;
f757855e
LP
806
807 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
808 break;
809 }
810
284c0b91
LP
811 case 'q':
812 arg_quiet = true;
813 break;
814
8a96d94e
LP
815 case ARG_SHARE_SYSTEM:
816 arg_share_system = true;
817 break;
818
eb91eb18
LP
819 case ARG_REGISTER:
820 r = parse_boolean(optarg);
821 if (r < 0) {
822 log_error("Failed to parse --register= argument: %s", optarg);
823 return r;
824 }
825
826 arg_register = r;
827 break;
828
89f7c846
LP
829 case ARG_KEEP_UNIT:
830 arg_keep_unit = true;
831 break;
832
6afc95b7
LP
833 case ARG_PERSONALITY:
834
ac45f971 835 arg_personality = personality_from_string(optarg);
050f7277 836 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
837 log_error("Unknown or unsupported personality '%s'.", optarg);
838 return -EINVAL;
839 }
840
f757855e 841 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
842 break;
843
4d9f07b4
LP
844 case ARG_VOLATILE:
845
846 if (!optarg)
f757855e 847 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 848 else {
f757855e 849 VolatileMode m;
4d9f07b4 850
f757855e
LP
851 m = volatile_mode_from_string(optarg);
852 if (m < 0) {
853 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 854 return -EINVAL;
f757855e
LP
855 } else
856 arg_volatile_mode = m;
6d0b55c2
LP
857 }
858
f757855e
LP
859 arg_settings_mask |= SETTING_VOLATILE_MODE;
860 break;
6d0b55c2 861
f757855e
LP
862 case 'p':
863 r = expose_port_parse(&arg_expose_ports, optarg);
864 if (r == -EEXIST)
865 return log_error_errno(r, "Duplicate port specification: %s", optarg);
866 if (r < 0)
867 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 868
f757855e 869 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 870 break;
6d0b55c2 871
f36933fe
LP
872 case ARG_PROPERTY:
873 if (strv_extend(&arg_property, optarg) < 0)
874 return log_oom();
875
876 break;
877
6dac160c 878 case ARG_PRIVATE_USERS:
0de7acce
LP
879
880 r = optarg ? parse_boolean(optarg) : 1;
881 if (r == 0) {
882 /* no: User namespacing off */
883 arg_userns_mode = USER_NAMESPACE_NO;
884 arg_uid_shift = UID_INVALID;
885 arg_uid_range = UINT32_C(0x10000);
886 } else if (r > 0) {
887 /* yes: User namespacing on, UID range is read from root dir */
888 arg_userns_mode = USER_NAMESPACE_FIXED;
889 arg_uid_shift = UID_INVALID;
890 arg_uid_range = UINT32_C(0x10000);
891 } else if (streq(optarg, "pick")) {
892 /* pick: User namespacing on, UID range is picked randomly */
893 arg_userns_mode = USER_NAMESPACE_PICK;
894 arg_uid_shift = UID_INVALID;
895 arg_uid_range = UINT32_C(0x10000);
896 } else {
6dac160c
LP
897 _cleanup_free_ char *buffer = NULL;
898 const char *range, *shift;
899
0de7acce
LP
900 /* anything else: User namespacing on, UID range is explicitly configured */
901
6dac160c
LP
902 range = strchr(optarg, ':');
903 if (range) {
904 buffer = strndup(optarg, range - optarg);
905 if (!buffer)
906 return log_oom();
907 shift = buffer;
908
909 range++;
910 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
911 log_error("Failed to parse UID range: %s", range);
912 return -EINVAL;
913 }
914 } else
915 shift = optarg;
916
917 if (parse_uid(shift, &arg_uid_shift) < 0) {
918 log_error("Failed to parse UID: %s", optarg);
919 return -EINVAL;
920 }
0de7acce
LP
921
922 arg_userns_mode = USER_NAMESPACE_FIXED;
6dac160c
LP
923 }
924
0de7acce 925 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
926 break;
927
0de7acce 928 case 'U':
ccabee0d
LP
929 if (userns_supported()) {
930 arg_userns_mode = USER_NAMESPACE_PICK;
931 arg_uid_shift = UID_INVALID;
932 arg_uid_range = UINT32_C(0x10000);
933
934 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
935 }
936
7336138e
LP
937 break;
938
0de7acce 939 case ARG_PRIVATE_USERS_CHOWN:
19aac838 940 arg_userns_chown = true;
0de7acce
LP
941
942 arg_settings_mask |= SETTING_USERNS;
6dac160c
LP
943 break;
944
c6c8f6e2
LP
945 case ARG_KILL_SIGNAL:
946 arg_kill_signal = signal_from_string_try_harder(optarg);
947 if (arg_kill_signal < 0) {
948 log_error("Cannot parse signal: %s", optarg);
949 return -EINVAL;
950 }
951
f757855e
LP
952 arg_settings_mask |= SETTING_KILL_SIGNAL;
953 break;
954
955 case ARG_SETTINGS:
956
957 /* no → do not read files
958 * yes → read files, do not override cmdline, trust only subset
959 * override → read files, override cmdline, trust only subset
960 * trusted → read files, do not override cmdline, trust all
961 */
962
963 r = parse_boolean(optarg);
964 if (r < 0) {
965 if (streq(optarg, "trusted")) {
966 mask_all_settings = false;
967 mask_no_settings = false;
968 arg_settings_trusted = true;
969
970 } else if (streq(optarg, "override")) {
971 mask_all_settings = false;
972 mask_no_settings = true;
973 arg_settings_trusted = -1;
974 } else
975 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
976 } else if (r > 0) {
977 /* yes */
978 mask_all_settings = false;
979 mask_no_settings = false;
980 arg_settings_trusted = -1;
981 } else {
982 /* no */
983 mask_all_settings = true;
984 mask_no_settings = false;
985 arg_settings_trusted = false;
986 }
987
c6c8f6e2
LP
988 break;
989
5f932eb9
LP
990 case ARG_CHDIR:
991 if (!path_is_absolute(optarg)) {
992 log_error("Working directory %s is not an absolute path.", optarg);
993 return -EINVAL;
994 }
995
996 r = free_and_strdup(&arg_chdir, optarg);
997 if (r < 0)
998 return log_oom();
999
1000 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1001 break;
1002
9c1e04d0
AP
1003 case ARG_NOTIFY_READY:
1004 r = parse_boolean(optarg);
1005 if (r < 0) {
1006 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1007 return -EINVAL;
1008 }
1009 arg_notify_ready = r;
1010 arg_settings_mask |= SETTING_NOTIFY_READY;
1011 break;
1012
88213476
LP
1013 case '?':
1014 return -EINVAL;
1015
1016 default:
eb9da376 1017 assert_not_reached("Unhandled option");
88213476 1018 }
88213476 1019
eb91eb18
LP
1020 if (arg_share_system)
1021 arg_register = false;
1022
0de7acce 1023 if (arg_userns_mode == USER_NAMESPACE_PICK)
0e7ac751
LP
1024 arg_userns_chown = true;
1025
7732f92b 1026 if (arg_start_mode != START_PID1 && arg_share_system) {
eb91eb18
LP
1027 log_error("--boot and --share-system may not be combined.");
1028 return -EINVAL;
1029 }
1030
89f7c846
LP
1031 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1032 log_error("--keep-unit may not be used when invoked from a user session.");
1033 return -EINVAL;
1034 }
1035
1b9e5b12
LP
1036 if (arg_directory && arg_image) {
1037 log_error("--directory= and --image= may not be combined.");
1038 return -EINVAL;
1039 }
1040
ec16945e
LP
1041 if (arg_template && arg_image) {
1042 log_error("--template= and --image= may not be combined.");
1043 return -EINVAL;
1044 }
1045
1046 if (arg_template && !(arg_directory || arg_machine)) {
1047 log_error("--template= needs --directory= or --machine=.");
1048 return -EINVAL;
1049 }
1050
1051 if (arg_ephemeral && arg_template) {
1052 log_error("--ephemeral and --template= may not be combined.");
1053 return -EINVAL;
1054 }
1055
1056 if (arg_ephemeral && arg_image) {
1057 log_error("--ephemeral and --image= may not be combined.");
1058 return -EINVAL;
1059 }
1060
df9a75e4
LP
1061 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1062 log_error("--ephemeral and --link-journal= may not be combined.");
1063 return -EINVAL;
1064 }
1065
ccabee0d 1066 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
7336138e
LP
1067 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1068 return -EOPNOTSUPP;
1069 }
1070
1071 if (arg_userns_chown && arg_read_only) {
1072 log_error("--read-only and --private-users-chown may not be combined.");
1073 return -EINVAL;
1074 }
f757855e 1075
22b28dfd
LP
1076 if (arg_network_bridge && arg_network_zone) {
1077 log_error("--network-bridge= and --network-zone= may not be combined.");
1078 return -EINVAL;
1079 }
1080
f757855e
LP
1081 if (argc > optind) {
1082 arg_parameters = strv_copy(argv + optind);
1083 if (!arg_parameters)
1084 return log_oom();
1085
7732f92b 1086 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
1087 }
1088
1089 /* Load all settings from .nspawn files */
1090 if (mask_no_settings)
1091 arg_settings_mask = 0;
1092
1093 /* Don't load any settings from .nspawn files */
1094 if (mask_all_settings)
1095 arg_settings_mask = _SETTINGS_MASK_ALL;
1096
520e0d54 1097 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
f757855e
LP
1098
1099 r = detect_unified_cgroup_hierarchy();
1100 if (r < 0)
1101 return r;
1102
6aadfa4c
ILG
1103 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1104 if (e)
1105 arg_container_service_name = e;
1106
f757855e
LP
1107 return 1;
1108}
1109
1110static int verify_arguments(void) {
1111
1112 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1113 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1114 return -EINVAL;
1115 }
1116
6d0b55c2
LP
1117 if (arg_expose_ports && !arg_private_network) {
1118 log_error("Cannot use --port= without private networking.");
1119 return -EINVAL;
1120 }
1121
1c1ea217
EV
1122#ifndef HAVE_LIBIPTC
1123 if (arg_expose_ports) {
1124 log_error("--port= is not supported, compiled without libiptc support.");
1125 return -EOPNOTSUPP;
1126 }
1127#endif
1128
7732f92b 1129 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
1130 arg_kill_signal = SIGRTMIN+3;
1131
f757855e 1132 return 0;
88213476
LP
1133}
1134
03cfe0d5
LP
1135static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1136 assert(p);
1137
0de7acce 1138 if (arg_userns_mode == USER_NAMESPACE_NO)
03cfe0d5
LP
1139 return 0;
1140
1141 if (uid == UID_INVALID && gid == GID_INVALID)
1142 return 0;
1143
1144 if (uid != UID_INVALID) {
1145 uid += arg_uid_shift;
1146
1147 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1148 return -EOVERFLOW;
1149 }
1150
1151 if (gid != GID_INVALID) {
1152 gid += (gid_t) arg_uid_shift;
1153
1154 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1155 return -EOVERFLOW;
1156 }
1157
1158 if (lchown(p, uid, gid) < 0)
1159 return -errno;
b12afc8c
LP
1160
1161 return 0;
1162}
1163
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it
 * was newly created, hands it to userns_lchown() with uid/gid.
 *
 * An already existing directory is left untouched and reported as success
 * (0). Any other mkdir() failure is returned as -errno. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1176
e58a1277 1177static int setup_timezone(const char *dest) {
03cfe0d5
LP
1178 _cleanup_free_ char *p = NULL, *q = NULL;
1179 const char *where, *check, *what;
d4036145
LP
1180 char *z, *y;
1181 int r;
f8440af5 1182
e58a1277
LP
1183 assert(dest);
1184
1185 /* Fix the timezone, if possible */
d4036145
LP
1186 r = readlink_malloc("/etc/localtime", &p);
1187 if (r < 0) {
1188 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1189 return 0;
1190 }
1191
1192 z = path_startswith(p, "../usr/share/zoneinfo/");
1193 if (!z)
1194 z = path_startswith(p, "/usr/share/zoneinfo/");
1195 if (!z) {
1196 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1197 return 0;
1198 }
1199
03cfe0d5 1200 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1201 r = readlink_malloc(where, &q);
1202 if (r >= 0) {
1203 y = path_startswith(q, "../usr/share/zoneinfo/");
1204 if (!y)
1205 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1206
d4036145
LP
1207 /* Already pointing to the right place? Then do nothing .. */
1208 if (y && streq(y, z))
1209 return 0;
1210 }
1211
03cfe0d5 1212 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1213 check = prefix_roota(dest, check);
03cfe0d5 1214 if (laccess(check, F_OK) < 0) {
d4036145
LP
1215 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1216 return 0;
1217 }
68fb0892 1218
79d80fc1
TG
1219 r = unlink(where);
1220 if (r < 0 && errno != ENOENT) {
56f64d95 1221 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1222 return 0;
1223 }
4d9f07b4 1224
03cfe0d5 1225 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1226 if (symlink(what, where) < 0) {
56f64d95 1227 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1228 return 0;
1229 }
e58a1277 1230
03cfe0d5
LP
1231 r = userns_lchown(where, 0, 0);
1232 if (r < 0)
1233 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1234
e58a1277 1235 return 0;
88213476
LP
1236}
1237
2547bb41 1238static int setup_resolv_conf(const char *dest) {
03cfe0d5 1239 const char *where = NULL;
79d80fc1 1240 int r;
2547bb41
LP
1241
1242 assert(dest);
1243
1244 if (arg_private_network)
1245 return 0;
1246
1247 /* Fix resolv.conf, if possible */
03cfe0d5 1248 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1249
f2068bcc 1250 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1251 if (r < 0) {
68a313c5
LP
1252 /* If the file already exists as symlink, let's
1253 * suppress the warning, under the assumption that
1254 * resolved or something similar runs inside and the
1255 * symlink points there.
1256 *
1257 * If the disk image is read-only, there's also no
1258 * point in complaining.
1259 */
1260 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1261 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1262 return 0;
1263 }
2547bb41 1264
03cfe0d5
LP
1265 r = userns_lchown(where, 0, 0);
1266 if (r < 0)
1267 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1268
2547bb41
LP
1269 return 0;
1270}
1271
04bc4a3f 1272static int setup_boot_id(const char *dest) {
3bbaff3e 1273 sd_id128_t rnd = SD_ID128_NULL;
03cfe0d5 1274 const char *from, *to;
04bc4a3f
LP
1275 int r;
1276
eb91eb18
LP
1277 if (arg_share_system)
1278 return 0;
1279
04bc4a3f
LP
1280 /* Generate a new randomized boot ID, so that each boot-up of
1281 * the container gets a new one */
1282
03cfe0d5
LP
1283 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1284 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1285
1286 r = sd_id128_randomize(&rnd);
f647962d
MS
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1289
15b1248a 1290 r = id128_write(from, ID128_UUID, rnd, false);
f647962d
MS
1291 if (r < 0)
1292 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1293
03cfe0d5
LP
1294 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1295 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1296 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
3bbaff3e 1297 log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m");
04bc4a3f 1298
3bbaff3e 1299 (void) unlink(from);
04bc4a3f
LP
1300 return r;
1301}
1302
e58a1277 1303static int copy_devnodes(const char *dest) {
88213476
LP
1304
1305 static const char devnodes[] =
1306 "null\0"
1307 "zero\0"
1308 "full\0"
1309 "random\0"
1310 "urandom\0"
85614d66
TG
1311 "tty\0"
1312 "net/tun\0";
88213476
LP
1313
1314 const char *d;
e58a1277 1315 int r = 0;
7fd1b19b 1316 _cleanup_umask_ mode_t u;
a258bf26
LP
1317
1318 assert(dest);
124640f1
LP
1319
1320 u = umask(0000);
88213476 1321
03cfe0d5
LP
1322 /* Create /dev/net, so that we can create /dev/net/tun in it */
1323 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1324 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1325
88213476 1326 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1327 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1328 struct stat st;
88213476 1329
7f112f50 1330 from = strappend("/dev/", d);
03cfe0d5 1331 to = prefix_root(dest, from);
88213476
LP
1332
1333 if (stat(from, &st) < 0) {
1334
4a62c710
MS
1335 if (errno != ENOENT)
1336 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1337
a258bf26 1338 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1339
03cfe0d5 1340 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1341 return -EIO;
a258bf26 1342
85614d66 1343 } else {
81f5049b
AC
1344 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1345 if (errno != EPERM)
1346 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1347
1348 /* Some systems abusively restrict mknod but
1349 * allow bind mounts. */
1350 r = touch(to);
1351 if (r < 0)
1352 return log_error_errno(r, "touch (%s) failed: %m", to);
1353 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1354 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1355 }
6278cf60 1356
03cfe0d5
LP
1357 r = userns_lchown(to, 0, 0);
1358 if (r < 0)
1359 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1360 }
88213476
LP
1361 }
1362
e58a1277
LP
1363 return r;
1364}
88213476 1365
03cfe0d5
LP
1366static int setup_pts(const char *dest) {
1367 _cleanup_free_ char *options = NULL;
1368 const char *p;
709f6e46 1369 int r;
03cfe0d5
LP
1370
1371#ifdef HAVE_SELINUX
1372 if (arg_selinux_apifs_context)
1373 (void) asprintf(&options,
3dce8915 1374 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1375 arg_uid_shift + TTY_GID,
1376 arg_selinux_apifs_context);
1377 else
1378#endif
1379 (void) asprintf(&options,
3dce8915 1380 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1381 arg_uid_shift + TTY_GID);
f2d88580 1382
03cfe0d5 1383 if (!options)
f2d88580
LP
1384 return log_oom();
1385
03cfe0d5 1386 /* Mount /dev/pts itself */
cc9fce65 1387 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1388 if (mkdir(p, 0755) < 0)
1389 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1390 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1391 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1392 r = userns_lchown(p, 0, 0);
1393 if (r < 0)
1394 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1395
1396 /* Create /dev/ptmx symlink */
1397 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1398 if (symlink("pts/ptmx", p) < 0)
1399 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1400 r = userns_lchown(p, 0, 0);
1401 if (r < 0)
1402 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1403
03cfe0d5
LP
1404 /* And fix /dev/pts/ptmx ownership */
1405 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1406 r = userns_lchown(p, 0, 0);
1407 if (r < 0)
1408 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1409
f2d88580
LP
1410 return 0;
1411}
1412
e58a1277 1413static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1414 _cleanup_umask_ mode_t u;
1415 const char *to;
e58a1277 1416 int r;
e58a1277
LP
1417
1418 assert(dest);
1419 assert(console);
1420
1421 u = umask(0000);
1422
03cfe0d5 1423 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1424 if (r < 0)
1425 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1426
a258bf26
LP
1427 /* We need to bind mount the right tty to /dev/console since
1428 * ptys can only exist on pts file systems. To have something
81f5049b 1429 * to bind mount things on we create a empty regular file. */
a258bf26 1430
03cfe0d5 1431 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1432 r = touch(to);
1433 if (r < 0)
1434 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1435
4543768d 1436 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1437 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1438
25ea79fe 1439 return 0;
e58a1277
LP
1440}
1441
1442static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1443 const char *from, *to;
7fd1b19b 1444 _cleanup_umask_ mode_t u;
d9603714 1445 int fd, r;
e58a1277 1446
e58a1277 1447 assert(kmsg_socket >= 0);
a258bf26 1448
e58a1277 1449 u = umask(0000);
a258bf26 1450
03cfe0d5 1451 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1452 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1453 * on the reading side behave very similar to /proc/kmsg,
1454 * their writing side behaves differently from /dev/kmsg in
1455 * that writing blocks when nothing is reading. In order to
1456 * avoid any problems with containers deadlocking due to this
1457 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1458 from = prefix_roota(dest, "/run/kmsg");
1459 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1460
4a62c710 1461 if (mkfifo(from, 0600) < 0)
03cfe0d5 1462 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1463 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1464 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1465
1466 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1467 if (fd < 0)
1468 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1469
e58a1277
LP
1470 /* Store away the fd in the socket, so that it stays open as
1471 * long as we run the child */
3ee897d6 1472 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1473 safe_close(fd);
e58a1277 1474
d9603714
DH
1475 if (r < 0)
1476 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1477
03cfe0d5
LP
1478 /* And now make the FIFO unavailable as /run/kmsg... */
1479 (void) unlink(from);
1480
25ea79fe 1481 return 0;
88213476
LP
1482}
1483
1c4baffc 1484static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1485 union in_addr_union *exposed = userdata;
1486
1487 assert(rtnl);
1488 assert(m);
1489 assert(exposed);
1490
7a8f6325 1491 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1492 return 0;
1493}
1494
3a74cea5 1495static int setup_hostname(void) {
3a74cea5 1496
eb91eb18
LP
1497 if (arg_share_system)
1498 return 0;
1499
605f81a8 1500 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1501 return -errno;
3a74cea5 1502
7027ff61 1503 return 0;
3a74cea5
LP
1504}
1505
57fb9fb5 1506static int setup_journal(const char *directory) {
e01ff70a 1507 sd_id128_t this_id;
0f5e1382 1508 _cleanup_free_ char *d = NULL;
e01ff70a 1509 const char *p, *q;
8054d749 1510 bool try;
e01ff70a 1511 char id[33];
57fb9fb5
LP
1512 int r;
1513
df9a75e4
LP
1514 /* Don't link journals in ephemeral mode */
1515 if (arg_ephemeral)
1516 return 0;
1517
8054d749
LP
1518 if (arg_link_journal == LINK_NO)
1519 return 0;
1520
1521 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1522
4d680aee 1523 r = sd_id128_get_machine(&this_id);
f647962d
MS
1524 if (r < 0)
1525 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee 1526
e01ff70a 1527 if (sd_id128_equal(arg_uuid, this_id)) {
8054d749 1528 log_full(try ? LOG_WARNING : LOG_ERR,
e192a281 1529 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
8054d749 1530 if (try)
4d680aee 1531 return 0;
df9a75e4 1532 return -EEXIST;
4d680aee
ZJS
1533 }
1534
03cfe0d5
LP
1535 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1536 if (r < 0)
1537 return log_error_errno(r, "Failed to create /var: %m");
1538
1539 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1540 if (r < 0)
1541 return log_error_errno(r, "Failed to create /var/log: %m");
1542
1543 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1544 if (r < 0)
1545 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1546
e01ff70a
MS
1547 (void) sd_id128_to_string(arg_uuid, id);
1548
03cfe0d5
LP
1549 p = strjoina("/var/log/journal/", id);
1550 q = prefix_roota(directory, p);
27407a01 1551
e26d6ce5 1552 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1553 if (try)
1554 return 0;
27407a01 1555
8054d749
LP
1556 log_error("%s: already a mount point, refusing to use for journal", p);
1557 return -EEXIST;
57fb9fb5
LP
1558 }
1559
e26d6ce5 1560 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1561 if (try)
1562 return 0;
57fb9fb5 1563
8054d749
LP
1564 log_error("%s: already a mount point, refusing to use for journal", q);
1565 return -EEXIST;
57fb9fb5
LP
1566 }
1567
1568 r = readlink_and_make_absolute(p, &d);
1569 if (r >= 0) {
1570 if ((arg_link_journal == LINK_GUEST ||
1571 arg_link_journal == LINK_AUTO) &&
1572 path_equal(d, q)) {
1573
03cfe0d5 1574 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1575 if (r < 0)
709f6e46 1576 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1577 return 0;
57fb9fb5
LP
1578 }
1579
4a62c710
MS
1580 if (unlink(p) < 0)
1581 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1582 } else if (r == -EINVAL) {
1583
1584 if (arg_link_journal == LINK_GUEST &&
1585 rmdir(p) < 0) {
1586
27407a01
ZJS
1587 if (errno == ENOTDIR) {
1588 log_error("%s already exists and is neither a symlink nor a directory", p);
1589 return r;
4314d33f
MS
1590 } else
1591 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1592 }
4314d33f
MS
1593 } else if (r != -ENOENT)
1594 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1595
1596 if (arg_link_journal == LINK_GUEST) {
1597
1598 if (symlink(q, p) < 0) {
8054d749 1599 if (try) {
56f64d95 1600 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1601 return 0;
4314d33f
MS
1602 } else
1603 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1604 }
1605
03cfe0d5 1606 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1607 if (r < 0)
709f6e46 1608 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1609 return 0;
57fb9fb5
LP
1610 }
1611
1612 if (arg_link_journal == LINK_HOST) {
ccddd104 1613 /* don't create parents here — if the host doesn't have
574edc90 1614 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1615
1616 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1617 if (try) {
56f64d95 1618 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1619 return 0;
4314d33f
MS
1620 } else
1621 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1622 }
1623
27407a01
ZJS
1624 } else if (access(p, F_OK) < 0)
1625 return 0;
57fb9fb5 1626
cdb2b9d0
LP
1627 if (dir_is_empty(q) == 0)
1628 log_warning("%s is not empty, proceeding anyway.", q);
1629
03cfe0d5 1630 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1631 if (r < 0)
1632 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1633
4543768d 1634 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1635 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1636
27407a01 1637 return 0;
57fb9fb5
LP
1638}
1639
88213476 1640static int drop_capabilities(void) {
520e0d54 1641 return capability_bounding_set_drop(arg_caps_retain, false);
88213476
LP
1642}
1643
db999e0f
LP
1644static int reset_audit_loginuid(void) {
1645 _cleanup_free_ char *p = NULL;
1646 int r;
1647
1648 if (arg_share_system)
1649 return 0;
1650
1651 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1652 if (r == -ENOENT)
db999e0f 1653 return 0;
f647962d
MS
1654 if (r < 0)
1655 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1656
1657 /* Already reset? */
1658 if (streq(p, "4294967295"))
1659 return 0;
1660
ad118bda 1661 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1662 if (r < 0) {
10a87006
LP
1663 log_error_errno(r,
1664 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1665 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1666 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1667 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1668 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1669
db999e0f 1670 sleep(5);
77b6e194 1671 }
db999e0f
LP
1672
1673 return 0;
77b6e194
LP
1674}
1675
24fb1112 1676
785890ac
LP
1677static int setup_propagate(const char *root) {
1678 const char *p, *q;
709f6e46 1679 int r;
785890ac
LP
1680
1681 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1682 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1683 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1684 (void) mkdir_p(p, 0600);
1685
709f6e46
MS
1686 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1687 if (r < 0)
1688 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1689
709f6e46
MS
1690 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1691 if (r < 0)
1692 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1693
709f6e46
MS
1694 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1695 if (r < 0)
1696 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1697
03cfe0d5 1698 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1699 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1700 return log_error_errno(errno, "Failed to install propagation bind mount.");
1701
1702 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1703 return log_error_errno(errno, "Failed to make propagation mount read-only");
1704
1705 return 0;
1706}
1707
1b9e5b12
LP
1708static int setup_image(char **device_path, int *loop_nr) {
1709 struct loop_info64 info = {
1710 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1711 };
1712 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1713 _cleanup_free_ char* loopdev = NULL;
1714 struct stat st;
1715 int r, nr;
1716
1717 assert(device_path);
1718 assert(loop_nr);
ec16945e 1719 assert(arg_image);
1b9e5b12
LP
1720
1721 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1722 if (fd < 0)
1723 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1724
4a62c710
MS
1725 if (fstat(fd, &st) < 0)
1726 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1727
1728 if (S_ISBLK(st.st_mode)) {
1729 char *p;
1730
1731 p = strdup(arg_image);
1732 if (!p)
1733 return log_oom();
1734
1735 *device_path = p;
1736
1737 *loop_nr = -1;
1738
1739 r = fd;
1740 fd = -1;
1741
1742 return r;
1743 }
1744
1745 if (!S_ISREG(st.st_mode)) {
070edd97 1746 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1747 return -EINVAL;
1748 }
1749
1750 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1751 if (control < 0)
1752 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1753
1754 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1755 if (nr < 0)
1756 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1757
1758 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1759 return log_oom();
1760
1761 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1762 if (loop < 0)
1763 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1764
4a62c710
MS
1765 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1766 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1767
1768 if (arg_read_only)
1769 info.lo_flags |= LO_FLAGS_READ_ONLY;
1770
4a62c710
MS
1771 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1772 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1773
1774 *device_path = loopdev;
1775 loopdev = NULL;
1776
1777 *loop_nr = nr;
1778
1779 r = loop;
1780 loop = -1;
1781
1782 return r;
1783}
1784
ada4799a
LP
/* Trailing text appended to log messages about disk images whose partition
 * table cannot be used: describes the layouts that are accepted. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"              \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1791
1b9e5b12
LP
1792static int dissect_image(
1793 int fd,
727fd4fd
LP
1794 char **root_device, bool *root_device_rw,
1795 char **home_device, bool *home_device_rw,
1796 char **srv_device, bool *srv_device_rw,
a6bc7db9 1797 char **esp_device,
1b9e5b12
LP
1798 bool *secondary) {
1799
1800#ifdef HAVE_BLKID
a6bc7db9 1801 int home_nr = -1, srv_nr = -1, esp_nr = -1;
01dc33ce
ZJS
1802#ifdef GPT_ROOT_NATIVE
1803 int root_nr = -1;
1804#endif
1805#ifdef GPT_ROOT_SECONDARY
1806 int secondary_root_nr = -1;
1807#endif
a6bc7db9 1808 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
1b9e5b12
LP
1809 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1810 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1811 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1812 _cleanup_udev_unref_ struct udev *udev = NULL;
1813 struct udev_list_entry *first, *item;
f6c51a81 1814 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1815 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1816 const char *pttype = NULL;
1817 blkid_partlist pl;
1818 struct stat st;
c09ef2e4 1819 unsigned i;
1b9e5b12
LP
1820 int r;
1821
1822 assert(fd >= 0);
1823 assert(root_device);
1824 assert(home_device);
1825 assert(srv_device);
a6bc7db9 1826 assert(esp_device);
1b9e5b12 1827 assert(secondary);
ec16945e 1828 assert(arg_image);
1b9e5b12
LP
1829
1830 b = blkid_new_probe();
1831 if (!b)
1832 return log_oom();
1833
1834 errno = 0;
1835 r = blkid_probe_set_device(b, fd, 0, 0);
1836 if (r != 0) {
1837 if (errno == 0)
1838 return log_oom();
1839
e1427b13 1840 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1841 }
1842
1843 blkid_probe_enable_partitions(b, 1);
1844 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1845
1846 errno = 0;
1847 r = blkid_do_safeprobe(b);
1848 if (r == -2 || r == 1) {
ada4799a
LP
1849 log_error("Failed to identify any partition table on\n"
1850 " %s\n"
1851 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1852 return -EINVAL;
1853 } else if (r != 0) {
1854 if (errno == 0)
1855 errno = EIO;
e1427b13 1856 return log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1857 }
1858
48861960 1859 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1860
1861 is_gpt = streq_ptr(pttype, "gpt");
1862 is_mbr = streq_ptr(pttype, "dos");
1863
1864 if (!is_gpt && !is_mbr) {
1865 log_error("No GPT or MBR partition table discovered on\n"
1866 " %s\n"
1867 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1868 return -EINVAL;
1869 }
1870
1871 errno = 0;
1872 pl = blkid_probe_get_partitions(b);
1873 if (!pl) {
1874 if (errno == 0)
1875 return log_oom();
1876
1877 log_error("Failed to list partitions of %s", arg_image);
1878 return -errno;
1879 }
1880
1881 udev = udev_new();
1882 if (!udev)
1883 return log_oom();
1884
4a62c710
MS
1885 if (fstat(fd, &st) < 0)
1886 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1887
c09ef2e4
LP
1888 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1889 if (!d)
1b9e5b12
LP
1890 return log_oom();
1891
c09ef2e4
LP
1892 for (i = 0;; i++) {
1893 int n, m;
1b9e5b12 1894
c09ef2e4
LP
1895 if (i >= 10) {
1896 log_error("Kernel partitions never appeared.");
1897 return -ENXIO;
1898 }
1899
1900 e = udev_enumerate_new(udev);
1901 if (!e)
1902 return log_oom();
1903
1904 r = udev_enumerate_add_match_parent(e, d);
1905 if (r < 0)
1906 return log_oom();
1907
1908 r = udev_enumerate_scan_devices(e);
1909 if (r < 0)
1910 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1911
1912 /* Count the partitions enumerated by the kernel */
1913 n = 0;
1914 first = udev_enumerate_get_list_entry(e);
1915 udev_list_entry_foreach(item, first)
1916 n++;
1917
1918 /* Count the partitions enumerated by blkid */
1919 m = blkid_partlist_numof_partitions(pl);
1920 if (n == m + 1)
1921 break;
1922 if (n > m + 1) {
1923 log_error("blkid and kernel partition list do not match.");
1924 return -EIO;
1925 }
1926 if (n < m + 1) {
1927 unsigned j;
1928
1929 /* The kernel has probed fewer partitions than
1930 * blkid? Maybe the kernel prober is still
1931 * running or it got EBUSY because udev
1932 * already opened the device. Let's reprobe
1933 * the device, which is a synchronous call
1934 * that waits until probing is complete. */
1935
1936 for (j = 0; j < 20; j++) {
1937
1938 r = ioctl(fd, BLKRRPART, 0);
1939 if (r < 0)
1940 r = -errno;
1941 if (r >= 0 || r != -EBUSY)
1942 break;
1943
1944 /* If something else has the device
1945 * open, such as an udev rule, the
1946 * ioctl will return EBUSY. Since
1947 * there's no way to wait until it
1948 * isn't busy anymore, let's just wait
1949 * a bit, and try again.
1950 *
1951 * This is really something they
1952 * should fix in the kernel! */
1953
1954 usleep(50 * USEC_PER_MSEC);
1955 }
1956
1957 if (r < 0)
1958 return log_error_errno(r, "Failed to reread partition table: %m");
1959 }
1960
1961 e = udev_enumerate_unref(e);
1962 }
1b9e5b12
LP
1963
1964 first = udev_enumerate_get_list_entry(e);
1965 udev_list_entry_foreach(item, first) {
1966 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1967 const char *node;
727fd4fd 1968 unsigned long long flags;
1b9e5b12
LP
1969 blkid_partition pp;
1970 dev_t qn;
1971 int nr;
1972
1973 errno = 0;
1974 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1975 if (!q) {
1976 if (!errno)
1977 errno = ENOMEM;
1978
e1427b13 1979 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1980 }
1981
1982 qn = udev_device_get_devnum(q);
1983 if (major(qn) == 0)
1984 continue;
1985
1986 if (st.st_rdev == qn)
1987 continue;
1988
1989 node = udev_device_get_devnode(q);
1990 if (!node)
1991 continue;
1992
1993 pp = blkid_partlist_devno_to_partition(pl, qn);
1994 if (!pp)
1995 continue;
1996
727fd4fd 1997 flags = blkid_partition_get_flags(pp);
727fd4fd 1998
1b9e5b12
LP
1999 nr = blkid_partition_get_partno(pp);
2000 if (nr < 0)
2001 continue;
2002
ada4799a
LP
2003 if (is_gpt) {
2004 sd_id128_t type_id;
2005 const char *stype;
1b9e5b12 2006
f6c51a81
LP
2007 if (flags & GPT_FLAG_NO_AUTO)
2008 continue;
2009
ada4799a
LP
2010 stype = blkid_partition_get_type_string(pp);
2011 if (!stype)
2012 continue;
1b9e5b12 2013
ada4799a 2014 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
2015 continue;
2016
ada4799a 2017 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 2018
ada4799a
LP
2019 if (home && nr >= home_nr)
2020 continue;
1b9e5b12 2021
ada4799a
LP
2022 home_nr = nr;
2023 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 2024
ada4799a
LP
2025 r = free_and_strdup(&home, node);
2026 if (r < 0)
2027 return log_oom();
727fd4fd 2028
ada4799a
LP
2029 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2030
2031 if (srv && nr >= srv_nr)
2032 continue;
2033
2034 srv_nr = nr;
2035 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2036
2037 r = free_and_strdup(&srv, node);
2038 if (r < 0)
2039 return log_oom();
a6bc7db9
LP
2040 } else if (sd_id128_equal(type_id, GPT_ESP)) {
2041
2042 if (esp && nr >= esp_nr)
2043 continue;
2044
2045 esp_nr = nr;
2046
2047 r = free_and_strdup(&esp, node);
2048 if (r < 0)
2049 return log_oom();
ada4799a 2050 }
1b9e5b12 2051#ifdef GPT_ROOT_NATIVE
ada4799a 2052 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 2053
ada4799a
LP
2054 if (root && nr >= root_nr)
2055 continue;
1b9e5b12 2056
ada4799a
LP
2057 root_nr = nr;
2058 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 2059
ada4799a
LP
2060 r = free_and_strdup(&root, node);
2061 if (r < 0)
2062 return log_oom();
2063 }
1b9e5b12
LP
2064#endif
2065#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
2066 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2067
2068 if (secondary_root && nr >= secondary_root_nr)
2069 continue;
2070
2071 secondary_root_nr = nr;
2072 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2073
2074 r = free_and_strdup(&secondary_root, node);
2075 if (r < 0)
2076 return log_oom();
2077 }
2078#endif
f6c51a81
LP
2079 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2080
2081 if (generic)
2082 multiple_generic = true;
2083 else {
2084 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2085
2086 r = free_and_strdup(&generic, node);
2087 if (r < 0)
2088 return log_oom();
2089 }
2090 }
ada4799a
LP
2091
2092 } else if (is_mbr) {
2093 int type;
1b9e5b12 2094
f6c51a81
LP
2095 if (flags != 0x80) /* Bootable flag */
2096 continue;
2097
ada4799a
LP
2098 type = blkid_partition_get_type(pp);
2099 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2100 continue;
2101
f6c51a81
LP
2102 if (generic)
2103 multiple_generic = true;
2104 else {
2105 generic_rw = true;
727fd4fd 2106
f6c51a81
LP
2107 r = free_and_strdup(&root, node);
2108 if (r < 0)
2109 return log_oom();
2110 }
1b9e5b12 2111 }
1b9e5b12
LP
2112 }
2113
1b9e5b12
LP
2114 if (root) {
2115 *root_device = root;
2116 root = NULL;
727fd4fd
LP
2117
2118 *root_device_rw = root_rw;
1b9e5b12
LP
2119 *secondary = false;
2120 } else if (secondary_root) {
2121 *root_device = secondary_root;
2122 secondary_root = NULL;
727fd4fd
LP
2123
2124 *root_device_rw = secondary_root_rw;
1b9e5b12 2125 *secondary = true;
f6c51a81
LP
2126 } else if (generic) {
2127
2128 /* There were no partitions with precise meanings
2129 * around, but we found generic partitions. In this
2130 * case, if there's only one, we can go ahead and boot
2131 * it, otherwise we bail out, because we really cannot
2132 * make any sense of it. */
2133
2134 if (multiple_generic) {
2135 log_error("Identified multiple bootable Linux partitions on\n"
2136 " %s\n"
2137 PARTITION_TABLE_BLURB, arg_image);
2138 return -EINVAL;
2139 }
2140
2141 *root_device = generic;
2142 generic = NULL;
2143
2144 *root_device_rw = generic_rw;
2145 *secondary = false;
2146 } else {
2147 log_error("Failed to identify root partition in disk image\n"
2148 " %s\n"
2149 PARTITION_TABLE_BLURB, arg_image);
2150 return -EINVAL;
1b9e5b12
LP
2151 }
2152
2153 if (home) {
2154 *home_device = home;
2155 home = NULL;
727fd4fd
LP
2156
2157 *home_device_rw = home_rw;
1b9e5b12
LP
2158 }
2159
2160 if (srv) {
2161 *srv_device = srv;
2162 srv = NULL;
727fd4fd
LP
2163
2164 *srv_device_rw = srv_rw;
1b9e5b12
LP
2165 }
2166
a6bc7db9
LP
2167 if (esp) {
2168 *esp_device = esp;
2169 esp = NULL;
2170 }
2171
1b9e5b12
LP
2172 return 0;
2173#else
2174 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2175 return -EOPNOTSUPP;
1b9e5b12
LP
2176#endif
2177}
2178
/* Probes the file system type of the block device 'what' with libblkid and mounts it below 'where'.
 *
 *   what      – device node to mount
 *   where     – base directory of the mount point
 *   directory – optional sub-path appended to 'where' (may be NULL)
 *   rw        – mount writable; forced to false when arg_read_only is set
 *
 * Returns 0 on success and a negative errno-style value on failure. LUKS volumes are refused with
 * -EOPNOTSUPP, as is the whole function when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* libblkid does not always set errno; treat "no errno" as OOM. */
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the result is
                 * ambivalent. Both mean "we cannot tell what this is". A return of -1 is a real
                 * probing error and is handled below together with any other non-zero value. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2240
317feb4d 2241static int setup_machine_id(const char *directory) {
691675ba
LP
2242 const char *etc_machine_id;
2243 sd_id128_t id;
3bbaff3e 2244 int r;
e01ff70a 2245
317feb4d
LP
2246 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2247 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2248 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2249 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2250 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2251 * container behaves nicely). */
2252
e01ff70a
MS
2253 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2254
691675ba 2255 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
317feb4d
LP
2256 if (r < 0) {
2257 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2258 return log_error_errno(r, "Failed to read machine ID from container image: %m");
691675ba 2259
317feb4d
LP
2260 if (sd_id128_is_null(arg_uuid)) {
2261 r = sd_id128_randomize(&arg_uuid);
2262 if (r < 0)
2263 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2264 }
2265 } else {
2266 if (sd_id128_is_null(id)) {
2267 log_error("Machine ID in container image is zero, refusing.");
2268 return -EINVAL;
2269 }
e01ff70a 2270
317feb4d
LP
2271 arg_uuid = id;
2272 }
691675ba 2273
e01ff70a
MS
2274 return 0;
2275}
2276
7336138e
LP
2277static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2278 int r;
2279
2280 assert(directory);
2281
0de7acce 2282 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
7336138e
LP
2283 return 0;
2284
2285 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2286 if (r == -EOPNOTSUPP)
2287 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2288 if (r == -EBADE)
2289 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2290 if (r < 0)
2291 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2292 if (r == 0)
2293 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2294 else
2295 log_debug("Patched directory tree to match UID/GID range.");
2296
2297 return r;
2298}
2299
727fd4fd
LP
2300static int mount_devices(
2301 const char *where,
2302 const char *root_device, bool root_device_rw,
2303 const char *home_device, bool home_device_rw,
a6bc7db9
LP
2304 const char *srv_device, bool srv_device_rw,
2305 const char *esp_device) {
1b9e5b12
LP
2306 int r;
2307
2308 assert(where);
2309
2310 if (root_device) {
727fd4fd 2311 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2312 if (r < 0)
2313 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2314 }
2315
2316 if (home_device) {
727fd4fd 2317 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2318 if (r < 0)
2319 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2320 }
2321
2322 if (srv_device) {
727fd4fd 2323 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2324 if (r < 0)
2325 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2326 }
2327
a6bc7db9
LP
2328 if (esp_device) {
2329 const char *mp, *x;
2330
2331 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2332
2333 mp = "/efi";
2334 x = strjoina(arg_directory, mp);
2335 r = dir_is_empty(x);
2336 if (r == -ENOENT) {
2337 mp = "/boot";
2338 x = strjoina(arg_directory, mp);
2339 r = dir_is_empty(x);
2340 }
2341
2342 if (r > 0) {
2343 r = mount_device(esp_device, arg_directory, mp, true);
2344 if (r < 0)
2345 return log_error_errno(r, "Failed to mount ESP: %m");
2346 }
2347 }
2348
1b9e5b12
LP
2349 return 0;
2350}
2351
2352static void loop_remove(int nr, int *image_fd) {
2353 _cleanup_close_ int control = -1;
e8c8ddcc 2354 int r;
1b9e5b12
LP
2355
2356 if (nr < 0)
2357 return;
2358
2359 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2360 r = ioctl(*image_fd, LOOP_CLR_FD);
2361 if (r < 0)
5e4074aa 2362 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2363 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2364 }
2365
2366 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2367 if (control < 0) {
56f64d95 2368 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2369 return;
e8c8ddcc 2370 }
1b9e5b12 2371
e8c8ddcc
TG
2372 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2373 if (r < 0)
5e4074aa 2374 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2375}
2376
113cea80 2377/*
6d416b9c
LS
2378 * Return values:
2379 * < 0 : wait_for_terminate() failed to get the state of the
2380 * container, the container was terminated by a signal, or
2381 * failed for an unknown reason. No change is made to the
2382 * container argument.
2383 * > 0 : The program executed in the container terminated with an
2384 * error. The exit code of the program executed in the
919699ec
LP
2385 * container is returned. The container argument has been set
2386 * to CONTAINER_TERMINATED.
6d416b9c
LS
2387 * 0 : The container is being rebooted, has been shut down or exited
2388 * successfully. The container argument has been set to either
2389 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2390 *
6d416b9c
LS
2391 * That is, success is indicated by a return value of zero, and an
2392 * error is indicated by a non-zero value.
113cea80
DH
2393 */
2394static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2395 siginfo_t status;
919699ec 2396 int r;
113cea80
DH
2397
2398 r = wait_for_terminate(pid, &status);
f647962d
MS
2399 if (r < 0)
2400 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2401
2402 switch (status.si_code) {
fddbb89c 2403
113cea80 2404 case CLD_EXITED:
b5a2179b 2405 if (status.si_status == 0)
919699ec 2406 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
b5a2179b 2407 else
919699ec 2408 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2409
919699ec
LP
2410 *container = CONTAINER_TERMINATED;
2411 return status.si_status;
113cea80
DH
2412
2413 case CLD_KILLED:
2414 if (status.si_status == SIGINT) {
919699ec 2415 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2416 *container = CONTAINER_TERMINATED;
919699ec
LP
2417 return 0;
2418
113cea80 2419 } else if (status.si_status == SIGHUP) {
919699ec 2420 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2421 *container = CONTAINER_REBOOTED;
919699ec 2422 return 0;
113cea80 2423 }
919699ec 2424
113cea80
DH
2425 /* CLD_KILLED fallthrough */
2426
2427 case CLD_DUMPED:
fddbb89c 2428 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2429 return -EIO;
113cea80
DH
2430
2431 default:
fddbb89c 2432 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2433 return -EIO;
113cea80 2434 }
113cea80
DH
2435}
2436
023fb90b
LP
2437static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2438 pid_t pid;
2439
4a0b58c4 2440 pid = PTR_TO_PID(userdata);
023fb90b 2441 if (pid > 0) {
c6c8f6e2 2442 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2443 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2444 sd_event_source_set_userdata(s, NULL);
2445 return 0;
2446 }
2447 }
2448
2449 sd_event_exit(sd_event_source_get_event(s), 0);
2450 return 0;
2451}
2452
ec16945e 2453static int determine_names(void) {
1b9cebf6 2454 int r;
ec16945e 2455
c1521918
LP
2456 if (arg_template && !arg_directory && arg_machine) {
2457
2458 /* If --template= was specified then we should not
2459 * search for a machine, but instead create a new one
2460 * in /var/lib/machine. */
2461
2462 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2463 if (!arg_directory)
2464 return log_oom();
2465 }
2466
ec16945e 2467 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2468 if (arg_machine) {
2469 _cleanup_(image_unrefp) Image *i = NULL;
2470
2471 r = image_find(arg_machine, &i);
2472 if (r < 0)
2473 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2474 else if (r == 0) {
2475 log_error("No image for machine '%s': %m", arg_machine);
2476 return -ENOENT;
2477 }
2478
aceac2f0 2479 if (i->type == IMAGE_RAW)
0f03c2a4 2480 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2481 else
0f03c2a4 2482 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2483 if (r < 0)
2484 return log_error_errno(r, "Invalid image directory: %m");
2485
aee327b8
LP
2486 if (!arg_ephemeral)
2487 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2488 } else
ec16945e
LP
2489 arg_directory = get_current_dir_name();
2490
1b9cebf6
LP
2491 if (!arg_directory && !arg_machine) {
2492 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2493 return -EINVAL;
2494 }
2495 }
2496
2497 if (!arg_machine) {
b9ba4dab
LP
2498 if (arg_directory && path_equal(arg_directory, "/"))
2499 arg_machine = gethostname_malloc();
2500 else
2501 arg_machine = strdup(basename(arg_image ?: arg_directory));
2502
ec16945e
LP
2503 if (!arg_machine)
2504 return log_oom();
2505
ae691c1d 2506 hostname_cleanup(arg_machine);
ec16945e
LP
2507 if (!machine_name_is_valid(arg_machine)) {
2508 log_error("Failed to determine machine name automatically, please use -M.");
2509 return -EINVAL;
2510 }
b9ba4dab
LP
2511
2512 if (arg_ephemeral) {
2513 char *b;
2514
2515 /* Add a random suffix when this is an
2516 * ephemeral machine, so that we can run many
2517 * instances at once without manually having
2518 * to specify -M each time. */
2519
2520 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2521 return log_oom();
2522
2523 free(arg_machine);
2524 arg_machine = b;
2525 }
ec16945e
LP
2526 }
2527
2528 return 0;
2529}
2530
03cfe0d5 2531static int determine_uid_shift(const char *directory) {
6dac160c
LP
2532 int r;
2533
0de7acce 2534 if (arg_userns_mode == USER_NAMESPACE_NO) {
03cfe0d5 2535 arg_uid_shift = 0;
6dac160c 2536 return 0;
03cfe0d5 2537 }
6dac160c
LP
2538
2539 if (arg_uid_shift == UID_INVALID) {
2540 struct stat st;
2541
03cfe0d5 2542 r = stat(directory, &st);
6dac160c 2543 if (r < 0)
03cfe0d5 2544 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2545
2546 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2547
2548 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2549 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2550 return -EINVAL;
2551 }
2552
2553 arg_uid_range = UINT32_C(0x10000);
2554 }
2555
2556 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2557 log_error("UID base too high for UID range.");
2558 return -EINVAL;
2559 }
2560
6dac160c
LP
2561 return 0;
2562}
2563
03cfe0d5
LP
2564static int inner_child(
2565 Barrier *barrier,
2566 const char *directory,
2567 bool secondary,
2568 int kmsg_socket,
2569 int rtnl_socket,
f757855e 2570 FDSet *fds) {
69c79d3c 2571
03cfe0d5 2572 _cleanup_free_ char *home = NULL;
e01ff70a 2573 char as_uuid[37];
6aadfa4c 2574 unsigned n_env = 1;
03cfe0d5
LP
2575 const char *envp[] = {
2576 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2577 NULL, /* container */
03cfe0d5
LP
2578 NULL, /* TERM */
2579 NULL, /* HOME */
2580 NULL, /* USER */
2581 NULL, /* LOGNAME */
2582 NULL, /* container_uuid */
2583 NULL, /* LISTEN_FDS */
2584 NULL, /* LISTEN_PID */
9c1e04d0 2585 NULL, /* NOTIFY_SOCKET */
03cfe0d5
LP
2586 NULL
2587 };
88213476 2588
2371271c 2589 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2590 int r;
88213476 2591
03cfe0d5
LP
2592 assert(barrier);
2593 assert(directory);
2594 assert(kmsg_socket >= 0);
88213476 2595
efdb0237
LP
2596 cg_unified_flush();
2597
0de7acce 2598 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
2599 /* Tell the parent, that it now can write the UID map. */
2600 (void) barrier_place(barrier); /* #1 */
7027ff61 2601
03cfe0d5
LP
2602 /* Wait until the parent wrote the UID map */
2603 if (!barrier_place_and_sync(barrier)) { /* #2 */
2604 log_error("Parent died too early");
2605 return -ESRCH;
2606 }
88213476
LP
2607 }
2608
0de7acce
LP
2609 r = mount_all(NULL,
2610 arg_userns_mode != USER_NAMESPACE_NO,
2611 true,
2612 arg_private_network,
2613 arg_uid_shift,
2614 arg_uid_range,
2615 arg_selinux_apifs_context);
2616
03cfe0d5
LP
2617 if (r < 0)
2618 return r;
2619
d8fc6a00
LP
2620 r = mount_sysfs(NULL);
2621 if (r < 0)
2622 return r;
2623
03cfe0d5
LP
2624 /* Wait until we are cgroup-ified, so that we
2625 * can mount the right cgroup path writable */
2626 if (!barrier_place_and_sync(barrier)) { /* #3 */
2627 log_error("Parent died too early");
2628 return -ESRCH;
88213476
LP
2629 }
2630
0996ef00
CB
2631 if (cg_ns_supported()) {
2632 r = unshare(CLONE_NEWCGROUP);
2633 if (r < 0)
2634 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2635 r = mount_cgroups(
2636 "",
2637 arg_unified_cgroup_hierarchy,
2638 arg_userns_mode != USER_NAMESPACE_NO,
2639 arg_uid_shift,
2640 arg_uid_range,
2641 arg_selinux_apifs_context);
2642 if (r < 0)
2643 return r;
2644 } else {
2645 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2646 if (r < 0)
2647 return r;
2648 }
ec16945e 2649
03cfe0d5
LP
2650 r = reset_uid_gid();
2651 if (r < 0)
2652 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2653
03cfe0d5
LP
2654 r = setup_boot_id(NULL);
2655 if (r < 0)
2656 return r;
ec16945e 2657
03cfe0d5
LP
2658 r = setup_kmsg(NULL, kmsg_socket);
2659 if (r < 0)
2660 return r;
2661 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2662
03cfe0d5 2663 umask(0022);
30535c16 2664
03cfe0d5
LP
2665 if (setsid() < 0)
2666 return log_error_errno(errno, "setsid() failed: %m");
2667
2668 if (arg_private_network)
2669 loopback_setup();
2670
7a8f6325
LP
2671 if (arg_expose_ports) {
2672 r = expose_port_send_rtnl(rtnl_socket);
2673 if (r < 0)
2674 return r;
2675 rtnl_socket = safe_close(rtnl_socket);
2676 }
03cfe0d5 2677
709f6e46
MS
2678 r = drop_capabilities();
2679 if (r < 0)
2680 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2681
2682 setup_hostname();
2683
050f7277 2684 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2685 if (personality(arg_personality) < 0)
2686 return log_error_errno(errno, "personality() failed: %m");
2687 } else if (secondary) {
2688 if (personality(PER_LINUX32) < 0)
2689 return log_error_errno(errno, "personality() failed: %m");
2690 }
2691
2692#ifdef HAVE_SELINUX
2693 if (arg_selinux_context)
2ed96880 2694 if (setexeccon(arg_selinux_context) < 0)
03cfe0d5
LP
2695 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2696#endif
2697
ee645080 2698 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2699 if (r < 0)
2700 return r;
2701
6aadfa4c
ILG
2702 /* LXC sets container=lxc, so follow the scheme here */
2703 envp[n_env++] = strjoina("container=", arg_container_service_name);
2704
03cfe0d5
LP
2705 envp[n_env] = strv_find_prefix(environ, "TERM=");
2706 if (envp[n_env])
313cefa1 2707 n_env++;
03cfe0d5
LP
2708
2709 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2710 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2711 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2712 return log_oom();
2713
3bbaff3e 2714 assert(!sd_id128_is_null(arg_uuid));
03cfe0d5 2715
691675ba 2716 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
e01ff70a 2717 return log_oom();
03cfe0d5
LP
2718
2719 if (fdset_size(fds) > 0) {
2720 r = fdset_cloexec(fds, false);
2721 if (r < 0)
2722 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2723
2724 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2725 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2726 return log_oom();
2727 }
9c1e04d0
AP
2728 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2729 return log_oom();
03cfe0d5 2730
2371271c
TG
2731 env_use = strv_env_merge(2, envp, arg_setenv);
2732 if (!env_use)
2733 return log_oom();
03cfe0d5
LP
2734
2735 /* Let the parent know that we are ready and
2736 * wait until the parent is ready with the
2737 * setup, too... */
2738 if (!barrier_place_and_sync(barrier)) { /* #4 */
2739 log_error("Parent died too early");
2740 return -ESRCH;
2741 }
2742
5f932eb9
LP
2743 if (arg_chdir)
2744 if (chdir(arg_chdir) < 0)
2745 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2746
7732f92b
LP
2747 if (arg_start_mode == START_PID2) {
2748 r = stub_pid1();
2749 if (r < 0)
2750 return r;
2751 }
2752
03cfe0d5
LP
2753 /* Now, explicitly close the log, so that we
2754 * then can close all remaining fds. Closing
2755 * the log explicitly first has the benefit
2756 * that the logging subsystem knows about it,
2757 * and is thus ready to be reopened should we
2758 * need it again. Note that the other fds
2759 * closed here are at least the locking and
2760 * barrier fds. */
2761 log_close();
2762 (void) fdset_close_others(fds);
2763
7732f92b 2764 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2765 char **a;
2766 size_t m;
2767
2768 /* Automatically search for the init system */
2769
75f32f04
ZJS
2770 m = strv_length(arg_parameters);
2771 a = newa(char*, m + 2);
2772 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2773 a[1 + m] = NULL;
03cfe0d5
LP
2774
2775 a[0] = (char*) "/usr/lib/systemd/systemd";
2776 execve(a[0], a, env_use);
2777
2778 a[0] = (char*) "/lib/systemd/systemd";
2779 execve(a[0], a, env_use);
2780
2781 a[0] = (char*) "/sbin/init";
2782 execve(a[0], a, env_use);
f757855e
LP
2783 } else if (!strv_isempty(arg_parameters))
2784 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2785 else {
5f932eb9 2786 if (!arg_chdir)
d929b0f9
ZJS
2787 /* If we cannot change the directory, we'll end up in /, that is expected. */
2788 (void) chdir(home ?: "/root");
5f932eb9 2789
03cfe0d5
LP
2790 execle("/bin/bash", "-bash", NULL, env_use);
2791 execle("/bin/sh", "-sh", NULL, env_use);
2792 }
2793
35607a8d 2794 r = -errno;
03cfe0d5 2795 (void) log_open();
35607a8d 2796 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2797}
2798
9c1e04d0
AP
2799static int setup_sd_notify_child(void) {
2800 static const int one = 1;
2801 int fd = -1;
2802 union sockaddr_union sa = {
2803 .sa.sa_family = AF_UNIX,
2804 };
2805 int r;
2806
2807 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2808 if (fd < 0)
2809 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2810
2811 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2812 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2813
2814 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2815 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2816 if (r < 0) {
2817 safe_close(fd);
2818 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2819 }
2820
2821 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2822 if (r < 0) {
2823 safe_close(fd);
2824 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2825 }
2826
2827 return fd;
2828}
2829
03cfe0d5
LP
2830static int outer_child(
2831 Barrier *barrier,
2832 const char *directory,
2833 const char *console,
2834 const char *root_device, bool root_device_rw,
2835 const char *home_device, bool home_device_rw,
2836 const char *srv_device, bool srv_device_rw,
a6bc7db9 2837 const char *esp_device,
03cfe0d5
LP
2838 bool interactive,
2839 bool secondary,
2840 int pid_socket,
e01ff70a 2841 int uuid_socket,
9c1e04d0 2842 int notify_socket,
03cfe0d5
LP
2843 int kmsg_socket,
2844 int rtnl_socket,
825d5287 2845 int uid_shift_socket,
f757855e 2846 FDSet *fds) {
03cfe0d5
LP
2847
2848 pid_t pid;
2849 ssize_t l;
2850 int r;
9c1e04d0 2851 _cleanup_close_ int fd = -1;
03cfe0d5
LP
2852
2853 assert(barrier);
2854 assert(directory);
2855 assert(console);
2856 assert(pid_socket >= 0);
e01ff70a 2857 assert(uuid_socket >= 0);
9c1e04d0 2858 assert(notify_socket >= 0);
03cfe0d5
LP
2859 assert(kmsg_socket >= 0);
2860
efdb0237
LP
2861 cg_unified_flush();
2862
03cfe0d5
LP
2863 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2864 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2865
2866 if (interactive) {
2867 close_nointr(STDIN_FILENO);
2868 close_nointr(STDOUT_FILENO);
2869 close_nointr(STDERR_FILENO);
2870
2871 r = open_terminal(console, O_RDWR);
2872 if (r != STDIN_FILENO) {
2873 if (r >= 0) {
2874 safe_close(r);
2875 r = -EINVAL;
2876 }
2877
2878 return log_error_errno(r, "Failed to open console: %m");
2879 }
2880
2881 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2882 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2883 return log_error_errno(errno, "Failed to duplicate console: %m");
2884 }
2885
2886 r = reset_audit_loginuid();
2887 if (r < 0)
2888 return r;
2889
2890 /* Mark everything as slave, so that we still
2891 * receive mounts from the real root, but don't
2892 * propagate mounts to the real root. */
2893 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2894 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2895
2896 r = mount_devices(directory,
2897 root_device, root_device_rw,
2898 home_device, home_device_rw,
a6bc7db9
LP
2899 srv_device, srv_device_rw,
2900 esp_device);
03cfe0d5
LP
2901 if (r < 0)
2902 return r;
2903
391567f4
LP
2904 r = determine_uid_shift(directory);
2905 if (r < 0)
2906 return r;
2907
0de7acce 2908 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751 2909 /* Let the parent know which UID shift we read from the image */
825d5287
RM
2910 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2911 if (l < 0)
2912 return log_error_errno(errno, "Failed to send UID shift: %m");
2913 if (l != sizeof(arg_uid_shift)) {
2914 log_error("Short write while sending UID shift.");
2915 return -EIO;
2916 }
0e7ac751 2917
0de7acce 2918 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
2919 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2920 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2921 * not it will pick a different one, and send it back to us. */
2922
2923 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2924 if (l < 0)
2925 return log_error_errno(errno, "Failed to recv UID shift: %m");
2926 if (l != sizeof(arg_uid_shift)) {
595bfe7d 2927 log_error("Short read while receiving UID shift.");
0e7ac751
LP
2928 return -EIO;
2929 }
2930 }
2931
2932 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
825d5287
RM
2933 }
2934
03cfe0d5
LP
2935 /* Turn directory into bind mount */
2936 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2937 return log_error_errno(errno, "Failed to make bind mount: %m");
2938
7336138e 2939 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
03cfe0d5
LP
2940 if (r < 0)
2941 return r;
2942
0de7acce
LP
2943 r = setup_volatile(
2944 directory,
2945 arg_volatile_mode,
2946 arg_userns_mode != USER_NAMESPACE_NO,
2947 arg_uid_shift,
2948 arg_uid_range,
2949 arg_selinux_context);
03cfe0d5
LP
2950 if (r < 0)
2951 return r;
2952
0de7acce
LP
2953 r = setup_volatile_state(
2954 directory,
2955 arg_volatile_mode,
2956 arg_userns_mode != USER_NAMESPACE_NO,
2957 arg_uid_shift,
2958 arg_uid_range,
2959 arg_selinux_context);
03cfe0d5
LP
2960 if (r < 0)
2961 return r;
2962
03cfe0d5
LP
2963 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2964 if (r < 0)
2965 return r;
2966
03cfe0d5
LP
2967 if (arg_read_only) {
2968 r = bind_remount_recursive(directory, true);
2969 if (r < 0)
2970 return log_error_errno(r, "Failed to make tree read-only: %m");
2971 }
2972
0de7acce
LP
2973 r = mount_all(directory,
2974 arg_userns_mode != USER_NAMESPACE_NO,
2975 false,
2976 arg_private_network,
2977 arg_uid_shift,
2978 arg_uid_range,
2979 arg_selinux_apifs_context);
03cfe0d5
LP
2980 if (r < 0)
2981 return r;
2982
07fa00f9
LP
2983 r = copy_devnodes(directory);
2984 if (r < 0)
03cfe0d5
LP
2985 return r;
2986
2987 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2988
07fa00f9
LP
2989 r = setup_pts(directory);
2990 if (r < 0)
03cfe0d5
LP
2991 return r;
2992
2993 r = setup_propagate(directory);
2994 if (r < 0)
2995 return r;
2996
2997 r = setup_dev_console(directory, console);
2998 if (r < 0)
2999 return r;
3000
520e0d54 3001 r = setup_seccomp(arg_caps_retain);
03cfe0d5
LP
3002 if (r < 0)
3003 return r;
3004
3005 r = setup_timezone(directory);
3006 if (r < 0)
3007 return r;
3008
3009 r = setup_resolv_conf(directory);
3010 if (r < 0)
3011 return r;
3012
e01ff70a
MS
3013 r = setup_machine_id(directory);
3014 if (r < 0)
3015 return r;
3016
03cfe0d5
LP
3017 r = setup_journal(directory);
3018 if (r < 0)
3019 return r;
3020
0de7acce
LP
3021 r = mount_custom(
3022 directory,
3023 arg_custom_mounts,
3024 arg_n_custom_mounts,
3025 arg_userns_mode != USER_NAMESPACE_NO,
3026 arg_uid_shift,
3027 arg_uid_range,
3028 arg_selinux_apifs_context);
03cfe0d5
LP
3029 if (r < 0)
3030 return r;
3031
0996ef00
CB
3032 if (!cg_ns_supported()) {
3033 r = mount_cgroups(
3034 directory,
3035 arg_unified_cgroup_hierarchy,
3036 arg_userns_mode != USER_NAMESPACE_NO,
3037 arg_uid_shift,
3038 arg_uid_range,
3039 arg_selinux_apifs_context);
3040 if (r < 0)
3041 return r;
3042 }
03cfe0d5
LP
3043
3044 r = mount_move_root(directory);
3045 if (r < 0)
3046 return log_error_errno(r, "Failed to move root directory: %m");
3047
9c1e04d0
AP
3048 fd = setup_sd_notify_child();
3049 if (fd < 0)
3050 return fd;
3051
03cfe0d5
LP
3052 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3053 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
3054 (arg_private_network ? CLONE_NEWNET : 0) |
8869a0b4 3055 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
03cfe0d5
LP
3056 if (pid < 0)
3057 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3058 if (pid == 0) {
3059 pid_socket = safe_close(pid_socket);
e01ff70a 3060 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3061 notify_socket = safe_close(notify_socket);
825d5287 3062 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3063
3064 /* The inner child has all namespaces that are
3065 * requested, so that we all are owned by the user if
3066 * user namespaces are turned on. */
3067
f757855e 3068 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3069 if (r < 0)
3070 _exit(EXIT_FAILURE);
3071
3072 _exit(EXIT_SUCCESS);
3073 }
3074
3075 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3076 if (l < 0)
3077 return log_error_errno(errno, "Failed to send PID: %m");
3078 if (l != sizeof(pid)) {
3079 log_error("Short write while sending PID.");
3080 return -EIO;
3081 }
3082
e01ff70a
MS
3083 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3084 if (l < 0)
3085 return log_error_errno(errno, "Failed to send machine ID: %m");
3086 if (l != sizeof(arg_uuid)) {
3087 log_error("Short write while sending machine ID.");
3088 return -EIO;
3089 }
3090
9c1e04d0
AP
3091 l = send_one_fd(notify_socket, fd, 0);
3092 if (l < 0)
3093 return log_error_errno(errno, "Failed to send notify fd: %m");
3094
03cfe0d5 3095 pid_socket = safe_close(pid_socket);
e01ff70a 3096 uuid_socket = safe_close(uuid_socket);
9c1e04d0 3097 notify_socket = safe_close(notify_socket);
327e26d6
KN
3098 kmsg_socket = safe_close(kmsg_socket);
3099 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
3100
3101 return 0;
3102}
3103
0e7ac751
LP
3104static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3105 unsigned n_tries = 100;
3106 uid_t candidate;
3107 int r;
3108
3109 assert(shift);
3110 assert(ret_lock_file);
0de7acce 3111 assert(arg_userns_mode == USER_NAMESPACE_PICK);
0e7ac751
LP
3112 assert(arg_uid_range == 0x10000U);
3113
3114 candidate = *shift;
3115
3116 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3117
3118 for (;;) {
3119 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3120 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3121
3122 if (--n_tries <= 0)
3123 return -EBUSY;
3124
3125 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3126 goto next;
3127 if ((candidate & UINT32_C(0xFFFF)) != 0)
3128 goto next;
3129
3130 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3131 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3132 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3133 goto next;
3134 if (r < 0)
3135 return r;
3136
3137 /* Make some superficial checks whether the range is currently known in the user database */
3138 if (getpwuid(candidate))
3139 goto next;
3140 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3141 goto next;
3142 if (getgrgid(candidate))
3143 goto next;
3144 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3145 goto next;
3146
3147 *ret_lock_file = lf;
3148 lf = (struct LockFile) LOCK_FILE_INIT;
3149 *shift = candidate;
3150 return 0;
3151
3152 next:
3153 random_bytes(&candidate, sizeof(candidate));
3154 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3155 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3156 }
3157}
3158
03cfe0d5
LP
3159static int setup_uid_map(pid_t pid) {
3160 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3161 int r;
3162
3163 assert(pid > 1);
3164
3165 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3166 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3167 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3168 if (r < 0)
3169 return log_error_errno(r, "Failed to write UID map: %m");
3170
3171 /* We always assign the same UID and GID ranges */
3172 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3173 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3174 if (r < 0)
3175 return log_error_errno(r, "Failed to write GID map: %m");
3176
3177 return 0;
3178}
3179
9c1e04d0 3180static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
9c1e04d0
AP
3181 char buf[NOTIFY_BUFFER_MAX+1];
3182 char *p = NULL;
3183 struct iovec iovec = {
3184 .iov_base = buf,
3185 .iov_len = sizeof(buf)-1,
3186 };
3187 union {
3188 struct cmsghdr cmsghdr;
3189 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3190 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3191 } control = {};
3192 struct msghdr msghdr = {
3193 .msg_iov = &iovec,
3194 .msg_iovlen = 1,
3195 .msg_control = &control,
3196 .msg_controllen = sizeof(control),
3197 };
3198 struct cmsghdr *cmsg;
3199 struct ucred *ucred = NULL;
3200 ssize_t n;
3201 pid_t inner_child_pid;
3202 _cleanup_strv_free_ char **tags = NULL;
3203
3204 assert(userdata);
3205
3206 inner_child_pid = PTR_TO_PID(userdata);
3207
3208 if (revents != EPOLLIN) {
3209 log_warning("Got unexpected poll event for notify fd.");
3210 return 0;
3211 }
3212
3213 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3214 if (n < 0) {
3215 if (errno == EAGAIN || errno == EINTR)
3216 return 0;
3217
3218 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3219 }
3220 cmsg_close_all(&msghdr);
3221
3222 CMSG_FOREACH(cmsg, &msghdr) {
3223 if (cmsg->cmsg_level == SOL_SOCKET &&
3224 cmsg->cmsg_type == SCM_CREDENTIALS &&
3225 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3226
3227 ucred = (struct ucred*) CMSG_DATA(cmsg);
3228 }
3229 }
3230
3231 if (!ucred || ucred->pid != inner_child_pid) {
3232 log_warning("Received notify message without valid credentials. Ignoring.");
3233 return 0;
3234 }
3235
3236 if ((size_t) n >= sizeof(buf)) {
3237 log_warning("Received notify message exceeded maximum size. Ignoring.");
3238 return 0;
3239 }
3240
3241 buf[n] = 0;
3242 tags = strv_split(buf, "\n\r");
3243 if (!tags)
3244 return log_oom();
3245
3246 if (strv_find(tags, "READY=1"))
3247 sd_notifyf(false, "READY=1\n");
3248
3249 p = strv_find_startswith(tags, "STATUS=");
3250 if (p)
3251 sd_notifyf(false, "STATUS=Container running: %s", p);
3252
3253 return 0;
3254}
3255
3256static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3257 int r;
3258 sd_event_source *notify_event_source;
3259
3260 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3261 if (r < 0)
3262 return log_error_errno(r, "Failed to allocate notify event source: %m");
3263
3264 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3265
3266 return 0;
3267}
3268
f757855e
LP
3269static int load_settings(void) {
3270 _cleanup_(settings_freep) Settings *settings = NULL;
3271 _cleanup_fclose_ FILE *f = NULL;
3272 _cleanup_free_ char *p = NULL;
3273 const char *fn, *i;
3274 int r;
3275
3276 /* If all settings are masked, there's no point in looking for
3277 * the settings file */
3278 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3279 return 0;
3280
3281 fn = strjoina(arg_machine, ".nspawn");
3282
3283 /* We first look in the admin's directories in /etc and /run */
3284 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3285 _cleanup_free_ char *j = NULL;
3286
3287 j = strjoin(i, "/", fn, NULL);
3288 if (!j)
3289 return log_oom();
3290
3291 f = fopen(j, "re");
3292 if (f) {
3293 p = j;
3294 j = NULL;
3295
b938cb90 3296 /* By default, we trust configuration from /etc and /run */
f757855e
LP
3297 if (arg_settings_trusted < 0)
3298 arg_settings_trusted = true;
3299
3300 break;
3301 }
3302
3303 if (errno != ENOENT)
3304 return log_error_errno(errno, "Failed to open %s: %m", j);
3305 }
3306
3307 if (!f) {
3308 /* After that, let's look for a file next to the
3309 * actual image we shall boot. */
3310
3311 if (arg_image) {
3312 p = file_in_same_dir(arg_image, fn);
3313 if (!p)
3314 return log_oom();
3315 } else if (arg_directory) {
3316 p = file_in_same_dir(arg_directory, fn);
3317 if (!p)
3318 return log_oom();
3319 }
3320
3321 if (p) {
3322 f = fopen(p, "re");
3323 if (!f && errno != ENOENT)
3324 return log_error_errno(errno, "Failed to open %s: %m", p);
3325
b938cb90 3326 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
3327 if (arg_settings_trusted < 0)
3328 arg_settings_trusted = false;
3329 }
3330 }
3331
3332 if (!f)
3333 return 0;
3334
3335 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3336
3337 r = settings_load(f, p, &settings);
3338 if (r < 0)
3339 return r;
3340
3341 /* Copy over bits from the settings, unless they have been
3342 * explicitly masked by command line switches. */
3343
7732f92b
LP
3344 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3345 settings->start_mode >= 0) {
3346 arg_start_mode = settings->start_mode;
f757855e
LP
3347
3348 strv_free(arg_parameters);
3349 arg_parameters = settings->parameters;
3350 settings->parameters = NULL;
3351 }
3352
5f932eb9
LP
3353 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3354 settings->working_directory) {
3355 free(arg_chdir);
3356 arg_chdir = settings->working_directory;
3357 settings->working_directory = NULL;
3358 }
3359
f757855e
LP
3360 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3361 settings->environment) {
3362 strv_free(arg_setenv);
3363 arg_setenv = settings->environment;
3364 settings->environment = NULL;
3365 }
3366
3367 if ((arg_settings_mask & SETTING_USER) == 0 &&
3368 settings->user) {
3369 free(arg_user);
3370 arg_user = settings->user;
3371 settings->user = NULL;
3372 }
3373
3374 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 3375 uint64_t plus;
f757855e 3376
0e265674
LP
3377 plus = settings->capability;
3378 if (settings_private_network(settings))
3379 plus |= (1ULL << CAP_NET_ADMIN);
3380
3381 if (!arg_settings_trusted && plus != 0) {
3382 if (settings->capability != 0)
3383 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3384 } else
520e0d54 3385 arg_caps_retain |= plus;
f757855e 3386
520e0d54 3387 arg_caps_retain &= ~settings->drop_capability;
f757855e
LP
3388 }
3389
3390 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3391 settings->kill_signal > 0)
3392 arg_kill_signal = settings->kill_signal;
3393
3394 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3395 settings->personality != PERSONALITY_INVALID)
3396 arg_personality = settings->personality;
3397
3398 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3399 !sd_id128_is_null(settings->machine_id)) {
3400
3401 if (!arg_settings_trusted)
3402 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3403 else
3404 arg_uuid = settings->machine_id;
3405 }
3406
3407 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3408 settings->read_only >= 0)
3409 arg_read_only = settings->read_only;
3410
3411 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3412 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3413 arg_volatile_mode = settings->volatile_mode;
3414
3415 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3416 settings->n_custom_mounts > 0) {
3417
3418 if (!arg_settings_trusted)
3419 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3420 else {
3421 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3422 arg_custom_mounts = settings->custom_mounts;
3423 arg_n_custom_mounts = settings->n_custom_mounts;
3424
3425 settings->custom_mounts = NULL;
3426 settings->n_custom_mounts = 0;
3427 }
3428 }
3429
3430 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3431 (settings->private_network >= 0 ||
3432 settings->network_veth >= 0 ||
3433 settings->network_bridge ||
22b28dfd 3434 settings->network_zone ||
f757855e
LP
3435 settings->network_interfaces ||
3436 settings->network_macvlan ||
f6d6bad1
LP
3437 settings->network_ipvlan ||
3438 settings->network_veth_extra)) {
f757855e
LP
3439
3440 if (!arg_settings_trusted)
3441 log_warning("Ignoring network settings, file %s is not trusted.", p);
3442 else {
f6d6bad1 3443 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3444 arg_private_network = settings_private_network(settings);
3445
f757855e
LP
3446 strv_free(arg_network_interfaces);
3447 arg_network_interfaces = settings->network_interfaces;
3448 settings->network_interfaces = NULL;
3449
3450 strv_free(arg_network_macvlan);
3451 arg_network_macvlan = settings->network_macvlan;
3452 settings->network_macvlan = NULL;
3453
3454 strv_free(arg_network_ipvlan);
3455 arg_network_ipvlan = settings->network_ipvlan;
3456 settings->network_ipvlan = NULL;
3457
f6d6bad1
LP
3458 strv_free(arg_network_veth_extra);
3459 arg_network_veth_extra = settings->network_veth_extra;
3460 settings->network_veth_extra = NULL;
3461
f757855e
LP
3462 free(arg_network_bridge);
3463 arg_network_bridge = settings->network_bridge;
3464 settings->network_bridge = NULL;
22b28dfd
LP
3465
3466 free(arg_network_zone);
3467 arg_network_zone = settings->network_zone;
3468 settings->network_zone = NULL;
f757855e
LP
3469 }
3470 }
3471
3472 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3473 settings->expose_ports) {
3474
3475 if (!arg_settings_trusted)
3476 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3477 else {
3478 expose_port_free_all(arg_expose_ports);
3479 arg_expose_ports = settings->expose_ports;
3480 settings->expose_ports = NULL;
3481 }
3482 }
3483
0de7acce
LP
3484 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3485 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3486
3487 if (!arg_settings_trusted)
3488 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3489 else {
3490 arg_userns_mode = settings->userns_mode;
3491 arg_uid_shift = settings->uid_shift;
3492 arg_uid_range = settings->uid_range;
3493 arg_userns_chown = settings->userns_chown;
3494 }
3495 }
3496
9c1e04d0
AP
3497 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3498 arg_notify_ready = settings->notify_ready;
3499
f757855e
LP
3500 return 0;
3501}
3502
03cfe0d5
LP
3503int main(int argc, char *argv[]) {
3504
a6bc7db9 3505 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
03cfe0d5
LP
3506 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3507 _cleanup_close_ int master = -1, image_fd = -1;
3508 _cleanup_fdset_free_ FDSet *fds = NULL;
3509 int r, n_fd_passed, loop_nr = -1;
5aa3eba5 3510 char veth_name[IFNAMSIZ] = "";
03cfe0d5 3511 bool secondary = false, remove_subvol = false;
72c0a2c2 3512 sigset_t mask_chld;
03cfe0d5
LP
3513 pid_t pid = 0;
3514 int ret = EXIT_SUCCESS;
3515 union in_addr_union exposed = {};
3516 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
7513c5b8 3517 bool interactive, veth_created = false;
03cfe0d5
LP
3518
3519 log_parse_environment();
3520 log_open();
3521
7732f92b
LP
3522 /* Make sure rename_process() in the stub init process can work */
3523 saved_argv = argv;
3524 saved_argc = argc;
3525
03cfe0d5
LP
3526 r = parse_argv(argc, argv);
3527 if (r <= 0)
3528 goto finish;
3529
03cfe0d5
LP
3530 if (geteuid() != 0) {
3531 log_error("Need to be root.");
3532 r = -EPERM;
3533 goto finish;
3534 }
f757855e
LP
3535 r = determine_names();
3536 if (r < 0)
3537 goto finish;
3538
3539 r = load_settings();
3540 if (r < 0)
3541 goto finish;
3542
3543 r = verify_arguments();
3544 if (r < 0)
3545 goto finish;
03cfe0d5
LP
3546
3547 n_fd_passed = sd_listen_fds(false);
3548 if (n_fd_passed > 0) {
3549 r = fdset_new_listen_fds(&fds, false);
3550 if (r < 0) {
3551 log_error_errno(r, "Failed to collect file descriptors: %m");
3552 goto finish;
3553 }
3554 }
3555
3556 if (arg_directory) {
3557 assert(!arg_image);
3558
3559 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3560 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3561 r = -EINVAL;
3562 goto finish;
3563 }
3564
3565 if (arg_ephemeral) {
3566 _cleanup_free_ char *np = NULL;
3567
3568 /* If the specified path is a mount point we
3569 * generate the new snapshot immediately
3570 * inside it under a random name. However if
3571 * the specified is not a mount point we
3572 * create the new snapshot in the parent
3573 * directory, just next to it. */
e26d6ce5 3574 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3575 if (r < 0) {
3576 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3577 goto finish;
3578 }
3579 if (r > 0)
770b5ce4 3580 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3581 else
770b5ce4 3582 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3583 if (r < 0) {
3584 log_error_errno(r, "Failed to generate name for snapshot: %m");
3585 goto finish;
3586 }
3587
3588 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3589 if (r < 0) {
3590 log_error_errno(r, "Failed to lock %s: %m", np);
3591 goto finish;
3592 }
3593
5bcd08db 3594 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3595 if (r < 0) {
3596 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3597 goto finish;
ec16945e
LP
3598 }
3599
3600 free(arg_directory);
3601 arg_directory = np;
8a16a7b4 3602 np = NULL;
ec16945e
LP
3603
3604 remove_subvol = true;
30535c16
LP
3605
3606 } else {
3607 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3608 if (r == -EBUSY) {
3609 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3610 goto finish;
3611 }
3612 if (r < 0) {
3613 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
476b8254 3614 goto finish;
30535c16
LP
3615 }
3616
3617 if (arg_template) {
5bcd08db 3618 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3619 if (r == -EEXIST) {
3620 if (!arg_quiet)
3621 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3622 } else if (r < 0) {
83521414 3623 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3624 goto finish;
3625 } else {
3626 if (!arg_quiet)
3627 log_info("Populated %s from template %s.", arg_directory, arg_template);
3628 }
3629 }
ec16945e
LP
3630 }
3631
7732f92b 3632 if (arg_start_mode == START_BOOT) {
1b9e5b12 3633 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3634 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3635 r = -EINVAL;
1b9e5b12
LP
3636 goto finish;
3637 }
3638 } else {
3639 const char *p;
3640
16fb773e
LP
3641 p = strjoina(arg_directory, "/usr/");
3642 if (laccess(p, F_OK) < 0) {
3643 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3644 r = -EINVAL;
1b9e5b12 3645 goto finish;
1b9e5b12
LP
3646 }
3647 }
ec16945e 3648
6b9132a9 3649 } else {
1b9e5b12 3650 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3651
ec16945e
LP
3652 assert(arg_image);
3653 assert(!arg_template);
3654
30535c16
LP
3655 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3656 if (r == -EBUSY) {
3657 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3658 goto finish;
3659 }
3660 if (r < 0) {
3661 r = log_error_errno(r, "Failed to create image lock: %m");
3662 goto finish;
3663 }
3664
1b9e5b12 3665 if (!mkdtemp(template)) {
56f64d95 3666 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3667 r = -errno;
6b9132a9 3668 goto finish;
1b9e5b12 3669 }
6b9132a9 3670
1b9e5b12
LP
3671 arg_directory = strdup(template);
3672 if (!arg_directory) {
3673 r = log_oom();
3674 goto finish;
6b9132a9 3675 }
88213476 3676
1b9e5b12
LP
3677 image_fd = setup_image(&device_path, &loop_nr);
3678 if (image_fd < 0) {
3679 r = image_fd;
842f3b0f
LP
3680 goto finish;
3681 }
1b9e5b12 3682
4d9f07b4
LP
3683 r = dissect_image(image_fd,
3684 &root_device, &root_device_rw,
3685 &home_device, &home_device_rw,
3686 &srv_device, &srv_device_rw,
a6bc7db9 3687 &esp_device,
4d9f07b4 3688 &secondary);
1b9e5b12
LP
3689 if (r < 0)
3690 goto finish;
842f3b0f 3691 }
842f3b0f 3692
5a8af538
LP
3693 r = custom_mounts_prepare();
3694 if (r < 0)
3695 goto finish;
3696
03cfe0d5
LP
3697 interactive =
3698 isatty(STDIN_FILENO) > 0 &&
3699 isatty(STDOUT_FILENO) > 0;
9c857b9d 3700
db7feb7e
LP
3701 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3702 if (master < 0) {
ec16945e 3703 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3704 goto finish;
3705 }
3706
611b312b
LP
3707 r = ptsname_malloc(master, &console);
3708 if (r < 0) {
3709 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26 3710 goto finish;
68b02049
DW
3711 }
3712
3713 if (arg_selinux_apifs_context) {
3714 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3715 if (r < 0)
3716 goto finish;
a258bf26
LP
3717 }
3718
a258bf26 3719 if (unlockpt(master) < 0) {
ec16945e 3720 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3721 goto finish;
3722 }
3723
9c857b9d
LP
3724 if (!arg_quiet)
3725 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3726 arg_machine, arg_image ?: arg_directory);
3727
72c0a2c2 3728 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3729
023fb90b
LP
3730 assert_se(sigemptyset(&mask_chld) == 0);
3731 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3732
03cfe0d5
LP
3733 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3734 r = log_error_errno(errno, "Failed to become subreaper: %m");
3735 goto finish;
3736 }
3737
d87be9b0 3738 for (;;) {
03cfe0d5 3739 static const struct sigaction sa = {
189d5bac 3740 .sa_handler = nop_signal_handler,
e866af3a
DH
3741 .sa_flags = SA_NOCLDSTOP,
3742 };
0e7ac751
LP
3743
3744 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3745 _cleanup_close_ int etc_passwd_lock = -1;
3746 _cleanup_close_pair_ int
3747 kmsg_socket_pair[2] = { -1, -1 },
3748 rtnl_socket_pair[2] = { -1, -1 },
3749 pid_socket_pair[2] = { -1, -1 },
3750 uuid_socket_pair[2] = { -1, -1 },
9c1e04d0 3751 notify_socket_pair[2] = { -1, -1 },
0e7ac751 3752 uid_shift_socket_pair[2] = { -1, -1 };
9c1e04d0 3753 _cleanup_close_ int notify_socket= -1;
0e7ac751 3754 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4afd3348 3755 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3756 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3757 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
0e7ac751 3758 ContainerStatus container_status;
dbb60d69 3759 char last_char = 0;
0e7ac751
LP
3760 int ifi = 0;
3761 ssize_t l;
3762
0de7acce 3763 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3764 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3765 * check with getpwuid() if the specific user already exists. Note that /etc might be
3766 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3767 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3768 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3769 * really ours. */
3770
3771 etc_passwd_lock = take_etc_passwd_lock(NULL);
3772 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3773 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3774 goto finish;
3775 }
3776 }
e866af3a 3777
7566e267 3778 r = barrier_create(&barrier);
a2da110b 3779 if (r < 0) {
da927ba9 3780 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3781 goto finish;
3782 }
3783
4610de50 3784 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3785 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3786 goto finish;
3787 }
3788
4610de50 3789 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3790 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3791 goto finish;
3792 }
3793
4610de50 3794 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3795 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3796 goto finish;
3797 }
3798
e01ff70a
MS
3799 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3800 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3801 goto finish;
3802 }
3803
9c1e04d0
AP
3804 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
3805 r = log_error_errno(errno, "Failed to create notify socket pair: %m");
3806 goto finish;
3807 }
3808
0de7acce 3809 if (arg_userns_mode != USER_NAMESPACE_NO)
4610de50 3810 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3811 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3812 goto finish;
3813 }
3814
e866af3a
DH
3815 /* Child can be killed before execv(), so handle SIGCHLD
3816 * in order to interrupt parent's blocking calls and
3817 * give it a chance to call wait() and terminate. */
3818 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3819 if (r < 0) {
ec16945e 3820 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3821 goto finish;
3822 }
3823
e866af3a
DH
3824 r = sigaction(SIGCHLD, &sa, NULL);
3825 if (r < 0) {
ec16945e 3826 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3827 goto finish;
3828 }
3829
8869a0b4 3830 pid = raw_clone(SIGCHLD|CLONE_NEWNS);
d87be9b0
LP
3831 if (pid < 0) {
3832 if (errno == EINVAL)
ec16945e 3833 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3834 else
ec16945e 3835 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3836
d87be9b0
LP
3837 goto finish;
3838 }
a258bf26 3839
d87be9b0 3840 if (pid == 0) {
03cfe0d5 3841 /* The outer child only has a file system namespace. */
a2da110b
DH
3842 barrier_set_role(&barrier, BARRIER_CHILD);
3843
03e334a1 3844 master = safe_close(master);
a258bf26 3845
03e334a1 3846 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3847 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3848 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
e01ff70a 3849 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
9c1e04d0 3850 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
825d5287 3851 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3852
ce30c8dc
LP
3853 (void) reset_all_signal_handlers();
3854 (void) reset_signal_mask();
f5c1b9ee 3855
03cfe0d5
LP
3856 r = outer_child(&barrier,
3857 arg_directory,
3858 console,
3859 root_device, root_device_rw,
3860 home_device, home_device_rw,
3861 srv_device, srv_device_rw,
a6bc7db9 3862 esp_device,
03cfe0d5
LP
3863 interactive,
3864 secondary,
3865 pid_socket_pair[1],
e01ff70a 3866 uuid_socket_pair[1],
9c1e04d0 3867 notify_socket_pair[1],
03cfe0d5
LP
3868 kmsg_socket_pair[1],
3869 rtnl_socket_pair[1],
825d5287 3870 uid_shift_socket_pair[1],
f757855e 3871 fds);
0cb9fbcd 3872 if (r < 0)
a2da110b 3873 _exit(EXIT_FAILURE);
d87be9b0 3874
03cfe0d5 3875 _exit(EXIT_SUCCESS);
da5b3bad 3876 }
88213476 3877
a2da110b 3878 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3879
2feceb5e 3880 fds = fdset_free(fds);
842f3b0f 3881
6d0b55c2
LP
3882 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3883 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3884 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
e01ff70a 3885 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
9c1e04d0 3886 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
82116c43 3887 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3888
0de7acce 3889 if (arg_userns_mode != USER_NAMESPACE_NO) {
0e7ac751
LP
3890 /* The child just let us know the UID shift it might have read from the image. */
3891 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3892 if (l < 0) {
3893 r = log_error_errno(errno, "Failed to read UID shift: %m");
3894 goto finish;
3895 }
3896 if (l != sizeof(arg_uid_shift)) {
3897 log_error("Short read while reading UID shift.");
3898 r = EIO;
3899 goto finish;
3900 }
3901
0de7acce 3902 if (arg_userns_mode == USER_NAMESPACE_PICK) {
0e7ac751
LP
3903 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3904 * image, but if that's already in use, pick a new one, and report back to the child,
3905 * which one we now picked. */
3906
3907 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3908 if (r < 0) {
3909 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3910 goto finish;
3911 }
3912
3913 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3914 if (l < 0) {
3915 r = log_error_errno(errno, "Failed to send UID shift: %m");
3916 goto finish;
3917 }
3918 if (l != sizeof(arg_uid_shift)) {
3919 log_error("Short write while writing UID shift.");
3920 r = -EIO;
3921 goto finish;
3922 }
3923 }
3924 }
3925
03cfe0d5
LP
3926 /* Wait for the outer child. */
3927 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3928 if (r < 0)
3929 goto finish;
3930 if (r != 0) {
3931 r = -EIO;
3932 goto finish;
3933 }
3934 pid = 0;
6dac160c 3935
03cfe0d5
LP
3936 /* And now retrieve the PID of the inner child. */
3937 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3938 if (l < 0) {
3939 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3940 goto finish;
3941 }
3942 if (l != sizeof(pid)) {
76d44882 3943 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3944 r = EIO;
3945 goto finish;
3946 }
354bfd2b 3947
e01ff70a
MS
3948 /* We also retrieve container UUID in case it was generated by outer child */
3949 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3950 if (l < 0) {
3951 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3952 goto finish;
3953 }
3954 if (l != sizeof(arg_uuid)) {
3955 log_error("Short read while reading container machined ID.");
3956 r = EIO;
3957 goto finish;
3958 }
3959
9c1e04d0
AP
3960 /* We also retrieve the socket used for notifications generated by outer child */
3961 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3962 if (notify_socket < 0) {
3963 r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
3964 goto finish;
3965 }
3966
03cfe0d5 3967 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3968
0de7acce 3969 if (arg_userns_mode != USER_NAMESPACE_NO) {
03cfe0d5
LP
3970 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3971 log_error("Child died too early.");
3972 r = -ESRCH;
840295fc 3973 goto finish;
03cfe0d5 3974 }
ab046dde 3975
03cfe0d5 3976 r = setup_uid_map(pid);
840295fc
LP
3977 if (r < 0)
3978 goto finish;
ab046dde 3979
03cfe0d5
LP
3980 (void) barrier_place(&barrier); /* #2 */
3981 }
c74e630d 3982
9a2a5625 3983 if (arg_private_network) {
4bbfe7ad 3984
9a2a5625
LP
3985 r = move_network_interfaces(pid, arg_network_interfaces);
3986 if (r < 0)
3987 goto finish;
5aa4bb6b 3988
9a2a5625 3989 if (arg_network_veth) {
22b28dfd
LP
3990 r = setup_veth(arg_machine, pid, veth_name,
3991 arg_network_bridge || arg_network_zone);
9a2a5625
LP
3992 if (r < 0)
3993 goto finish;
3994 else if (r > 0)
3995 ifi = r;
6dac160c 3996
9a2a5625 3997 if (arg_network_bridge) {
22b28dfd
LP
3998 /* Add the interface to a bridge */
3999 r = setup_bridge(veth_name, arg_network_bridge, false);
4000 if (r < 0)
4001 goto finish;
4002 if (r > 0)
4003 ifi = r;
4004 } else if (arg_network_zone) {
4005 /* Add the interface to a bridge, possibly creating it */
4006 r = setup_bridge(veth_name, arg_network_zone, true);
9a2a5625
LP
4007 if (r < 0)
4008 goto finish;
4009 if (r > 0)
4010 ifi = r;
4011 }
4012 }
6dac160c 4013
f6d6bad1
LP
4014 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
4015 if (r < 0)
4016 goto finish;
4017
7513c5b8
LP
4018 /* We created the primary and extra veth links now; let's remember this, so that we know to
4019 remove them later on. Note that we don't bother with removing veth links that were created
4020 here when their setup failed half-way, because in that case the kernel should be able to
4021 remove them on its own, since they cannot be referenced by anything yet. */
4022 veth_created = true;
4023
9a2a5625
LP
4024 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
4025 if (r < 0)
4026 goto finish;
4027
4028 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
4029 if (r < 0)
4030 goto finish;
4031 }
6dac160c 4032
b7103bc5
LP
4033 if (arg_register) {
4034 r = register_machine(
4035 arg_machine,
4036 pid,
4037 arg_directory,
4038 arg_uuid,
4039 ifi,
4040 arg_slice,
4041 arg_custom_mounts, arg_n_custom_mounts,
4042 arg_kill_signal,
4043 arg_property,
6aadfa4c
ILG
4044 arg_keep_unit,
4045 arg_container_service_name);
b7103bc5
LP
4046 if (r < 0)
4047 goto finish;
4048 }
6dac160c 4049
34829a32 4050 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
4051 if (r < 0)
4052 goto finish;
4053
34829a32
LP
4054 if (arg_keep_unit) {
4055 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
4056 if (r < 0)
4057 goto finish;
4058 }
efdb0237 4059
34829a32 4060 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
4061 if (r < 0)
4062 goto finish;
6dac160c 4063
03cfe0d5
LP
4064 /* Notify the child that the parent is ready with all
4065 * its setup (including cgroup-ification), and that
4066 * the child can now hand over control to the code to
4067 * run inside the container. */
4068 (void) barrier_place(&barrier); /* #3 */
6dac160c 4069
03cfe0d5
LP
4070 /* Block SIGCHLD here, before notifying child.
4071 * process_pty() will handle it with the other signals. */
4072 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4073
03cfe0d5
LP
4074 /* Reset signal to default */
4075 r = default_signals(SIGCHLD, -1);
4076 if (r < 0) {
4077 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4078 goto finish;
4079 }
e866af3a 4080
9c1e04d0
AP
4081 r = sd_event_new(&event);
4082 if (r < 0) {
4083 log_error_errno(r, "Failed to get default event source: %m");
4084 goto finish;
4085 }
4086
4087 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
4088 if (r < 0)
4089 goto finish;
4090
03cfe0d5 4091 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
4092 if (!barrier_place_and_sync(&barrier)) { /* #4 */
4093 log_error("Child died too early.");
03cfe0d5
LP
4094 r = -ESRCH;
4095 goto finish;
4096 }
b12afc8c 4097
0e7ac751
LP
4098 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
4099 * in getpwuid(), thus we can release the /etc/passwd lock. */
4100 etc_passwd_lock = safe_close(etc_passwd_lock);
4101
03cfe0d5 4102 sd_notifyf(false,
03cfe0d5
LP
4103 "STATUS=Container running.\n"
4104 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
9c1e04d0
AP
4105 if (!arg_notify_ready)
4106 sd_notify(false, "READY=1\n");
88213476 4107
03cfe0d5
LP
4108 if (arg_kill_signal > 0) {
4109 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
4110 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
4111 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
4112 } else {
4113 /* Immediately exit */
4114 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4115 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4116 }
023fb90b 4117
03cfe0d5
LP
4118 /* simply exit on sigchld */
4119 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4120
03cfe0d5 4121 if (arg_expose_ports) {
7a8f6325 4122 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
4123 if (r < 0)
4124 goto finish;
023fb90b 4125
7a8f6325 4126 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 4127 }
023fb90b 4128
03cfe0d5 4129 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4130
ae3dde80 4131 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
4132 if (r < 0) {
4133 log_error_errno(r, "Failed to create PTY forwarder: %m");
4134 goto finish;
4135 }
023fb90b 4136
03cfe0d5
LP
4137 r = sd_event_loop(event);
4138 if (r < 0) {
4139 log_error_errno(r, "Failed to run event loop: %m");
4140 goto finish;
4141 }
6d0b55c2 4142
03cfe0d5 4143 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 4144
03cfe0d5 4145 forward = pty_forward_free(forward);
6d0b55c2 4146
03cfe0d5
LP
4147 if (!arg_quiet && last_char != '\n')
4148 putc('\n', stdout);
04d39279 4149
03cfe0d5 4150 /* Kill if it is not dead yet anyway */
b7103bc5
LP
4151 if (arg_register && !arg_keep_unit)
4152 terminate_machine(pid);
1f0cd86b 4153
840295fc 4154 /* Normally redundant, but better safe than sorry */
04d39279 4155 kill(pid, SIGKILL);
a258bf26 4156
113cea80 4157 r = wait_for_container(pid, &container_status);
04d39279
LP
4158 pid = 0;
4159
ec16945e 4160 if (r < 0)
ce9f1527
LP
4161 /* We failed to wait for the container, or the
4162 * container exited abnormally */
ec16945e 4163 goto finish;
9ed794a3 4164 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
ce9f1527
LP
4165 /* The container exited with a non-zero
4166 * status, or with zero status and no reboot
4167 * was requested. */
ec16945e 4168 ret = r;
d87be9b0 4169 break;
ec16945e 4170 }
88213476 4171
113cea80 4172 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4173
4174 if (arg_keep_unit) {
4175 /* Special handling if we are running as a
4176 * service: instead of simply restarting the
4177 * machine we want to restart the entire
4178 * service, so let's inform systemd about this
4179 * with the special exit code 133. The service
4180 * file uses RestartForceExitStatus=133 so
4181 * that this results in a full nspawn
4182 * restart. This is necessary since we might
4183 * have cgroup parameters set we want to have
4184 * flushed out. */
ec16945e
LP
4185 ret = 133;
4186 r = 0;
ce38dbc8
LP
4187 break;
4188 }
6d0b55c2 4189
7a8f6325 4190 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8 4191
ef3b2aa7 4192 (void) remove_veth_links(veth_name, arg_network_veth_extra);
7513c5b8 4193 veth_created = false;
d87be9b0 4194 }
88213476
LP
4195
4196finish:
af4ec430
LP
4197 sd_notify(false,
4198 "STOPPING=1\n"
4199 "STATUS=Terminating...");
4200
9444b1f2
LP
4201 if (pid > 0)
4202 kill(pid, SIGKILL);
88213476 4203
503546da
LP
4204 /* Try to flush whatever is still queued in the pty */
4205 if (master >= 0)
59f448cf 4206 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 4207
03cfe0d5
LP
4208 loop_remove(loop_nr, &image_fd);
4209
ec16945e
LP
4210 if (remove_subvol && arg_directory) {
4211 int k;
4212
5bcd08db 4213 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
4214 if (k < 0)
4215 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4216 }
4217
785890ac
LP
4218 if (arg_machine) {
4219 const char *p;
4220
63c372cb 4221 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4222 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4223 }
4224
7a8f6325 4225 expose_port_flush(arg_expose_ports, &exposed);
7513c5b8
LP
4226
4227 if (veth_created)
4228 (void) remove_veth_links(veth_name, arg_network_veth_extra);
22b28dfd 4229 (void) remove_bridge(arg_network_zone);
f757855e 4230
04d391da 4231 free(arg_directory);
ec16945e
LP
4232 free(arg_template);
4233 free(arg_image);
7027ff61 4234 free(arg_machine);
c74e630d 4235 free(arg_user);
5f932eb9 4236 free(arg_chdir);
c74e630d 4237 strv_free(arg_setenv);
f757855e 4238 free(arg_network_bridge);
c74e630d
LP
4239 strv_free(arg_network_interfaces);
4240 strv_free(arg_network_macvlan);
4bbfe7ad 4241 strv_free(arg_network_ipvlan);
f6d6bad1 4242 strv_free(arg_network_veth_extra);
f757855e
LP
4243 strv_free(arg_parameters);
4244 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4245 expose_port_free_all(arg_expose_ports);
6d0b55c2 4246
ec16945e 4247 return r < 0 ? EXIT_FAILURE : ret;
88213476 4248}