]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
util-lib: split out user/group/uid/gid calls into user-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e
LP
48
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
8fe0087e
LP
53#include "cap-list.h"
54#include "capability.h"
04d391da 55#include "cgroup-util.h"
8fe0087e 56#include "copy.h"
4fc9982c 57#include "dev-setup.h"
8fe0087e
LP
58#include "env-util.h"
59#include "event-util.h"
3ffd4af2 60#include "fd-util.h"
842f3b0f 61#include "fdset.h"
a5c32cff 62#include "fileio.h"
8fe0087e 63#include "formats-util.h"
1b9e5b12 64#include "gpt.h"
8fe0087e
LP
65#include "hostname-util.h"
66#include "log.h"
67#include "loopback-setup.h"
1b9cebf6 68#include "machine-image.h"
8fe0087e
LP
69#include "macro.h"
70#include "missing.h"
71#include "mkdir.h"
72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
77#include "nspawn-register.h"
78#include "nspawn-settings.h"
79#include "nspawn-setuid.h"
8fe0087e 80#include "path-util.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
84#include "rm-rf.h"
e9642be2
LP
85#ifdef HAVE_SECCOMP
86#include "seccomp-util.h"
87#endif
8fe0087e 88#include "signal-util.h"
07630cea 89#include "string-util.h"
8fe0087e
LP
90#include "strv.h"
91#include "terminal-util.h"
92#include "udev-util.h"
b1d4f8e1 93#include "user-util.h"
8fe0087e 94#include "util.h"
e9642be2 95
113cea80
DH
/* How a container run came to an end.
 * NOTE(review): the code consuming this enum lies outside this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
100
57fb9fb5
LP
/* Modes accepted by --link-journal= ("no", "auto", "host", "guest"; the
 * "try-" variants map to HOST/GUEST with arg_link_journal_try set). */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476
LP
107
108static char *arg_directory = NULL;
ec16945e 109static char *arg_template = NULL;
687d0825 110static char *arg_user = NULL;
9444b1f2 111static sd_id128_t arg_uuid = {};
7027ff61 112static char *arg_machine = NULL;
c74e630d
LP
113static const char *arg_selinux_context = NULL;
114static const char *arg_selinux_apifs_context = NULL;
9444b1f2 115static const char *arg_slice = NULL;
ff01d048 116static bool arg_private_network = false;
bc2f673e 117static bool arg_read_only = false;
0f0dbc46 118static bool arg_boot = false;
ec16945e 119static bool arg_ephemeral = false;
57fb9fb5 120static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 121static bool arg_link_journal_try = false;
5076f0cc
LP
122static uint64_t arg_retain =
123 (1ULL << CAP_CHOWN) |
124 (1ULL << CAP_DAC_OVERRIDE) |
125 (1ULL << CAP_DAC_READ_SEARCH) |
126 (1ULL << CAP_FOWNER) |
127 (1ULL << CAP_FSETID) |
128 (1ULL << CAP_IPC_OWNER) |
129 (1ULL << CAP_KILL) |
130 (1ULL << CAP_LEASE) |
131 (1ULL << CAP_LINUX_IMMUTABLE) |
132 (1ULL << CAP_NET_BIND_SERVICE) |
133 (1ULL << CAP_NET_BROADCAST) |
134 (1ULL << CAP_NET_RAW) |
135 (1ULL << CAP_SETGID) |
136 (1ULL << CAP_SETFCAP) |
137 (1ULL << CAP_SETPCAP) |
138 (1ULL << CAP_SETUID) |
139 (1ULL << CAP_SYS_ADMIN) |
140 (1ULL << CAP_SYS_CHROOT) |
141 (1ULL << CAP_SYS_NICE) |
142 (1ULL << CAP_SYS_PTRACE) |
143 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 144 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
145 (1ULL << CAP_SYS_BOOT) |
146 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
147 (1ULL << CAP_AUDIT_CONTROL) |
148 (1ULL << CAP_MKNOD);
5a8af538
LP
149static CustomMount *arg_custom_mounts = NULL;
150static unsigned arg_n_custom_mounts = 0;
f4889f65 151static char **arg_setenv = NULL;
284c0b91 152static bool arg_quiet = false;
8a96d94e 153static bool arg_share_system = false;
eb91eb18 154static bool arg_register = true;
89f7c846 155static bool arg_keep_unit = false;
aa28aefe 156static char **arg_network_interfaces = NULL;
c74e630d 157static char **arg_network_macvlan = NULL;
4bbfe7ad 158static char **arg_network_ipvlan = NULL;
69c79d3c 159static bool arg_network_veth = false;
f757855e 160static char *arg_network_bridge = NULL;
050f7277 161static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 162static char *arg_image = NULL;
f757855e 163static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 164static ExposePort *arg_expose_ports = NULL;
f36933fe 165static char **arg_property = NULL;
6dac160c
LP
166static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
167static bool arg_userns = false;
c6c8f6e2 168static int arg_kill_signal = 0;
efdb0237 169static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
170static SettingsMask arg_settings_mask = 0;
171static int arg_settings_trusted = -1;
172static char **arg_parameters = NULL;
88213476 173
601185b4 174static void help(void) {
88213476
LP
175 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
176 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
177 " -h --help Show this help\n"
178 " --version Print version string\n"
69c79d3c 179 " -q --quiet Do not show status information\n"
1b9e5b12 180 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
181 " --template=PATH Initialize root directory from template directory,\n"
182 " if missing\n"
183 " -x --ephemeral Run container with snapshot of root directory, and\n"
184 " remove it after exit\n"
185 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
186 " -b --boot Boot up full system (i.e. invoke init)\n"
187 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 188 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 189 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 190 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 191 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
192 " --private-users[=UIDBASE[:NUIDS]]\n"
193 " Run within user namespace\n"
69c79d3c
LP
194 " --private-network Disable network in container\n"
195 " --network-interface=INTERFACE\n"
196 " Assign an existing network interface to the\n"
197 " container\n"
c74e630d
LP
198 " --network-macvlan=INTERFACE\n"
199 " Create a macvlan network interface based on an\n"
200 " existing network interface to the container\n"
4bbfe7ad
TG
201 " --network-ipvlan=INTERFACE\n"
202 " Create a ipvlan network interface based on an\n"
203 " existing network interface to the container\n"
0dfaa006 204 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 205 " and container\n"
ab046dde 206 " --network-bridge=INTERFACE\n"
32457153 207 " Add a virtual ethernet connection between host\n"
ab046dde
TG
208 " and container and add it to an existing bridge on\n"
209 " the host\n"
6d0b55c2 210 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 211 " Expose a container IP port on the host\n"
82adf6af
LP
212 " -Z --selinux-context=SECLABEL\n"
213 " Set the SELinux security context to be used by\n"
214 " processes in the container\n"
215 " -L --selinux-apifs-context=SECLABEL\n"
216 " Set the SELinux security context to be used by\n"
217 " API/tmpfs file systems in the container\n"
a8828ed9
DW
218 " --capability=CAP In addition to the default, retain specified\n"
219 " capability\n"
220 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 221 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
222 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
223 " try-guest, try-host\n"
224 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 225 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
226 " --bind=PATH[:PATH[:OPTIONS]]\n"
227 " Bind mount a file or directory from the host into\n"
a8828ed9 228 " the container\n"
5e5bfa6e
EY
229 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
230 " Similar, but creates a read-only bind mount\n"
06c17c39 231 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
232 " --overlay=PATH[:PATH...]:PATH\n"
233 " Create an overlay mount from the host to \n"
234 " the container\n"
235 " --overlay-ro=PATH[:PATH...]:PATH\n"
236 " Similar, but creates a read-only overlay mount\n"
284c0b91 237 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 238 " --share-system Share system namespaces with host\n"
eb91eb18 239 " --register=BOOLEAN Register container as machine\n"
89f7c846 240 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 241 " the service unit nspawn is running in\n"
6d0b55c2 242 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 243 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 244 , program_invocation_short_name);
88213476
LP
245}
246
5a8af538
LP
247
248static int custom_mounts_prepare(void) {
249 unsigned i;
250 int r;
251
252 /* Ensure the mounts are applied prefix first. */
253 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
254
255 /* Allocate working directories for the overlay file systems that need it */
256 for (i = 0; i < arg_n_custom_mounts; i++) {
257 CustomMount *m = &arg_custom_mounts[i];
258
825d5287
RM
259 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
260 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
261 return -EINVAL;
262 }
263
5a8af538
LP
264 if (m->type != CUSTOM_MOUNT_OVERLAY)
265 continue;
266
267 if (m->work_dir)
268 continue;
269
270 if (m->read_only)
271 continue;
272
14bcf25c 273 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
274 if (r < 0)
275 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
276 }
277
278 return 0;
279}
280
efdb0237
LP
281static int detect_unified_cgroup_hierarchy(void) {
282 const char *e;
283 int r;
284
285 /* Allow the user to control whether the unified hierarchy is used */
286 e = getenv("UNIFIED_CGROUP_HIERARCHY");
287 if (e) {
288 r = parse_boolean(e);
289 if (r < 0)
290 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
291
292 arg_unified_cgroup_hierarchy = r;
293 return 0;
294 }
295
296 /* Otherwise inherit the default from the host system */
297 r = cg_unified();
298 if (r < 0)
299 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
300
301 arg_unified_cgroup_hierarchy = r;
302 return 0;
303}
304
88213476
LP
305static int parse_argv(int argc, char *argv[]) {
306
a41fe3a2 307 enum {
acbeb427
ZJS
308 ARG_VERSION = 0x100,
309 ARG_PRIVATE_NETWORK,
bc2f673e 310 ARG_UUID,
5076f0cc 311 ARG_READ_ONLY,
57fb9fb5 312 ARG_CAPABILITY,
420c7379 313 ARG_DROP_CAPABILITY,
17fe0523
LP
314 ARG_LINK_JOURNAL,
315 ARG_BIND,
f4889f65 316 ARG_BIND_RO,
06c17c39 317 ARG_TMPFS,
5a8af538
LP
318 ARG_OVERLAY,
319 ARG_OVERLAY_RO,
f4889f65 320 ARG_SETENV,
eb91eb18 321 ARG_SHARE_SYSTEM,
89f7c846 322 ARG_REGISTER,
aa28aefe 323 ARG_KEEP_UNIT,
69c79d3c 324 ARG_NETWORK_INTERFACE,
c74e630d 325 ARG_NETWORK_MACVLAN,
4bbfe7ad 326 ARG_NETWORK_IPVLAN,
ab046dde 327 ARG_NETWORK_BRIDGE,
6afc95b7 328 ARG_PERSONALITY,
4d9f07b4 329 ARG_VOLATILE,
ec16945e 330 ARG_TEMPLATE,
f36933fe 331 ARG_PROPERTY,
6dac160c 332 ARG_PRIVATE_USERS,
c6c8f6e2 333 ARG_KILL_SIGNAL,
f757855e 334 ARG_SETTINGS,
a41fe3a2
LP
335 };
336
88213476 337 static const struct option options[] = {
aa28aefe
LP
338 { "help", no_argument, NULL, 'h' },
339 { "version", no_argument, NULL, ARG_VERSION },
340 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
341 { "template", required_argument, NULL, ARG_TEMPLATE },
342 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
343 { "user", required_argument, NULL, 'u' },
344 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
345 { "boot", no_argument, NULL, 'b' },
346 { "uuid", required_argument, NULL, ARG_UUID },
347 { "read-only", no_argument, NULL, ARG_READ_ONLY },
348 { "capability", required_argument, NULL, ARG_CAPABILITY },
349 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
350 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
351 { "bind", required_argument, NULL, ARG_BIND },
352 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 353 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
354 { "overlay", required_argument, NULL, ARG_OVERLAY },
355 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
356 { "machine", required_argument, NULL, 'M' },
357 { "slice", required_argument, NULL, 'S' },
358 { "setenv", required_argument, NULL, ARG_SETENV },
359 { "selinux-context", required_argument, NULL, 'Z' },
360 { "selinux-apifs-context", required_argument, NULL, 'L' },
361 { "quiet", no_argument, NULL, 'q' },
362 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
363 { "register", required_argument, NULL, ARG_REGISTER },
364 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
365 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 366 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 367 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 368 { "network-veth", no_argument, NULL, 'n' },
ab046dde 369 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 370 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 371 { "image", required_argument, NULL, 'i' },
4d9f07b4 372 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 373 { "port", required_argument, NULL, 'p' },
f36933fe 374 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 375 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 376 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 377 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 378 {}
88213476
LP
379 };
380
9444b1f2 381 int c, r;
a42c8b54 382 uint64_t plus = 0, minus = 0;
f757855e 383 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
384
385 assert(argc >= 0);
386 assert(argv);
387
0dfaa006 388 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
389
390 switch (c) {
391
392 case 'h':
601185b4
ZJS
393 help();
394 return 0;
88213476 395
acbeb427 396 case ARG_VERSION:
3f6fd1ba 397 return version();
acbeb427 398
88213476 399 case 'D':
0f03c2a4 400 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 401 if (r < 0)
0f03c2a4 402 return r;
ec16945e
LP
403 break;
404
405 case ARG_TEMPLATE:
0f03c2a4 406 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 407 if (r < 0)
0f03c2a4 408 return r;
88213476
LP
409 break;
410
1b9e5b12 411 case 'i':
0f03c2a4 412 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 413 if (r < 0)
0f03c2a4 414 return r;
ec16945e
LP
415 break;
416
417 case 'x':
418 arg_ephemeral = true;
1b9e5b12
LP
419 break;
420
687d0825 421 case 'u':
2fc09a9c
DM
422 r = free_and_strdup(&arg_user, optarg);
423 if (r < 0)
7027ff61 424 return log_oom();
687d0825 425
f757855e 426 arg_settings_mask |= SETTING_USER;
687d0825
MV
427 break;
428
ab046dde 429 case ARG_NETWORK_BRIDGE:
f757855e
LP
430 r = free_and_strdup(&arg_network_bridge, optarg);
431 if (r < 0)
432 return log_oom();
ab046dde
TG
433
434 /* fall through */
435
0dfaa006 436 case 'n':
69c79d3c
LP
437 arg_network_veth = true;
438 arg_private_network = true;
f757855e 439 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
440 break;
441
aa28aefe 442 case ARG_NETWORK_INTERFACE:
c74e630d
LP
443 if (strv_extend(&arg_network_interfaces, optarg) < 0)
444 return log_oom();
445
446 arg_private_network = true;
f757855e 447 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
448 break;
449
450 case ARG_NETWORK_MACVLAN:
451 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
452 return log_oom();
453
4bbfe7ad 454 arg_private_network = true;
f757855e 455 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
456 break;
457
458 case ARG_NETWORK_IPVLAN:
459 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
460 return log_oom();
461
aa28aefe
LP
462 /* fall through */
463
ff01d048
LP
464 case ARG_PRIVATE_NETWORK:
465 arg_private_network = true;
f757855e 466 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
467 break;
468
0f0dbc46
LP
469 case 'b':
470 arg_boot = true;
f757855e 471 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
472 break;
473
144f0fc0 474 case ARG_UUID:
9444b1f2
LP
475 r = sd_id128_from_string(optarg, &arg_uuid);
476 if (r < 0) {
aa96c6cb 477 log_error("Invalid UUID: %s", optarg);
9444b1f2 478 return r;
aa96c6cb 479 }
f757855e
LP
480
481 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 482 break;
aa96c6cb 483
9444b1f2 484 case 'S':
c74e630d 485 arg_slice = optarg;
144f0fc0
LP
486 break;
487
7027ff61 488 case 'M':
c1521918 489 if (isempty(optarg))
97b11eed 490 arg_machine = mfree(arg_machine);
c1521918 491 else {
0c3c4284 492 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
493 log_error("Invalid machine name: %s", optarg);
494 return -EINVAL;
495 }
7027ff61 496
0c3c4284
LP
497 r = free_and_strdup(&arg_machine, optarg);
498 if (r < 0)
eb91eb18
LP
499 return log_oom();
500
501 break;
502 }
7027ff61 503
82adf6af
LP
504 case 'Z':
505 arg_selinux_context = optarg;
a8828ed9
DW
506 break;
507
82adf6af
LP
508 case 'L':
509 arg_selinux_apifs_context = optarg;
a8828ed9
DW
510 break;
511
bc2f673e
LP
512 case ARG_READ_ONLY:
513 arg_read_only = true;
f757855e 514 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
515 break;
516
420c7379
LP
517 case ARG_CAPABILITY:
518 case ARG_DROP_CAPABILITY: {
a2a5291b 519 const char *state, *word;
5076f0cc
LP
520 size_t length;
521
522 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 523 _cleanup_free_ char *t;
5076f0cc
LP
524
525 t = strndup(word, length);
0d0f0c50
SL
526 if (!t)
527 return log_oom();
5076f0cc 528
39ed67d1
LP
529 if (streq(t, "all")) {
530 if (c == ARG_CAPABILITY)
a42c8b54 531 plus = (uint64_t) -1;
39ed67d1 532 else
a42c8b54 533 minus = (uint64_t) -1;
39ed67d1 534 } else {
2822da4f
LP
535 int cap;
536
537 cap = capability_from_name(t);
538 if (cap < 0) {
39ed67d1
LP
539 log_error("Failed to parse capability %s.", t);
540 return -EINVAL;
541 }
542
543 if (c == ARG_CAPABILITY)
a42c8b54 544 plus |= 1ULL << (uint64_t) cap;
39ed67d1 545 else
a42c8b54 546 minus |= 1ULL << (uint64_t) cap;
5076f0cc 547 }
5076f0cc
LP
548 }
549
f757855e 550 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
551 break;
552 }
553
57fb9fb5
LP
554 case 'j':
555 arg_link_journal = LINK_GUEST;
574edc90 556 arg_link_journal_try = true;
57fb9fb5
LP
557 break;
558
559 case ARG_LINK_JOURNAL:
53e438e3 560 if (streq(optarg, "auto")) {
57fb9fb5 561 arg_link_journal = LINK_AUTO;
53e438e3
LP
562 arg_link_journal_try = false;
563 } else if (streq(optarg, "no")) {
57fb9fb5 564 arg_link_journal = LINK_NO;
53e438e3
LP
565 arg_link_journal_try = false;
566 } else if (streq(optarg, "guest")) {
57fb9fb5 567 arg_link_journal = LINK_GUEST;
53e438e3
LP
568 arg_link_journal_try = false;
569 } else if (streq(optarg, "host")) {
57fb9fb5 570 arg_link_journal = LINK_HOST;
53e438e3
LP
571 arg_link_journal_try = false;
572 } else if (streq(optarg, "try-guest")) {
574edc90
MP
573 arg_link_journal = LINK_GUEST;
574 arg_link_journal_try = true;
575 } else if (streq(optarg, "try-host")) {
576 arg_link_journal = LINK_HOST;
577 arg_link_journal_try = true;
578 } else {
57fb9fb5
LP
579 log_error("Failed to parse link journal mode %s", optarg);
580 return -EINVAL;
581 }
582
583 break;
584
17fe0523 585 case ARG_BIND:
f757855e
LP
586 case ARG_BIND_RO:
587 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
588 if (r < 0)
589 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 590
f757855e 591 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 592 break;
06c17c39 593
f757855e
LP
594 case ARG_TMPFS:
595 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
596 if (r < 0)
597 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 598
f757855e 599 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 600 break;
5a8af538
LP
601
602 case ARG_OVERLAY:
603 case ARG_OVERLAY_RO: {
604 _cleanup_free_ char *upper = NULL, *destination = NULL;
605 _cleanup_strv_free_ char **lower = NULL;
606 CustomMount *m;
607 unsigned n = 0;
608 char **i;
609
62f9f39a
RM
610 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
611 if (r == -ENOMEM)
06c17c39 612 return log_oom();
62f9f39a
RM
613 else if (r < 0) {
614 log_error("Invalid overlay specification: %s", optarg);
615 return r;
616 }
06c17c39 617
5a8af538
LP
618 STRV_FOREACH(i, lower) {
619 if (!path_is_absolute(*i)) {
620 log_error("Overlay path %s is not absolute.", *i);
621 return -EINVAL;
622 }
623
624 n++;
625 }
626
627 if (n < 2) {
628 log_error("--overlay= needs at least two colon-separated directories specified.");
629 return -EINVAL;
630 }
631
632 if (n == 2) {
633 /* If two parameters are specified,
634 * the first one is the lower, the
635 * second one the upper directory. And
af86c440
ZJS
636 * we'll also define the destination
637 * mount point the same as the upper. */
5a8af538
LP
638 upper = lower[1];
639 lower[1] = NULL;
640
641 destination = strdup(upper);
642 if (!destination)
643 return log_oom();
644
645 } else {
646 upper = lower[n - 2];
647 destination = lower[n - 1];
648 lower[n - 2] = NULL;
649 }
650
f757855e 651 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
652 if (!m)
653 return log_oom();
654
655 m->destination = destination;
656 m->source = upper;
657 m->lower = lower;
658 m->read_only = c == ARG_OVERLAY_RO;
659
660 upper = destination = NULL;
661 lower = NULL;
06c17c39 662
f757855e 663 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
664 break;
665 }
666
f4889f65
LP
667 case ARG_SETENV: {
668 char **n;
669
670 if (!env_assignment_is_valid(optarg)) {
671 log_error("Environment variable assignment '%s' is not valid.", optarg);
672 return -EINVAL;
673 }
674
675 n = strv_env_set(arg_setenv, optarg);
676 if (!n)
677 return log_oom();
678
679 strv_free(arg_setenv);
680 arg_setenv = n;
f757855e
LP
681
682 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
683 break;
684 }
685
284c0b91
LP
686 case 'q':
687 arg_quiet = true;
688 break;
689
8a96d94e
LP
690 case ARG_SHARE_SYSTEM:
691 arg_share_system = true;
692 break;
693
eb91eb18
LP
694 case ARG_REGISTER:
695 r = parse_boolean(optarg);
696 if (r < 0) {
697 log_error("Failed to parse --register= argument: %s", optarg);
698 return r;
699 }
700
701 arg_register = r;
702 break;
703
89f7c846
LP
704 case ARG_KEEP_UNIT:
705 arg_keep_unit = true;
706 break;
707
6afc95b7
LP
708 case ARG_PERSONALITY:
709
ac45f971 710 arg_personality = personality_from_string(optarg);
050f7277 711 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
712 log_error("Unknown or unsupported personality '%s'.", optarg);
713 return -EINVAL;
714 }
715
f757855e 716 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
717 break;
718
4d9f07b4
LP
719 case ARG_VOLATILE:
720
721 if (!optarg)
f757855e 722 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 723 else {
f757855e 724 VolatileMode m;
4d9f07b4 725
f757855e
LP
726 m = volatile_mode_from_string(optarg);
727 if (m < 0) {
728 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 729 return -EINVAL;
f757855e
LP
730 } else
731 arg_volatile_mode = m;
6d0b55c2
LP
732 }
733
f757855e
LP
734 arg_settings_mask |= SETTING_VOLATILE_MODE;
735 break;
6d0b55c2 736
f757855e
LP
737 case 'p':
738 r = expose_port_parse(&arg_expose_ports, optarg);
739 if (r == -EEXIST)
740 return log_error_errno(r, "Duplicate port specification: %s", optarg);
741 if (r < 0)
742 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 743
f757855e 744 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 745 break;
6d0b55c2 746
f36933fe
LP
747 case ARG_PROPERTY:
748 if (strv_extend(&arg_property, optarg) < 0)
749 return log_oom();
750
751 break;
752
6dac160c
LP
753 case ARG_PRIVATE_USERS:
754 if (optarg) {
755 _cleanup_free_ char *buffer = NULL;
756 const char *range, *shift;
757
758 range = strchr(optarg, ':');
759 if (range) {
760 buffer = strndup(optarg, range - optarg);
761 if (!buffer)
762 return log_oom();
763 shift = buffer;
764
765 range++;
766 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
767 log_error("Failed to parse UID range: %s", range);
768 return -EINVAL;
769 }
770 } else
771 shift = optarg;
772
773 if (parse_uid(shift, &arg_uid_shift) < 0) {
774 log_error("Failed to parse UID: %s", optarg);
775 return -EINVAL;
776 }
777 }
778
779 arg_userns = true;
780 break;
781
c6c8f6e2
LP
782 case ARG_KILL_SIGNAL:
783 arg_kill_signal = signal_from_string_try_harder(optarg);
784 if (arg_kill_signal < 0) {
785 log_error("Cannot parse signal: %s", optarg);
786 return -EINVAL;
787 }
788
f757855e
LP
789 arg_settings_mask |= SETTING_KILL_SIGNAL;
790 break;
791
792 case ARG_SETTINGS:
793
794 /* no → do not read files
795 * yes → read files, do not override cmdline, trust only subset
796 * override → read files, override cmdline, trust only subset
797 * trusted → read files, do not override cmdline, trust all
798 */
799
800 r = parse_boolean(optarg);
801 if (r < 0) {
802 if (streq(optarg, "trusted")) {
803 mask_all_settings = false;
804 mask_no_settings = false;
805 arg_settings_trusted = true;
806
807 } else if (streq(optarg, "override")) {
808 mask_all_settings = false;
809 mask_no_settings = true;
810 arg_settings_trusted = -1;
811 } else
812 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
813 } else if (r > 0) {
814 /* yes */
815 mask_all_settings = false;
816 mask_no_settings = false;
817 arg_settings_trusted = -1;
818 } else {
819 /* no */
820 mask_all_settings = true;
821 mask_no_settings = false;
822 arg_settings_trusted = false;
823 }
824
c6c8f6e2
LP
825 break;
826
88213476
LP
827 case '?':
828 return -EINVAL;
829
830 default:
eb9da376 831 assert_not_reached("Unhandled option");
88213476 832 }
88213476 833
eb91eb18
LP
834 if (arg_share_system)
835 arg_register = false;
836
837 if (arg_boot && arg_share_system) {
838 log_error("--boot and --share-system may not be combined.");
839 return -EINVAL;
840 }
841
89f7c846
LP
842 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
843 log_error("--keep-unit may not be used when invoked from a user session.");
844 return -EINVAL;
845 }
846
1b9e5b12
LP
847 if (arg_directory && arg_image) {
848 log_error("--directory= and --image= may not be combined.");
849 return -EINVAL;
850 }
851
ec16945e
LP
852 if (arg_template && arg_image) {
853 log_error("--template= and --image= may not be combined.");
854 return -EINVAL;
855 }
856
857 if (arg_template && !(arg_directory || arg_machine)) {
858 log_error("--template= needs --directory= or --machine=.");
859 return -EINVAL;
860 }
861
862 if (arg_ephemeral && arg_template) {
863 log_error("--ephemeral and --template= may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_ephemeral && arg_image) {
868 log_error("--ephemeral and --image= may not be combined.");
869 return -EINVAL;
870 }
871
df9a75e4
LP
872 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
873 log_error("--ephemeral and --link-journal= may not be combined.");
874 return -EINVAL;
875 }
876
f757855e
LP
877 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
878 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
879
880 if (argc > optind) {
881 arg_parameters = strv_copy(argv + optind);
882 if (!arg_parameters)
883 return log_oom();
884
885 arg_settings_mask |= SETTING_BOOT;
886 }
887
888 /* Load all settings from .nspawn files */
889 if (mask_no_settings)
890 arg_settings_mask = 0;
891
892 /* Don't load any settings from .nspawn files */
893 if (mask_all_settings)
894 arg_settings_mask = _SETTINGS_MASK_ALL;
895
896 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
897
898 r = detect_unified_cgroup_hierarchy();
899 if (r < 0)
900 return r;
901
902 return 1;
903}
904
905static int verify_arguments(void) {
906
907 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
908 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
909 return -EINVAL;
910 }
911
6d0b55c2
LP
912 if (arg_expose_ports && !arg_private_network) {
913 log_error("Cannot use --port= without private networking.");
914 return -EINVAL;
915 }
916
c6c8f6e2
LP
917 if (arg_boot && arg_kill_signal <= 0)
918 arg_kill_signal = SIGRTMIN+3;
919
f757855e 920 return 0;
88213476
LP
921}
922
03cfe0d5
LP
923static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
924 assert(p);
925
926 if (!arg_userns)
927 return 0;
928
929 if (uid == UID_INVALID && gid == GID_INVALID)
930 return 0;
931
932 if (uid != UID_INVALID) {
933 uid += arg_uid_shift;
934
935 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
936 return -EOVERFLOW;
937 }
938
939 if (gid != GID_INVALID) {
940 gid += (gid_t) arg_uid_shift;
941
942 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
943 return -EOVERFLOW;
944 }
945
946 if (lchown(p, uid, gid) < 0)
947 return -errno;
b12afc8c
LP
948
949 return 0;
950}
951
03cfe0d5
LP
/* Creates the directory path below root with the given mode and hands it to
 * userns_lchown() for the requested ownership.
 *
 * A directory that already exists is left as it is (including its
 * ownership) and counts as success. Returns 0 on success, -errno if mkdir()
 * fails for any other reason, or the error from userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
964
e58a1277 965static int setup_timezone(const char *dest) {
03cfe0d5
LP
966 _cleanup_free_ char *p = NULL, *q = NULL;
967 const char *where, *check, *what;
d4036145
LP
968 char *z, *y;
969 int r;
f8440af5 970
e58a1277
LP
971 assert(dest);
972
973 /* Fix the timezone, if possible */
d4036145
LP
974 r = readlink_malloc("/etc/localtime", &p);
975 if (r < 0) {
976 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
977 return 0;
978 }
979
980 z = path_startswith(p, "../usr/share/zoneinfo/");
981 if (!z)
982 z = path_startswith(p, "/usr/share/zoneinfo/");
983 if (!z) {
984 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
985 return 0;
986 }
987
03cfe0d5 988 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
989 r = readlink_malloc(where, &q);
990 if (r >= 0) {
991 y = path_startswith(q, "../usr/share/zoneinfo/");
992 if (!y)
993 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 994
d4036145
LP
995 /* Already pointing to the right place? Then do nothing .. */
996 if (y && streq(y, z))
997 return 0;
998 }
999
03cfe0d5
LP
1000 check = strjoina("/usr/share/zoneinfo/", z);
1001 check = prefix_root(dest, check);
1002 if (laccess(check, F_OK) < 0) {
d4036145
LP
1003 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1004 return 0;
1005 }
68fb0892 1006
79d80fc1
TG
1007 r = unlink(where);
1008 if (r < 0 && errno != ENOENT) {
56f64d95 1009 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1010 return 0;
1011 }
4d9f07b4 1012
03cfe0d5 1013 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1014 if (symlink(what, where) < 0) {
56f64d95 1015 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1016 return 0;
1017 }
e58a1277 1018
03cfe0d5
LP
1019 r = userns_lchown(where, 0, 0);
1020 if (r < 0)
1021 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1022
e58a1277 1023 return 0;
88213476
LP
1024}
1025
2547bb41 1026static int setup_resolv_conf(const char *dest) {
03cfe0d5 1027 const char *where = NULL;
79d80fc1 1028 int r;
2547bb41
LP
1029
1030 assert(dest);
1031
1032 if (arg_private_network)
1033 return 0;
1034
1035 /* Fix resolv.conf, if possible */
03cfe0d5 1036 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1037
f2068bcc 1038 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1039 if (r < 0) {
68a313c5
LP
1040 /* If the file already exists as symlink, let's
1041 * suppress the warning, under the assumption that
1042 * resolved or something similar runs inside and the
1043 * symlink points there.
1044 *
1045 * If the disk image is read-only, there's also no
1046 * point in complaining.
1047 */
1048 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1049 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1050 return 0;
1051 }
2547bb41 1052
03cfe0d5
LP
1053 r = userns_lchown(where, 0, 0);
1054 if (r < 0)
1055 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1056
2547bb41
LP
1057 return 0;
1058}
1059
9f24adc2 1060static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1061 assert(s);
9f24adc2
LP
1062
1063 snprintf(s, 37,
1064 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1065 SD_ID128_FORMAT_VAL(id));
1066
1067 return s;
1068}
1069
04bc4a3f 1070static int setup_boot_id(const char *dest) {
03cfe0d5 1071 const char *from, *to;
39883f62 1072 sd_id128_t rnd = {};
04bc4a3f
LP
1073 char as_uuid[37];
1074 int r;
1075
eb91eb18
LP
1076 if (arg_share_system)
1077 return 0;
1078
04bc4a3f
LP
1079 /* Generate a new randomized boot ID, so that each boot-up of
1080 * the container gets a new one */
1081
03cfe0d5
LP
1082 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1083 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1084
1085 r = sd_id128_randomize(&rnd);
f647962d
MS
1086 if (r < 0)
1087 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1088
9f24adc2 1089 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1090
4c1fc3e4 1091 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1092 if (r < 0)
1093 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1094
03cfe0d5
LP
1095 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1096 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1097 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1098 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1099
1100 unlink(from);
04bc4a3f
LP
1101 return r;
1102}
1103
e58a1277 1104static int copy_devnodes(const char *dest) {
88213476
LP
1105
1106 static const char devnodes[] =
1107 "null\0"
1108 "zero\0"
1109 "full\0"
1110 "random\0"
1111 "urandom\0"
85614d66
TG
1112 "tty\0"
1113 "net/tun\0";
88213476
LP
1114
1115 const char *d;
e58a1277 1116 int r = 0;
7fd1b19b 1117 _cleanup_umask_ mode_t u;
a258bf26
LP
1118
1119 assert(dest);
124640f1
LP
1120
1121 u = umask(0000);
88213476 1122
03cfe0d5
LP
1123 /* Create /dev/net, so that we can create /dev/net/tun in it */
1124 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1125 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1126
88213476 1127 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1128 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1129 struct stat st;
88213476 1130
7f112f50 1131 from = strappend("/dev/", d);
03cfe0d5 1132 to = prefix_root(dest, from);
88213476
LP
1133
1134 if (stat(from, &st) < 0) {
1135
4a62c710
MS
1136 if (errno != ENOENT)
1137 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1138
a258bf26 1139 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1140
03cfe0d5 1141 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1142 return -EIO;
a258bf26 1143
85614d66 1144 } else {
81f5049b
AC
1145 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1146 if (errno != EPERM)
1147 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1148
1149 /* Some systems abusively restrict mknod but
1150 * allow bind mounts. */
1151 r = touch(to);
1152 if (r < 0)
1153 return log_error_errno(r, "touch (%s) failed: %m", to);
1154 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1155 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1156 }
6278cf60 1157
03cfe0d5
LP
1158 r = userns_lchown(to, 0, 0);
1159 if (r < 0)
1160 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1161 }
88213476
LP
1162 }
1163
e58a1277
LP
1164 return r;
1165}
88213476 1166
03cfe0d5
LP
1167static int setup_pts(const char *dest) {
1168 _cleanup_free_ char *options = NULL;
1169 const char *p;
1170
1171#ifdef HAVE_SELINUX
1172 if (arg_selinux_apifs_context)
1173 (void) asprintf(&options,
3dce8915 1174 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1175 arg_uid_shift + TTY_GID,
1176 arg_selinux_apifs_context);
1177 else
1178#endif
1179 (void) asprintf(&options,
3dce8915 1180 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1181 arg_uid_shift + TTY_GID);
f2d88580 1182
03cfe0d5 1183 if (!options)
f2d88580
LP
1184 return log_oom();
1185
03cfe0d5 1186 /* Mount /dev/pts itself */
cc9fce65 1187 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1188 if (mkdir(p, 0755) < 0)
1189 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1190 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1191 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1192 if (userns_lchown(p, 0, 0) < 0)
1193 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1194
1195 /* Create /dev/ptmx symlink */
1196 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1197 if (symlink("pts/ptmx", p) < 0)
1198 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1199 if (userns_lchown(p, 0, 0) < 0)
1200 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1201
03cfe0d5
LP
1202 /* And fix /dev/pts/ptmx ownership */
1203 p = prefix_roota(dest, "/dev/pts/ptmx");
1204 if (userns_lchown(p, 0, 0) < 0)
1205 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1206
f2d88580
LP
1207 return 0;
1208}
1209
e58a1277 1210static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1211 _cleanup_umask_ mode_t u;
1212 const char *to;
e58a1277 1213 int r;
e58a1277
LP
1214
1215 assert(dest);
1216 assert(console);
1217
1218 u = umask(0000);
1219
03cfe0d5 1220 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1221 if (r < 0)
1222 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1223
a258bf26
LP
1224 /* We need to bind mount the right tty to /dev/console since
1225 * ptys can only exist on pts file systems. To have something
81f5049b 1226 * to bind mount things on we create a empty regular file. */
a258bf26 1227
03cfe0d5 1228 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1229 r = touch(to);
1230 if (r < 0)
1231 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1232
4543768d 1233 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1234 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1235
25ea79fe 1236 return 0;
e58a1277
LP
1237}
1238
1239static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1240 const char *from, *to;
7fd1b19b 1241 _cleanup_umask_ mode_t u;
d9603714 1242 int fd, r;
e58a1277 1243
e58a1277 1244 assert(kmsg_socket >= 0);
a258bf26 1245
e58a1277 1246 u = umask(0000);
a258bf26 1247
03cfe0d5 1248 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1249 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1250 * on the reading side behave very similar to /proc/kmsg,
1251 * their writing side behaves differently from /dev/kmsg in
1252 * that writing blocks when nothing is reading. In order to
1253 * avoid any problems with containers deadlocking due to this
1254 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1255 from = prefix_roota(dest, "/run/kmsg");
1256 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1257
4a62c710 1258 if (mkfifo(from, 0600) < 0)
03cfe0d5 1259 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1260 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1261 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1262
1263 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1264 if (fd < 0)
1265 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1266
e58a1277
LP
1267 /* Store away the fd in the socket, so that it stays open as
1268 * long as we run the child */
3ee897d6 1269 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1270 safe_close(fd);
e58a1277 1271
d9603714
DH
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1274
03cfe0d5
LP
1275 /* And now make the FIFO unavailable as /run/kmsg... */
1276 (void) unlink(from);
1277
25ea79fe 1278 return 0;
88213476
LP
1279}
1280
1c4baffc 1281static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1282 union in_addr_union *exposed = userdata;
1283
1284 assert(rtnl);
1285 assert(m);
1286 assert(exposed);
1287
7a8f6325 1288 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1289 return 0;
1290}
1291
3a74cea5 1292static int setup_hostname(void) {
3a74cea5 1293
eb91eb18
LP
1294 if (arg_share_system)
1295 return 0;
1296
605f81a8 1297 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1298 return -errno;
3a74cea5 1299
7027ff61 1300 return 0;
3a74cea5
LP
1301}
1302
57fb9fb5 1303static int setup_journal(const char *directory) {
4d680aee 1304 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1305 _cleanup_free_ char *b = NULL, *d = NULL;
1306 const char *etc_machine_id, *p, *q;
27407a01 1307 char *id;
57fb9fb5
LP
1308 int r;
1309
df9a75e4
LP
1310 /* Don't link journals in ephemeral mode */
1311 if (arg_ephemeral)
1312 return 0;
1313
03cfe0d5 1314 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1315
03cfe0d5 1316 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1317 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1318 return 0;
f647962d 1319 else if (r < 0)
03cfe0d5 1320 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1321
27407a01
ZJS
1322 id = strstrip(b);
1323 if (isempty(id) && arg_link_journal == LINK_AUTO)
1324 return 0;
57fb9fb5 1325
27407a01
ZJS
1326 /* Verify validity */
1327 r = sd_id128_from_string(id, &machine_id);
f647962d 1328 if (r < 0)
03cfe0d5 1329 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1330
4d680aee 1331 r = sd_id128_get_machine(&this_id);
f647962d
MS
1332 if (r < 0)
1333 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1334
1335 if (sd_id128_equal(machine_id, this_id)) {
1336 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1337 "Host and machine ids are equal (%s): refusing to link journals", id);
1338 if (arg_link_journal == LINK_AUTO)
1339 return 0;
df9a75e4 1340 return -EEXIST;
4d680aee
ZJS
1341 }
1342
1343 if (arg_link_journal == LINK_NO)
1344 return 0;
1345
03cfe0d5
LP
1346 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1347 if (r < 0)
1348 return log_error_errno(r, "Failed to create /var: %m");
1349
1350 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1351 if (r < 0)
1352 return log_error_errno(r, "Failed to create /var/log: %m");
1353
1354 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1355 if (r < 0)
1356 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1357
1358 p = strjoina("/var/log/journal/", id);
1359 q = prefix_roota(directory, p);
27407a01 1360
e26d6ce5 1361 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1362 if (arg_link_journal != LINK_AUTO) {
1363 log_error("%s: already a mount point, refusing to use for journal", p);
1364 return -EEXIST;
1365 }
1366
1367 return 0;
57fb9fb5
LP
1368 }
1369
e26d6ce5 1370 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1371 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1372 log_error("%s: already a mount point, refusing to use for journal", q);
1373 return -EEXIST;
57fb9fb5
LP
1374 }
1375
27407a01 1376 return 0;
57fb9fb5
LP
1377 }
1378
1379 r = readlink_and_make_absolute(p, &d);
1380 if (r >= 0) {
1381 if ((arg_link_journal == LINK_GUEST ||
1382 arg_link_journal == LINK_AUTO) &&
1383 path_equal(d, q)) {
1384
03cfe0d5 1385 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1386 if (r < 0)
56f64d95 1387 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1388 return 0;
57fb9fb5
LP
1389 }
1390
4a62c710
MS
1391 if (unlink(p) < 0)
1392 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1393 } else if (r == -EINVAL) {
1394
1395 if (arg_link_journal == LINK_GUEST &&
1396 rmdir(p) < 0) {
1397
27407a01
ZJS
1398 if (errno == ENOTDIR) {
1399 log_error("%s already exists and is neither a symlink nor a directory", p);
1400 return r;
1401 } else {
56f64d95 1402 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1403 return -errno;
57fb9fb5 1404 }
57fb9fb5
LP
1405 }
1406 } else if (r != -ENOENT) {
56f64d95 1407 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1408 return r;
57fb9fb5
LP
1409 }
1410
1411 if (arg_link_journal == LINK_GUEST) {
1412
1413 if (symlink(q, p) < 0) {
574edc90 1414 if (arg_link_journal_try) {
56f64d95 1415 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1416 return 0;
1417 } else {
56f64d95 1418 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1419 return -errno;
1420 }
57fb9fb5
LP
1421 }
1422
03cfe0d5 1423 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1424 if (r < 0)
56f64d95 1425 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1426 return 0;
57fb9fb5
LP
1427 }
1428
1429 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1430 /* don't create parents here -- if the host doesn't have
1431 * permanent journal set up, don't force it here */
1432 r = mkdir(p, 0755);
57fb9fb5 1433 if (r < 0) {
574edc90 1434 if (arg_link_journal_try) {
56f64d95 1435 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1436 return 0;
1437 } else {
56f64d95 1438 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1439 return r;
1440 }
57fb9fb5
LP
1441 }
1442
27407a01
ZJS
1443 } else if (access(p, F_OK) < 0)
1444 return 0;
57fb9fb5 1445
cdb2b9d0
LP
1446 if (dir_is_empty(q) == 0)
1447 log_warning("%s is not empty, proceeding anyway.", q);
1448
03cfe0d5 1449 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 1450 if (r < 0) {
56f64d95 1451 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1452 return r;
57fb9fb5
LP
1453 }
1454
4543768d 1455 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1456 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1457
27407a01 1458 return 0;
57fb9fb5
LP
1459}
1460
88213476 1461static int drop_capabilities(void) {
5076f0cc 1462 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1463}
1464
db999e0f
LP
1465static int reset_audit_loginuid(void) {
1466 _cleanup_free_ char *p = NULL;
1467 int r;
1468
1469 if (arg_share_system)
1470 return 0;
1471
1472 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1473 if (r == -ENOENT)
db999e0f 1474 return 0;
f647962d
MS
1475 if (r < 0)
1476 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1477
1478 /* Already reset? */
1479 if (streq(p, "4294967295"))
1480 return 0;
1481
ad118bda 1482 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1483 if (r < 0) {
10a87006
LP
1484 log_error_errno(r,
1485 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1486 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1487 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1488 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1489 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1490
db999e0f 1491 sleep(5);
77b6e194 1492 }
db999e0f
LP
1493
1494 return 0;
77b6e194
LP
1495}
1496
/* Installs a seccomp filter that allows everything by default, except:
 *
 *  - a fixed list of syscalls, each of which is made to fail with EPERM
 *    unless the capability associated with it is part of arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 *    EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op. Returns 0 on success (also if loading
 * the filter fails with -EINVAL), a negative errno otherwise. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated); k++) {
                const uint64_t bit = 1ULL << gated[k].capability;

                /* The capability is retained, hence leave the syscall alone */
                if ((arg_retain & bit) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated[k].syscall_num, 0);
                if (r >= 0 || r == -EFAULT) /* -EFAULT: unknown syscall */
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside of one. We don't care and simply turn
         * off creation of audit sockets: socket(AF_NETLINK, *, NETLINK_AUDIT)
         * is made to fail with EAFNOSUPPORT, which audit userspace takes as
         * indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#endif

}
1591
785890ac
LP
1592static int setup_propagate(const char *root) {
1593 const char *p, *q;
1594
1595 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1596 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1597 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1598 (void) mkdir_p(p, 0600);
1599
03cfe0d5
LP
1600 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1601 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1602
1603 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1604 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1605
1606 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1607 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1608
03cfe0d5 1609 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1610 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1611 return log_error_errno(errno, "Failed to install propagation bind mount.");
1612
1613 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1614 return log_error_errno(errno, "Failed to make propagation mount read-only");
1615
1616 return 0;
1617}
1618
1b9e5b12
LP
1619static int setup_image(char **device_path, int *loop_nr) {
1620 struct loop_info64 info = {
1621 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1622 };
1623 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1624 _cleanup_free_ char* loopdev = NULL;
1625 struct stat st;
1626 int r, nr;
1627
1628 assert(device_path);
1629 assert(loop_nr);
ec16945e 1630 assert(arg_image);
1b9e5b12
LP
1631
1632 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1633 if (fd < 0)
1634 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1635
4a62c710
MS
1636 if (fstat(fd, &st) < 0)
1637 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1638
1639 if (S_ISBLK(st.st_mode)) {
1640 char *p;
1641
1642 p = strdup(arg_image);
1643 if (!p)
1644 return log_oom();
1645
1646 *device_path = p;
1647
1648 *loop_nr = -1;
1649
1650 r = fd;
1651 fd = -1;
1652
1653 return r;
1654 }
1655
1656 if (!S_ISREG(st.st_mode)) {
56f64d95 1657 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
1658 return -EINVAL;
1659 }
1660
1661 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1662 if (control < 0)
1663 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1664
1665 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1666 if (nr < 0)
1667 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1668
1669 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1670 return log_oom();
1671
1672 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1673 if (loop < 0)
1674 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1675
4a62c710
MS
1676 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1677 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1678
1679 if (arg_read_only)
1680 info.lo_flags |= LO_FLAGS_READ_ONLY;
1681
4a62c710
MS
1682 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1683 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1684
1685 *device_path = loopdev;
1686 loopdev = NULL;
1687
1688 *loop_nr = nr;
1689
1690 r = loop;
1691 loop = -1;
1692
1693 return r;
1694}
1695
ada4799a
LP
/* Hint appended to the partition table error messages, spelling out which
 * disk image layouts are supported. */
#define PARTITION_TABLE_BLURB                                                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                     \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                         \
        "to be bootable with systemd-nspawn."
1702
1b9e5b12
LP
1703static int dissect_image(
1704 int fd,
727fd4fd
LP
1705 char **root_device, bool *root_device_rw,
1706 char **home_device, bool *home_device_rw,
1707 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1708 bool *secondary) {
1709
1710#ifdef HAVE_BLKID
01dc33ce
ZJS
1711 int home_nr = -1, srv_nr = -1;
1712#ifdef GPT_ROOT_NATIVE
1713 int root_nr = -1;
1714#endif
1715#ifdef GPT_ROOT_SECONDARY
1716 int secondary_root_nr = -1;
1717#endif
f6c51a81 1718 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1719 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1720 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1721 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1722 _cleanup_udev_unref_ struct udev *udev = NULL;
1723 struct udev_list_entry *first, *item;
f6c51a81 1724 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1725 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1726 const char *pttype = NULL;
1727 blkid_partlist pl;
1728 struct stat st;
c09ef2e4 1729 unsigned i;
1b9e5b12
LP
1730 int r;
1731
1732 assert(fd >= 0);
1733 assert(root_device);
1734 assert(home_device);
1735 assert(srv_device);
1736 assert(secondary);
ec16945e 1737 assert(arg_image);
1b9e5b12
LP
1738
1739 b = blkid_new_probe();
1740 if (!b)
1741 return log_oom();
1742
1743 errno = 0;
1744 r = blkid_probe_set_device(b, fd, 0, 0);
1745 if (r != 0) {
1746 if (errno == 0)
1747 return log_oom();
1748
56f64d95 1749 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1750 return -errno;
1751 }
1752
1753 blkid_probe_enable_partitions(b, 1);
1754 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1755
1756 errno = 0;
1757 r = blkid_do_safeprobe(b);
1758 if (r == -2 || r == 1) {
ada4799a
LP
1759 log_error("Failed to identify any partition table on\n"
1760 " %s\n"
1761 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1762 return -EINVAL;
1763 } else if (r != 0) {
1764 if (errno == 0)
1765 errno = EIO;
56f64d95 1766 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1767 return -errno;
1768 }
1769
48861960 1770 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1771
1772 is_gpt = streq_ptr(pttype, "gpt");
1773 is_mbr = streq_ptr(pttype, "dos");
1774
1775 if (!is_gpt && !is_mbr) {
1776 log_error("No GPT or MBR partition table discovered on\n"
1777 " %s\n"
1778 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1779 return -EINVAL;
1780 }
1781
1782 errno = 0;
1783 pl = blkid_probe_get_partitions(b);
1784 if (!pl) {
1785 if (errno == 0)
1786 return log_oom();
1787
1788 log_error("Failed to list partitions of %s", arg_image);
1789 return -errno;
1790 }
1791
1792 udev = udev_new();
1793 if (!udev)
1794 return log_oom();
1795
4a62c710
MS
1796 if (fstat(fd, &st) < 0)
1797 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1798
c09ef2e4
LP
1799 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1800 if (!d)
1b9e5b12
LP
1801 return log_oom();
1802
c09ef2e4
LP
1803 for (i = 0;; i++) {
1804 int n, m;
1b9e5b12 1805
c09ef2e4
LP
1806 if (i >= 10) {
1807 log_error("Kernel partitions never appeared.");
1808 return -ENXIO;
1809 }
1810
1811 e = udev_enumerate_new(udev);
1812 if (!e)
1813 return log_oom();
1814
1815 r = udev_enumerate_add_match_parent(e, d);
1816 if (r < 0)
1817 return log_oom();
1818
1819 r = udev_enumerate_scan_devices(e);
1820 if (r < 0)
1821 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1822
1823 /* Count the partitions enumerated by the kernel */
1824 n = 0;
1825 first = udev_enumerate_get_list_entry(e);
1826 udev_list_entry_foreach(item, first)
1827 n++;
1828
1829 /* Count the partitions enumerated by blkid */
1830 m = blkid_partlist_numof_partitions(pl);
1831 if (n == m + 1)
1832 break;
1833 if (n > m + 1) {
1834 log_error("blkid and kernel partition list do not match.");
1835 return -EIO;
1836 }
1837 if (n < m + 1) {
1838 unsigned j;
1839
1840 /* The kernel has probed fewer partitions than
1841 * blkid? Maybe the kernel prober is still
1842 * running or it got EBUSY because udev
1843 * already opened the device. Let's reprobe
1844 * the device, which is a synchronous call
1845 * that waits until probing is complete. */
1846
1847 for (j = 0; j < 20; j++) {
1848
1849 r = ioctl(fd, BLKRRPART, 0);
1850 if (r < 0)
1851 r = -errno;
1852 if (r >= 0 || r != -EBUSY)
1853 break;
1854
1855 /* If something else has the device
1856 * open, such as an udev rule, the
1857 * ioctl will return EBUSY. Since
1858 * there's no way to wait until it
1859 * isn't busy anymore, let's just wait
1860 * a bit, and try again.
1861 *
1862 * This is really something they
1863 * should fix in the kernel! */
1864
1865 usleep(50 * USEC_PER_MSEC);
1866 }
1867
1868 if (r < 0)
1869 return log_error_errno(r, "Failed to reread partition table: %m");
1870 }
1871
1872 e = udev_enumerate_unref(e);
1873 }
1b9e5b12
LP
1874
1875 first = udev_enumerate_get_list_entry(e);
1876 udev_list_entry_foreach(item, first) {
1877 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1878 const char *node;
727fd4fd 1879 unsigned long long flags;
1b9e5b12
LP
1880 blkid_partition pp;
1881 dev_t qn;
1882 int nr;
1883
1884 errno = 0;
1885 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1886 if (!q) {
1887 if (!errno)
1888 errno = ENOMEM;
1889
56f64d95 1890 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1891 return -errno;
1892 }
1893
1894 qn = udev_device_get_devnum(q);
1895 if (major(qn) == 0)
1896 continue;
1897
1898 if (st.st_rdev == qn)
1899 continue;
1900
1901 node = udev_device_get_devnode(q);
1902 if (!node)
1903 continue;
1904
1905 pp = blkid_partlist_devno_to_partition(pl, qn);
1906 if (!pp)
1907 continue;
1908
727fd4fd 1909 flags = blkid_partition_get_flags(pp);
727fd4fd 1910
1b9e5b12
LP
1911 nr = blkid_partition_get_partno(pp);
1912 if (nr < 0)
1913 continue;
1914
ada4799a
LP
1915 if (is_gpt) {
1916 sd_id128_t type_id;
1917 const char *stype;
1b9e5b12 1918
f6c51a81
LP
1919 if (flags & GPT_FLAG_NO_AUTO)
1920 continue;
1921
ada4799a
LP
1922 stype = blkid_partition_get_type_string(pp);
1923 if (!stype)
1924 continue;
1b9e5b12 1925
ada4799a 1926 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1927 continue;
1928
ada4799a 1929 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1930
ada4799a
LP
1931 if (home && nr >= home_nr)
1932 continue;
1b9e5b12 1933
ada4799a
LP
1934 home_nr = nr;
1935 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1936
ada4799a
LP
1937 r = free_and_strdup(&home, node);
1938 if (r < 0)
1939 return log_oom();
727fd4fd 1940
ada4799a
LP
1941 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1942
1943 if (srv && nr >= srv_nr)
1944 continue;
1945
1946 srv_nr = nr;
1947 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1948
1949 r = free_and_strdup(&srv, node);
1950 if (r < 0)
1951 return log_oom();
1952 }
1b9e5b12 1953#ifdef GPT_ROOT_NATIVE
ada4799a 1954 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1955
ada4799a
LP
1956 if (root && nr >= root_nr)
1957 continue;
1b9e5b12 1958
ada4799a
LP
1959 root_nr = nr;
1960 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1961
ada4799a
LP
1962 r = free_and_strdup(&root, node);
1963 if (r < 0)
1964 return log_oom();
1965 }
1b9e5b12
LP
1966#endif
1967#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1968 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1969
1970 if (secondary_root && nr >= secondary_root_nr)
1971 continue;
1972
1973 secondary_root_nr = nr;
1974 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1975
1976 r = free_and_strdup(&secondary_root, node);
1977 if (r < 0)
1978 return log_oom();
1979 }
1980#endif
f6c51a81
LP
1981 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
1982
1983 if (generic)
1984 multiple_generic = true;
1985 else {
1986 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
1987
1988 r = free_and_strdup(&generic, node);
1989 if (r < 0)
1990 return log_oom();
1991 }
1992 }
ada4799a
LP
1993
1994 } else if (is_mbr) {
1995 int type;
1b9e5b12 1996
f6c51a81
LP
1997 if (flags != 0x80) /* Bootable flag */
1998 continue;
1999
ada4799a
LP
2000 type = blkid_partition_get_type(pp);
2001 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2002 continue;
2003
f6c51a81
LP
2004 if (generic)
2005 multiple_generic = true;
2006 else {
2007 generic_rw = true;
727fd4fd 2008
f6c51a81
LP
2009 r = free_and_strdup(&root, node);
2010 if (r < 0)
2011 return log_oom();
2012 }
1b9e5b12 2013 }
1b9e5b12
LP
2014 }
2015
1b9e5b12
LP
2016 if (root) {
2017 *root_device = root;
2018 root = NULL;
727fd4fd
LP
2019
2020 *root_device_rw = root_rw;
1b9e5b12
LP
2021 *secondary = false;
2022 } else if (secondary_root) {
2023 *root_device = secondary_root;
2024 secondary_root = NULL;
727fd4fd
LP
2025
2026 *root_device_rw = secondary_root_rw;
1b9e5b12 2027 *secondary = true;
f6c51a81
LP
2028 } else if (generic) {
2029
2030 /* There were no partitions with precise meanings
2031 * around, but we found generic partitions. In this
2032 * case, if there's only one, we can go ahead and boot
2033 * it, otherwise we bail out, because we really cannot
2034 * make any sense of it. */
2035
2036 if (multiple_generic) {
2037 log_error("Identified multiple bootable Linux partitions on\n"
2038 " %s\n"
2039 PARTITION_TABLE_BLURB, arg_image);
2040 return -EINVAL;
2041 }
2042
2043 *root_device = generic;
2044 generic = NULL;
2045
2046 *root_device_rw = generic_rw;
2047 *secondary = false;
2048 } else {
2049 log_error("Failed to identify root partition in disk image\n"
2050 " %s\n"
2051 PARTITION_TABLE_BLURB, arg_image);
2052 return -EINVAL;
1b9e5b12
LP
2053 }
2054
2055 if (home) {
2056 *home_device = home;
2057 home = NULL;
727fd4fd
LP
2058
2059 *home_device_rw = home_rw;
1b9e5b12
LP
2060 }
2061
2062 if (srv) {
2063 *srv_device = srv;
2064 srv = NULL;
727fd4fd
LP
2065
2066 *srv_device_rw = srv_rw;
1b9e5b12
LP
2067 }
2068
2069 return 0;
2070#else
2071 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2072 return -EOPNOTSUPP;
1b9e5b12
LP
2073#endif
2074}
2075
/* Probes the file system type of the block device "what" with libblkid and
 * mounts it with MS_NODEV on "where" (with "directory" appended to it when
 * "directory" is non-NULL). The mount is made read-only when "rw" is false
 * or when arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure; every
 * failure path logs a message. Without HAVE_BLKID the function always fails
 * with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* No errno set: treat the failure as an allocation problem */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* libblkid: 1 means nothing was detected, -2 means the
                 * result is ambiguous. Neither is an I/O error, we simply
                 * cannot tell what is on the device. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Everything else (notably -1) is a genuine probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the error code before logging, so that logging cannot alter it */
                r = errno != 0 ? -errno : -EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2139
727fd4fd
LP
2140static int mount_devices(
2141 const char *where,
2142 const char *root_device, bool root_device_rw,
2143 const char *home_device, bool home_device_rw,
2144 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2145 int r;
2146
2147 assert(where);
2148
2149 if (root_device) {
727fd4fd 2150 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2151 if (r < 0)
2152 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2153 }
2154
2155 if (home_device) {
727fd4fd 2156 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2157 if (r < 0)
2158 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2159 }
2160
2161 if (srv_device) {
727fd4fd 2162 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2163 if (r < 0)
2164 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2165 }
2166
2167 return 0;
2168}
2169
2170static void loop_remove(int nr, int *image_fd) {
2171 _cleanup_close_ int control = -1;
e8c8ddcc 2172 int r;
1b9e5b12
LP
2173
2174 if (nr < 0)
2175 return;
2176
2177 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2178 r = ioctl(*image_fd, LOOP_CLR_FD);
2179 if (r < 0)
5e4074aa 2180 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2181 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2182 }
2183
2184 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2185 if (control < 0) {
56f64d95 2186 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2187 return;
e8c8ddcc 2188 }
1b9e5b12 2189
e8c8ddcc
TG
2190 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2191 if (r < 0)
5e4074aa 2192 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2193}
2194
113cea80 2195/*
6d416b9c
LS
2196 * Return values:
2197 * < 0 : wait_for_terminate() failed to get the state of the
2198 * container, the container was terminated by a signal, or
2199 * failed for an unknown reason. No change is made to the
2200 * container argument.
2201 * > 0 : The program executed in the container terminated with an
2202 * error. The exit code of the program executed in the
919699ec
LP
2203 * container is returned. The container argument has been set
2204 * to CONTAINER_TERMINATED.
6d416b9c
LS
2205 * 0 : The container is being rebooted, has been shut down or exited
2206 * successfully. The container argument has been set to either
2207 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2208 *
6d416b9c
LS
2209 * That is, success is indicated by a return value of zero, and an
2210 * error is indicated by a non-zero value.
113cea80
DH
2211 */
2212static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2213 siginfo_t status;
919699ec 2214 int r;
113cea80
DH
2215
2216 r = wait_for_terminate(pid, &status);
f647962d
MS
2217 if (r < 0)
2218 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2219
2220 switch (status.si_code) {
fddbb89c 2221
113cea80 2222 case CLD_EXITED:
919699ec
LP
2223 if (status.si_status == 0) {
2224 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2225
fddbb89c 2226 } else
919699ec 2227 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2228
919699ec
LP
2229 *container = CONTAINER_TERMINATED;
2230 return status.si_status;
113cea80
DH
2231
2232 case CLD_KILLED:
2233 if (status.si_status == SIGINT) {
113cea80 2234
919699ec 2235 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2236 *container = CONTAINER_TERMINATED;
919699ec
LP
2237 return 0;
2238
113cea80 2239 } else if (status.si_status == SIGHUP) {
113cea80 2240
919699ec 2241 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2242 *container = CONTAINER_REBOOTED;
919699ec 2243 return 0;
113cea80 2244 }
919699ec 2245
113cea80
DH
2246 /* CLD_KILLED fallthrough */
2247
2248 case CLD_DUMPED:
fddbb89c 2249 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2250 return -EIO;
113cea80
DH
2251
2252 default:
fddbb89c 2253 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2254 return -EIO;
113cea80
DH
2255 }
2256
2257 return r;
2258}
2259
023fb90b
LP
2260static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2261 pid_t pid;
2262
2263 pid = PTR_TO_UINT32(userdata);
2264 if (pid > 0) {
c6c8f6e2 2265 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2266 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2267 sd_event_source_set_userdata(s, NULL);
2268 return 0;
2269 }
2270 }
2271
2272 sd_event_exit(sd_event_source_get_event(s), 0);
2273 return 0;
2274}
2275
ec16945e 2276static int determine_names(void) {
1b9cebf6 2277 int r;
ec16945e 2278
c1521918
LP
2279 if (arg_template && !arg_directory && arg_machine) {
2280
2281 /* If --template= was specified then we should not
2282 * search for a machine, but instead create a new one
2283 * in /var/lib/machine. */
2284
2285 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2286 if (!arg_directory)
2287 return log_oom();
2288 }
2289
ec16945e 2290 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2291 if (arg_machine) {
2292 _cleanup_(image_unrefp) Image *i = NULL;
2293
2294 r = image_find(arg_machine, &i);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2297 else if (r == 0) {
2298 log_error("No image for machine '%s': %m", arg_machine);
2299 return -ENOENT;
2300 }
2301
aceac2f0 2302 if (i->type == IMAGE_RAW)
0f03c2a4 2303 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2304 else
0f03c2a4 2305 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2306 if (r < 0)
2307 return log_error_errno(r, "Invalid image directory: %m");
2308
aee327b8
LP
2309 if (!arg_ephemeral)
2310 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2311 } else
ec16945e
LP
2312 arg_directory = get_current_dir_name();
2313
1b9cebf6
LP
2314 if (!arg_directory && !arg_machine) {
2315 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2316 return -EINVAL;
2317 }
2318 }
2319
2320 if (!arg_machine) {
b9ba4dab
LP
2321 if (arg_directory && path_equal(arg_directory, "/"))
2322 arg_machine = gethostname_malloc();
2323 else
2324 arg_machine = strdup(basename(arg_image ?: arg_directory));
2325
ec16945e
LP
2326 if (!arg_machine)
2327 return log_oom();
2328
ae691c1d 2329 hostname_cleanup(arg_machine);
ec16945e
LP
2330 if (!machine_name_is_valid(arg_machine)) {
2331 log_error("Failed to determine machine name automatically, please use -M.");
2332 return -EINVAL;
2333 }
b9ba4dab
LP
2334
2335 if (arg_ephemeral) {
2336 char *b;
2337
2338 /* Add a random suffix when this is an
2339 * ephemeral machine, so that we can run many
2340 * instances at once without manually having
2341 * to specify -M each time. */
2342
2343 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2344 return log_oom();
2345
2346 free(arg_machine);
2347 arg_machine = b;
2348 }
ec16945e
LP
2349 }
2350
2351 return 0;
2352}
2353
03cfe0d5 2354static int determine_uid_shift(const char *directory) {
6dac160c
LP
2355 int r;
2356
03cfe0d5
LP
2357 if (!arg_userns) {
2358 arg_uid_shift = 0;
6dac160c 2359 return 0;
03cfe0d5 2360 }
6dac160c
LP
2361
2362 if (arg_uid_shift == UID_INVALID) {
2363 struct stat st;
2364
03cfe0d5 2365 r = stat(directory, &st);
6dac160c 2366 if (r < 0)
03cfe0d5 2367 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2368
2369 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2370
2371 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2372 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2373 return -EINVAL;
2374 }
2375
2376 arg_uid_range = UINT32_C(0x10000);
2377 }
2378
2379 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2380 log_error("UID base too high for UID range.");
2381 return -EINVAL;
2382 }
2383
2384 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2385 return 0;
2386}
2387
03cfe0d5
LP
2388static int inner_child(
2389 Barrier *barrier,
2390 const char *directory,
2391 bool secondary,
2392 int kmsg_socket,
2393 int rtnl_socket,
f757855e 2394 FDSet *fds) {
69c79d3c 2395
03cfe0d5
LP
2396 _cleanup_free_ char *home = NULL;
2397 unsigned n_env = 2;
2398 const char *envp[] = {
2399 "PATH=" DEFAULT_PATH_SPLIT_USR,
2400 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2401 NULL, /* TERM */
2402 NULL, /* HOME */
2403 NULL, /* USER */
2404 NULL, /* LOGNAME */
2405 NULL, /* container_uuid */
2406 NULL, /* LISTEN_FDS */
2407 NULL, /* LISTEN_PID */
2408 NULL
2409 };
88213476 2410
2371271c 2411 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2412 int r;
88213476 2413
03cfe0d5
LP
2414 assert(barrier);
2415 assert(directory);
2416 assert(kmsg_socket >= 0);
88213476 2417
efdb0237
LP
2418 cg_unified_flush();
2419
03cfe0d5
LP
2420 if (arg_userns) {
2421 /* Tell the parent, that it now can write the UID map. */
2422 (void) barrier_place(barrier); /* #1 */
7027ff61 2423
03cfe0d5
LP
2424 /* Wait until the parent wrote the UID map */
2425 if (!barrier_place_and_sync(barrier)) { /* #2 */
2426 log_error("Parent died too early");
2427 return -ESRCH;
2428 }
88213476
LP
2429 }
2430
d1678248 2431 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2432 if (r < 0)
2433 return r;
2434
d8fc6a00
LP
2435 r = mount_sysfs(NULL);
2436 if (r < 0)
2437 return r;
2438
03cfe0d5
LP
2439 /* Wait until we are cgroup-ified, so that we
2440 * can mount the right cgroup path writable */
2441 if (!barrier_place_and_sync(barrier)) { /* #3 */
2442 log_error("Parent died too early");
2443 return -ESRCH;
88213476
LP
2444 }
2445
e83bebef 2446 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2447 if (r < 0)
2448 return r;
ec16945e 2449
03cfe0d5
LP
2450 r = reset_uid_gid();
2451 if (r < 0)
2452 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2453
03cfe0d5
LP
2454 r = setup_boot_id(NULL);
2455 if (r < 0)
2456 return r;
ec16945e 2457
03cfe0d5
LP
2458 r = setup_kmsg(NULL, kmsg_socket);
2459 if (r < 0)
2460 return r;
2461 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2462
03cfe0d5 2463 umask(0022);
30535c16 2464
03cfe0d5
LP
2465 if (setsid() < 0)
2466 return log_error_errno(errno, "setsid() failed: %m");
2467
2468 if (arg_private_network)
2469 loopback_setup();
2470
7a8f6325
LP
2471 if (arg_expose_ports) {
2472 r = expose_port_send_rtnl(rtnl_socket);
2473 if (r < 0)
2474 return r;
2475 rtnl_socket = safe_close(rtnl_socket);
2476 }
03cfe0d5
LP
2477
2478 if (drop_capabilities() < 0)
2479 return log_error_errno(errno, "drop_capabilities() failed: %m");
2480
2481 setup_hostname();
2482
050f7277 2483 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2484 if (personality(arg_personality) < 0)
2485 return log_error_errno(errno, "personality() failed: %m");
2486 } else if (secondary) {
2487 if (personality(PER_LINUX32) < 0)
2488 return log_error_errno(errno, "personality() failed: %m");
2489 }
2490
2491#ifdef HAVE_SELINUX
2492 if (arg_selinux_context)
2493 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2494 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2495#endif
2496
ee645080 2497 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2498 if (r < 0)
2499 return r;
2500
2501 envp[n_env] = strv_find_prefix(environ, "TERM=");
2502 if (envp[n_env])
2503 n_env ++;
2504
2505 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2506 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2507 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2508 return log_oom();
2509
2510 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2511 char as_uuid[37];
2512
2513 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2514 return log_oom();
2515 }
2516
2517 if (fdset_size(fds) > 0) {
2518 r = fdset_cloexec(fds, false);
2519 if (r < 0)
2520 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2521
2522 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2523 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2524 return log_oom();
2525 }
2526
2371271c
TG
2527 env_use = strv_env_merge(2, envp, arg_setenv);
2528 if (!env_use)
2529 return log_oom();
03cfe0d5
LP
2530
2531 /* Let the parent know that we are ready and
2532 * wait until the parent is ready with the
2533 * setup, too... */
2534 if (!barrier_place_and_sync(barrier)) { /* #4 */
2535 log_error("Parent died too early");
2536 return -ESRCH;
2537 }
2538
2539 /* Now, explicitly close the log, so that we
2540 * then can close all remaining fds. Closing
2541 * the log explicitly first has the benefit
2542 * that the logging subsystem knows about it,
2543 * and is thus ready to be reopened should we
2544 * need it again. Note that the other fds
2545 * closed here are at least the locking and
2546 * barrier fds. */
2547 log_close();
2548 (void) fdset_close_others(fds);
2549
2550 if (arg_boot) {
2551 char **a;
2552 size_t m;
2553
2554 /* Automatically search for the init system */
2555
f757855e 2556 m = 1 + strv_length(arg_parameters);
03cfe0d5 2557 a = newa(char*, m + 1);
f757855e
LP
2558 if (strv_isempty(arg_parameters))
2559 a[1] = NULL;
2560 else
2561 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2562
2563 a[0] = (char*) "/usr/lib/systemd/systemd";
2564 execve(a[0], a, env_use);
2565
2566 a[0] = (char*) "/lib/systemd/systemd";
2567 execve(a[0], a, env_use);
2568
2569 a[0] = (char*) "/sbin/init";
2570 execve(a[0], a, env_use);
f757855e
LP
2571 } else if (!strv_isempty(arg_parameters))
2572 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2573 else {
f757855e 2574 chdir(home ?: "/root");
03cfe0d5
LP
2575 execle("/bin/bash", "-bash", NULL, env_use);
2576 execle("/bin/sh", "-sh", NULL, env_use);
2577 }
2578
2579 (void) log_open();
2580 return log_error_errno(errno, "execv() failed: %m");
2581}
2582
2583static int outer_child(
2584 Barrier *barrier,
2585 const char *directory,
2586 const char *console,
2587 const char *root_device, bool root_device_rw,
2588 const char *home_device, bool home_device_rw,
2589 const char *srv_device, bool srv_device_rw,
2590 bool interactive,
2591 bool secondary,
2592 int pid_socket,
2593 int kmsg_socket,
2594 int rtnl_socket,
825d5287 2595 int uid_shift_socket,
f757855e 2596 FDSet *fds) {
03cfe0d5
LP
2597
2598 pid_t pid;
2599 ssize_t l;
2600 int r;
2601
2602 assert(barrier);
2603 assert(directory);
2604 assert(console);
2605 assert(pid_socket >= 0);
2606 assert(kmsg_socket >= 0);
2607
efdb0237
LP
2608 cg_unified_flush();
2609
03cfe0d5
LP
2610 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2611 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2612
2613 if (interactive) {
2614 close_nointr(STDIN_FILENO);
2615 close_nointr(STDOUT_FILENO);
2616 close_nointr(STDERR_FILENO);
2617
2618 r = open_terminal(console, O_RDWR);
2619 if (r != STDIN_FILENO) {
2620 if (r >= 0) {
2621 safe_close(r);
2622 r = -EINVAL;
2623 }
2624
2625 return log_error_errno(r, "Failed to open console: %m");
2626 }
2627
2628 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2629 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2630 return log_error_errno(errno, "Failed to duplicate console: %m");
2631 }
2632
2633 r = reset_audit_loginuid();
2634 if (r < 0)
2635 return r;
2636
2637 /* Mark everything as slave, so that we still
2638 * receive mounts from the real root, but don't
2639 * propagate mounts to the real root. */
2640 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2641 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2642
2643 r = mount_devices(directory,
2644 root_device, root_device_rw,
2645 home_device, home_device_rw,
2646 srv_device, srv_device_rw);
2647 if (r < 0)
2648 return r;
2649
391567f4
LP
2650 r = determine_uid_shift(directory);
2651 if (r < 0)
2652 return r;
2653
825d5287
RM
2654 if (arg_userns) {
2655 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2656 if (l < 0)
2657 return log_error_errno(errno, "Failed to send UID shift: %m");
2658 if (l != sizeof(arg_uid_shift)) {
2659 log_error("Short write while sending UID shift.");
2660 return -EIO;
2661 }
2662 }
2663
03cfe0d5
LP
2664 /* Turn directory into bind mount */
2665 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2666 return log_error_errno(errno, "Failed to make bind mount: %m");
2667
e83bebef 2668 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2669 if (r < 0)
2670 return r;
2671
e83bebef 2672 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2673 if (r < 0)
2674 return r;
2675
03cfe0d5
LP
2676 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2677 if (r < 0)
2678 return r;
2679
03cfe0d5
LP
2680 if (arg_read_only) {
2681 r = bind_remount_recursive(directory, true);
2682 if (r < 0)
2683 return log_error_errno(r, "Failed to make tree read-only: %m");
2684 }
2685
d1678248 2686 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2687 if (r < 0)
2688 return r;
2689
07fa00f9
LP
2690 r = copy_devnodes(directory);
2691 if (r < 0)
03cfe0d5
LP
2692 return r;
2693
2694 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2695
07fa00f9
LP
2696 r = setup_pts(directory);
2697 if (r < 0)
03cfe0d5
LP
2698 return r;
2699
2700 r = setup_propagate(directory);
2701 if (r < 0)
2702 return r;
2703
2704 r = setup_dev_console(directory, console);
2705 if (r < 0)
2706 return r;
2707
2708 r = setup_seccomp();
2709 if (r < 0)
2710 return r;
2711
2712 r = setup_timezone(directory);
2713 if (r < 0)
2714 return r;
2715
2716 r = setup_resolv_conf(directory);
2717 if (r < 0)
2718 return r;
2719
2720 r = setup_journal(directory);
2721 if (r < 0)
2722 return r;
2723
e83bebef 2724 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2725 if (r < 0)
2726 return r;
2727
e83bebef 2728 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2729 if (r < 0)
2730 return r;
2731
2732 r = mount_move_root(directory);
2733 if (r < 0)
2734 return log_error_errno(r, "Failed to move root directory: %m");
2735
2736 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2737 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2738 (arg_private_network ? CLONE_NEWNET : 0) |
2739 (arg_userns ? CLONE_NEWUSER : 0),
2740 NULL);
2741 if (pid < 0)
2742 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2743 if (pid == 0) {
2744 pid_socket = safe_close(pid_socket);
825d5287 2745 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2746
2747 /* The inner child has all namespaces that are
2748 * requested, so that we all are owned by the user if
2749 * user namespaces are turned on. */
2750
f757855e 2751 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2752 if (r < 0)
2753 _exit(EXIT_FAILURE);
2754
2755 _exit(EXIT_SUCCESS);
2756 }
2757
2758 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2759 if (l < 0)
2760 return log_error_errno(errno, "Failed to send PID: %m");
2761 if (l != sizeof(pid)) {
2762 log_error("Short write while sending PID.");
2763 return -EIO;
2764 }
2765
2766 pid_socket = safe_close(pid_socket);
327e26d6
KN
2767 kmsg_socket = safe_close(kmsg_socket);
2768 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2769
2770 return 0;
2771}
2772
2773static int setup_uid_map(pid_t pid) {
2774 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2775 int r;
2776
2777 assert(pid > 1);
2778
2779 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2780 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2781 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2782 if (r < 0)
2783 return log_error_errno(r, "Failed to write UID map: %m");
2784
2785 /* We always assign the same UID and GID ranges */
2786 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2787 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2788 if (r < 0)
2789 return log_error_errno(r, "Failed to write GID map: %m");
2790
2791 return 0;
2792}
2793
f757855e
LP
2794static int load_settings(void) {
2795 _cleanup_(settings_freep) Settings *settings = NULL;
2796 _cleanup_fclose_ FILE *f = NULL;
2797 _cleanup_free_ char *p = NULL;
2798 const char *fn, *i;
2799 int r;
2800
2801 /* If all settings are masked, there's no point in looking for
2802 * the settings file */
2803 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2804 return 0;
2805
2806 fn = strjoina(arg_machine, ".nspawn");
2807
2808 /* We first look in the admin's directories in /etc and /run */
2809 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2810 _cleanup_free_ char *j = NULL;
2811
2812 j = strjoin(i, "/", fn, NULL);
2813 if (!j)
2814 return log_oom();
2815
2816 f = fopen(j, "re");
2817 if (f) {
2818 p = j;
2819 j = NULL;
2820
2821 /* By default we trust configuration from /etc and /run */
2822 if (arg_settings_trusted < 0)
2823 arg_settings_trusted = true;
2824
2825 break;
2826 }
2827
2828 if (errno != ENOENT)
2829 return log_error_errno(errno, "Failed to open %s: %m", j);
2830 }
2831
2832 if (!f) {
2833 /* After that, let's look for a file next to the
2834 * actual image we shall boot. */
2835
2836 if (arg_image) {
2837 p = file_in_same_dir(arg_image, fn);
2838 if (!p)
2839 return log_oom();
2840 } else if (arg_directory) {
2841 p = file_in_same_dir(arg_directory, fn);
2842 if (!p)
2843 return log_oom();
2844 }
2845
2846 if (p) {
2847 f = fopen(p, "re");
2848 if (!f && errno != ENOENT)
2849 return log_error_errno(errno, "Failed to open %s: %m", p);
2850
2851 /* By default we do not trust configuration from /var/lib/machines */
2852 if (arg_settings_trusted < 0)
2853 arg_settings_trusted = false;
2854 }
2855 }
2856
2857 if (!f)
2858 return 0;
2859
2860 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2861
2862 r = settings_load(f, p, &settings);
2863 if (r < 0)
2864 return r;
2865
2866 /* Copy over bits from the settings, unless they have been
2867 * explicitly masked by command line switches. */
2868
2869 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2870 settings->boot >= 0) {
2871 arg_boot = settings->boot;
2872
2873 strv_free(arg_parameters);
2874 arg_parameters = settings->parameters;
2875 settings->parameters = NULL;
2876 }
2877
2878 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2879 settings->environment) {
2880 strv_free(arg_setenv);
2881 arg_setenv = settings->environment;
2882 settings->environment = NULL;
2883 }
2884
2885 if ((arg_settings_mask & SETTING_USER) == 0 &&
2886 settings->user) {
2887 free(arg_user);
2888 arg_user = settings->user;
2889 settings->user = NULL;
2890 }
2891
2892 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2893 uint64_t plus;
f757855e 2894
0e265674
LP
2895 plus = settings->capability;
2896 if (settings_private_network(settings))
2897 plus |= (1ULL << CAP_NET_ADMIN);
2898
2899 if (!arg_settings_trusted && plus != 0) {
2900 if (settings->capability != 0)
2901 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2902 } else
2903 arg_retain |= plus;
f757855e
LP
2904
2905 arg_retain &= ~settings->drop_capability;
2906 }
2907
2908 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2909 settings->kill_signal > 0)
2910 arg_kill_signal = settings->kill_signal;
2911
2912 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2913 settings->personality != PERSONALITY_INVALID)
2914 arg_personality = settings->personality;
2915
2916 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2917 !sd_id128_is_null(settings->machine_id)) {
2918
2919 if (!arg_settings_trusted)
2920 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2921 else
2922 arg_uuid = settings->machine_id;
2923 }
2924
2925 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2926 settings->read_only >= 0)
2927 arg_read_only = settings->read_only;
2928
2929 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2930 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2931 arg_volatile_mode = settings->volatile_mode;
2932
2933 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2934 settings->n_custom_mounts > 0) {
2935
2936 if (!arg_settings_trusted)
2937 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2938 else {
2939 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2940 arg_custom_mounts = settings->custom_mounts;
2941 arg_n_custom_mounts = settings->n_custom_mounts;
2942
2943 settings->custom_mounts = NULL;
2944 settings->n_custom_mounts = 0;
2945 }
2946 }
2947
2948 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2949 (settings->private_network >= 0 ||
2950 settings->network_veth >= 0 ||
2951 settings->network_bridge ||
2952 settings->network_interfaces ||
2953 settings->network_macvlan ||
2954 settings->network_ipvlan)) {
2955
2956 if (!arg_settings_trusted)
2957 log_warning("Ignoring network settings, file %s is not trusted.", p);
2958 else {
0e265674
LP
2959 arg_network_veth = settings_private_network(settings);
2960 arg_private_network = settings_private_network(settings);
2961
f757855e
LP
2962 strv_free(arg_network_interfaces);
2963 arg_network_interfaces = settings->network_interfaces;
2964 settings->network_interfaces = NULL;
2965
2966 strv_free(arg_network_macvlan);
2967 arg_network_macvlan = settings->network_macvlan;
2968 settings->network_macvlan = NULL;
2969
2970 strv_free(arg_network_ipvlan);
2971 arg_network_ipvlan = settings->network_ipvlan;
2972 settings->network_ipvlan = NULL;
2973
2974 free(arg_network_bridge);
2975 arg_network_bridge = settings->network_bridge;
2976 settings->network_bridge = NULL;
f757855e
LP
2977 }
2978 }
2979
2980 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2981 settings->expose_ports) {
2982
2983 if (!arg_settings_trusted)
2984 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2985 else {
2986 expose_port_free_all(arg_expose_ports);
2987 arg_expose_ports = settings->expose_ports;
2988 settings->expose_ports = NULL;
2989 }
2990 }
2991
2992 return 0;
2993}
2994
03cfe0d5
LP
2995int main(int argc, char *argv[]) {
2996
2997 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2998 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2999 _cleanup_close_ int master = -1, image_fd = -1;
3000 _cleanup_fdset_free_ FDSet *fds = NULL;
3001 int r, n_fd_passed, loop_nr = -1;
3002 char veth_name[IFNAMSIZ];
3003 bool secondary = false, remove_subvol = false;
72c0a2c2 3004 sigset_t mask_chld;
03cfe0d5
LP
3005 pid_t pid = 0;
3006 int ret = EXIT_SUCCESS;
3007 union in_addr_union exposed = {};
3008 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3009 bool interactive;
3010
3011 log_parse_environment();
3012 log_open();
3013
3014 r = parse_argv(argc, argv);
3015 if (r <= 0)
3016 goto finish;
3017
03cfe0d5
LP
3018 if (geteuid() != 0) {
3019 log_error("Need to be root.");
3020 r = -EPERM;
3021 goto finish;
3022 }
f757855e
LP
3023 r = determine_names();
3024 if (r < 0)
3025 goto finish;
3026
3027 r = load_settings();
3028 if (r < 0)
3029 goto finish;
3030
3031 r = verify_arguments();
3032 if (r < 0)
3033 goto finish;
03cfe0d5
LP
3034
3035 n_fd_passed = sd_listen_fds(false);
3036 if (n_fd_passed > 0) {
3037 r = fdset_new_listen_fds(&fds, false);
3038 if (r < 0) {
3039 log_error_errno(r, "Failed to collect file descriptors: %m");
3040 goto finish;
3041 }
3042 }
3043
3044 if (arg_directory) {
3045 assert(!arg_image);
3046
3047 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3048 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3049 r = -EINVAL;
3050 goto finish;
3051 }
3052
3053 if (arg_ephemeral) {
3054 _cleanup_free_ char *np = NULL;
3055
3056 /* If the specified path is a mount point we
3057 * generate the new snapshot immediately
3058 * inside it under a random name. However if
3059 * the specified is not a mount point we
3060 * create the new snapshot in the parent
3061 * directory, just next to it. */
e26d6ce5 3062 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3063 if (r < 0) {
3064 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3065 goto finish;
3066 }
3067 if (r > 0)
770b5ce4 3068 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3069 else
770b5ce4 3070 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3071 if (r < 0) {
3072 log_error_errno(r, "Failed to generate name for snapshot: %m");
3073 goto finish;
3074 }
3075
3076 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3077 if (r < 0) {
3078 log_error_errno(r, "Failed to lock %s: %m", np);
3079 goto finish;
3080 }
3081
5bcd08db 3082 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3083 if (r < 0) {
3084 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3085 goto finish;
ec16945e
LP
3086 }
3087
3088 free(arg_directory);
3089 arg_directory = np;
8a16a7b4 3090 np = NULL;
ec16945e
LP
3091
3092 remove_subvol = true;
30535c16
LP
3093
3094 } else {
3095 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3096 if (r == -EBUSY) {
3097 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3098 goto finish;
3099 }
3100 if (r < 0) {
3101 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3102 return r;
3103 }
3104
3105 if (arg_template) {
5bcd08db 3106 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3107 if (r == -EEXIST) {
3108 if (!arg_quiet)
3109 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3110 } else if (r < 0) {
83521414 3111 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3112 goto finish;
3113 } else {
3114 if (!arg_quiet)
3115 log_info("Populated %s from template %s.", arg_directory, arg_template);
3116 }
3117 }
ec16945e
LP
3118 }
3119
1b9e5b12
LP
3120 if (arg_boot) {
3121 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3122 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3123 r = -EINVAL;
1b9e5b12
LP
3124 goto finish;
3125 }
3126 } else {
3127 const char *p;
3128
16fb773e
LP
3129 p = strjoina(arg_directory, "/usr/");
3130 if (laccess(p, F_OK) < 0) {
3131 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3132 r = -EINVAL;
1b9e5b12 3133 goto finish;
1b9e5b12
LP
3134 }
3135 }
ec16945e 3136
6b9132a9 3137 } else {
1b9e5b12 3138 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3139
ec16945e
LP
3140 assert(arg_image);
3141 assert(!arg_template);
3142
30535c16
LP
3143 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3144 if (r == -EBUSY) {
3145 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3146 goto finish;
3147 }
3148 if (r < 0) {
3149 r = log_error_errno(r, "Failed to create image lock: %m");
3150 goto finish;
3151 }
3152
1b9e5b12 3153 if (!mkdtemp(template)) {
56f64d95 3154 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3155 r = -errno;
6b9132a9 3156 goto finish;
1b9e5b12 3157 }
6b9132a9 3158
1b9e5b12
LP
3159 arg_directory = strdup(template);
3160 if (!arg_directory) {
3161 r = log_oom();
3162 goto finish;
6b9132a9 3163 }
88213476 3164
1b9e5b12
LP
3165 image_fd = setup_image(&device_path, &loop_nr);
3166 if (image_fd < 0) {
3167 r = image_fd;
842f3b0f
LP
3168 goto finish;
3169 }
1b9e5b12 3170
4d9f07b4
LP
3171 r = dissect_image(image_fd,
3172 &root_device, &root_device_rw,
3173 &home_device, &home_device_rw,
3174 &srv_device, &srv_device_rw,
3175 &secondary);
1b9e5b12
LP
3176 if (r < 0)
3177 goto finish;
842f3b0f 3178 }
842f3b0f 3179
5a8af538
LP
3180 r = custom_mounts_prepare();
3181 if (r < 0)
3182 goto finish;
3183
03cfe0d5
LP
3184 interactive =
3185 isatty(STDIN_FILENO) > 0 &&
3186 isatty(STDOUT_FILENO) > 0;
9c857b9d 3187
db7feb7e
LP
3188 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3189 if (master < 0) {
ec16945e 3190 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3191 goto finish;
3192 }
3193
611b312b
LP
3194 r = ptsname_malloc(master, &console);
3195 if (r < 0) {
3196 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3197 goto finish;
3198 }
3199
a258bf26 3200 if (unlockpt(master) < 0) {
ec16945e 3201 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3202 goto finish;
3203 }
3204
9c857b9d
LP
3205 if (!arg_quiet)
3206 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3207 arg_machine, arg_image ?: arg_directory);
3208
72c0a2c2 3209 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3210
023fb90b
LP
3211 assert_se(sigemptyset(&mask_chld) == 0);
3212 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3213
03cfe0d5
LP
3214 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3215 r = log_error_errno(errno, "Failed to become subreaper: %m");
3216 goto finish;
3217 }
3218
d87be9b0 3219 for (;;) {
825d5287
RM
3220 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3221 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3222 ContainerStatus container_status;
7566e267 3223 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3224 static const struct sigaction sa = {
189d5bac 3225 .sa_handler = nop_signal_handler,
e866af3a
DH
3226 .sa_flags = SA_NOCLDSTOP,
3227 };
03cfe0d5
LP
3228 int ifi = 0;
3229 ssize_t l;
dbb60d69
LP
3230 _cleanup_event_unref_ sd_event *event = NULL;
3231 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3232 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3233 char last_char = 0;
e866af3a 3234
7566e267 3235 r = barrier_create(&barrier);
a2da110b 3236 if (r < 0) {
da927ba9 3237 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3238 goto finish;
3239 }
3240
4610de50 3241 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3242 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3243 goto finish;
3244 }
3245
4610de50 3246 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3247 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3248 goto finish;
3249 }
3250
4610de50 3251 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3252 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3253 goto finish;
3254 }
3255
825d5287 3256 if (arg_userns)
4610de50 3257 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3258 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3259 goto finish;
3260 }
3261
e866af3a
DH
3262 /* Child can be killed before execv(), so handle SIGCHLD
3263 * in order to interrupt parent's blocking calls and
3264 * give it a chance to call wait() and terminate. */
3265 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3266 if (r < 0) {
ec16945e 3267 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3268 goto finish;
3269 }
3270
e866af3a
DH
3271 r = sigaction(SIGCHLD, &sa, NULL);
3272 if (r < 0) {
ec16945e 3273 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3274 goto finish;
3275 }
3276
03cfe0d5 3277 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3278 if (pid < 0) {
3279 if (errno == EINVAL)
ec16945e 3280 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3281 else
ec16945e 3282 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3283
d87be9b0
LP
3284 goto finish;
3285 }
a258bf26 3286
d87be9b0 3287 if (pid == 0) {
03cfe0d5 3288 /* The outer child only has a file system namespace. */
a2da110b
DH
3289 barrier_set_role(&barrier, BARRIER_CHILD);
3290
03e334a1 3291 master = safe_close(master);
a258bf26 3292
03e334a1 3293 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3294 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3295 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3296 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3297
ce30c8dc
LP
3298 (void) reset_all_signal_handlers();
3299 (void) reset_signal_mask();
f5c1b9ee 3300
03cfe0d5
LP
3301 r = outer_child(&barrier,
3302 arg_directory,
3303 console,
3304 root_device, root_device_rw,
3305 home_device, home_device_rw,
3306 srv_device, srv_device_rw,
3307 interactive,
3308 secondary,
3309 pid_socket_pair[1],
3310 kmsg_socket_pair[1],
3311 rtnl_socket_pair[1],
825d5287 3312 uid_shift_socket_pair[1],
f757855e 3313 fds);
0cb9fbcd 3314 if (r < 0)
a2da110b 3315 _exit(EXIT_FAILURE);
d87be9b0 3316
03cfe0d5 3317 _exit(EXIT_SUCCESS);
da5b3bad 3318 }
88213476 3319
a2da110b 3320 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3321
2feceb5e 3322 fds = fdset_free(fds);
842f3b0f 3323
6d0b55c2
LP
3324 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3325 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3326 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3327 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3328
03cfe0d5
LP
3329 /* Wait for the outer child. */
3330 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3331 if (r < 0)
3332 goto finish;
3333 if (r != 0) {
3334 r = -EIO;
3335 goto finish;
3336 }
3337 pid = 0;
6dac160c 3338
03cfe0d5
LP
3339 /* And now retrieve the PID of the inner child. */
3340 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3341 if (l < 0) {
3342 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3343 goto finish;
3344 }
3345 if (l != sizeof(pid)) {
76d44882 3346 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3347 r = EIO;
3348 goto finish;
3349 }
354bfd2b 3350
03cfe0d5 3351 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3352
03cfe0d5
LP
3353 if (arg_userns) {
3354 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3355 log_error("Child died too early.");
3356 r = -ESRCH;
840295fc 3357 goto finish;
03cfe0d5 3358 }
ab046dde 3359
825d5287
RM
3360 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3361 if (l < 0) {
3362 r = log_error_errno(errno, "Failed to read UID shift: %m");
3363 goto finish;
3364 }
3365 if (l != sizeof(arg_uid_shift)) {
76d44882 3366 log_error("Short read while reading UID shift.");
825d5287
RM
3367 r = EIO;
3368 goto finish;
3369 }
3370
03cfe0d5 3371 r = setup_uid_map(pid);
840295fc
LP
3372 if (r < 0)
3373 goto finish;
ab046dde 3374
03cfe0d5
LP
3375 (void) barrier_place(&barrier); /* #2 */
3376 }
c74e630d 3377
9a2a5625 3378 if (arg_private_network) {
4bbfe7ad 3379
9a2a5625
LP
3380 r = move_network_interfaces(pid, arg_network_interfaces);
3381 if (r < 0)
3382 goto finish;
5aa4bb6b 3383
9a2a5625
LP
3384 if (arg_network_veth) {
3385 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3386 if (r < 0)
3387 goto finish;
3388 else if (r > 0)
3389 ifi = r;
6dac160c 3390
9a2a5625
LP
3391 if (arg_network_bridge) {
3392 r = setup_bridge(veth_name, arg_network_bridge);
3393 if (r < 0)
3394 goto finish;
3395 if (r > 0)
3396 ifi = r;
3397 }
3398 }
6dac160c 3399
9a2a5625
LP
3400 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3401 if (r < 0)
3402 goto finish;
3403
3404 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3405 if (r < 0)
3406 goto finish;
3407 }
6dac160c 3408
b7103bc5
LP
3409 if (arg_register) {
3410 r = register_machine(
3411 arg_machine,
3412 pid,
3413 arg_directory,
3414 arg_uuid,
3415 ifi,
3416 arg_slice,
3417 arg_custom_mounts, arg_n_custom_mounts,
3418 arg_kill_signal,
3419 arg_property,
3420 arg_keep_unit);
3421 if (r < 0)
3422 goto finish;
3423 }
6dac160c 3424
34829a32 3425 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3426 if (r < 0)
3427 goto finish;
3428
34829a32
LP
3429 if (arg_keep_unit) {
3430 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3431 if (r < 0)
3432 goto finish;
3433 }
efdb0237 3434
34829a32 3435 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3436 if (r < 0)
3437 goto finish;
6dac160c 3438
03cfe0d5
LP
3439 /* Notify the child that the parent is ready with all
3440 * its setup (including cgroup-ification), and that
3441 * the child can now hand over control to the code to
3442 * run inside the container. */
3443 (void) barrier_place(&barrier); /* #3 */
6dac160c 3444
03cfe0d5
LP
3445 /* Block SIGCHLD here, before notifying child.
3446 * process_pty() will handle it with the other signals. */
3447 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3448
03cfe0d5
LP
3449 /* Reset signal to default */
3450 r = default_signals(SIGCHLD, -1);
3451 if (r < 0) {
3452 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3453 goto finish;
3454 }
e866af3a 3455
03cfe0d5 3456 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3457 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3458 log_error("Child died too early.");
03cfe0d5
LP
3459 r = -ESRCH;
3460 goto finish;
3461 }
b12afc8c 3462
03cfe0d5
LP
3463 sd_notifyf(false,
3464 "READY=1\n"
3465 "STATUS=Container running.\n"
3466 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3467
03cfe0d5
LP
3468 r = sd_event_new(&event);
3469 if (r < 0) {
3470 log_error_errno(r, "Failed to get default event source: %m");
3471 goto finish;
3472 }
88213476 3473
03cfe0d5
LP
3474 if (arg_kill_signal > 0) {
3475 /* Try to kill the init system on SIGINT or SIGTERM */
3476 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3477 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3478 } else {
3479 /* Immediately exit */
3480 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3481 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3482 }
023fb90b 3483
03cfe0d5
LP
3484 /* simply exit on sigchld */
3485 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3486
03cfe0d5 3487 if (arg_expose_ports) {
7a8f6325 3488 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3489 if (r < 0)
3490 goto finish;
023fb90b 3491
7a8f6325 3492 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3493 }
023fb90b 3494
03cfe0d5 3495 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3496
ae3dde80 3497 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3498 if (r < 0) {
3499 log_error_errno(r, "Failed to create PTY forwarder: %m");
3500 goto finish;
3501 }
023fb90b 3502
03cfe0d5
LP
3503 r = sd_event_loop(event);
3504 if (r < 0) {
3505 log_error_errno(r, "Failed to run event loop: %m");
3506 goto finish;
3507 }
6d0b55c2 3508
03cfe0d5 3509 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3510
03cfe0d5 3511 forward = pty_forward_free(forward);
6d0b55c2 3512
03cfe0d5
LP
3513 if (!arg_quiet && last_char != '\n')
3514 putc('\n', stdout);
04d39279 3515
03cfe0d5 3516 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3517 if (arg_register && !arg_keep_unit)
3518 terminate_machine(pid);
1f0cd86b 3519
840295fc 3520 /* Normally redundant, but better safe than sorry */
04d39279 3521 kill(pid, SIGKILL);
a258bf26 3522
113cea80 3523 r = wait_for_container(pid, &container_status);
04d39279
LP
3524 pid = 0;
3525
ec16945e 3526 if (r < 0)
ce9f1527
LP
3527 /* We failed to wait for the container, or the
3528 * container exited abnormally */
ec16945e
LP
3529 goto finish;
3530 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3531 /* The container exited with a non-zero
3532 * status, or with zero status and no reboot
3533 * was requested. */
ec16945e 3534 ret = r;
d87be9b0 3535 break;
ec16945e 3536 }
88213476 3537
113cea80 3538 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3539
3540 if (arg_keep_unit) {
3541 /* Special handling if we are running as a
3542 * service: instead of simply restarting the
3543 * machine we want to restart the entire
3544 * service, so let's inform systemd about this
3545 * with the special exit code 133. The service
3546 * file uses RestartForceExitStatus=133 so
3547 * that this results in a full nspawn
3548 * restart. This is necessary since we might
3549 * have cgroup parameters set we want to have
3550 * flushed out. */
ec16945e
LP
3551 ret = 133;
3552 r = 0;
ce38dbc8
LP
3553 break;
3554 }
6d0b55c2 3555
7a8f6325 3556 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3557 }
88213476
LP
3558
3559finish:
af4ec430
LP
3560 sd_notify(false,
3561 "STOPPING=1\n"
3562 "STATUS=Terminating...");
3563
9444b1f2
LP
3564 if (pid > 0)
3565 kill(pid, SIGKILL);
88213476 3566
503546da
LP
3567 /* Try to flush whatever is still queued in the pty */
3568 if (master >= 0)
59f448cf 3569 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3570
03cfe0d5
LP
3571 loop_remove(loop_nr, &image_fd);
3572
ec16945e
LP
3573 if (remove_subvol && arg_directory) {
3574 int k;
3575
5bcd08db 3576 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3577 if (k < 0)
3578 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3579 }
3580
785890ac
LP
3581 if (arg_machine) {
3582 const char *p;
3583
63c372cb 3584 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3585 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3586 }
3587
7a8f6325 3588 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3589
04d391da 3590 free(arg_directory);
ec16945e
LP
3591 free(arg_template);
3592 free(arg_image);
7027ff61 3593 free(arg_machine);
c74e630d
LP
3594 free(arg_user);
3595 strv_free(arg_setenv);
f757855e 3596 free(arg_network_bridge);
c74e630d
LP
3597 strv_free(arg_network_interfaces);
3598 strv_free(arg_network_macvlan);
4bbfe7ad 3599 strv_free(arg_network_ipvlan);
f757855e
LP
3600 strv_free(arg_parameters);
3601 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3602 expose_port_free_all(arg_expose_ports);
6d0b55c2 3603
ec16945e 3604 return r < 0 ? EXIT_FAILURE : ret;
88213476 3605}