]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
util-lib: split our string related calls from util.[ch] into its own file string...
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e
LP
48
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
8fe0087e
LP
53#include "cap-list.h"
54#include "capability.h"
04d391da 55#include "cgroup-util.h"
8fe0087e 56#include "copy.h"
4fc9982c 57#include "dev-setup.h"
8fe0087e
LP
58#include "env-util.h"
59#include "event-util.h"
842f3b0f 60#include "fdset.h"
a5c32cff 61#include "fileio.h"
8fe0087e 62#include "formats-util.h"
1b9e5b12 63#include "gpt.h"
8fe0087e
LP
64#include "hostname-util.h"
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
71#include "netlink-util.h"
07630cea
LP
72#include "nspawn-cgroup.h"
73#include "nspawn-expose-ports.h"
74#include "nspawn-mount.h"
75#include "nspawn-network.h"
76#include "nspawn-register.h"
77#include "nspawn-settings.h"
78#include "nspawn-setuid.h"
8fe0087e 79#include "path-util.h"
0b452006 80#include "process-util.h"
8fe0087e
LP
81#include "ptyfwd.h"
82#include "random-util.h"
83#include "rm-rf.h"
e9642be2
LP
84#ifdef HAVE_SECCOMP
85#include "seccomp-util.h"
86#endif
8fe0087e 87#include "signal-util.h"
07630cea 88#include "string-util.h"
8fe0087e
LP
89#include "strv.h"
90#include "terminal-util.h"
91#include "udev-util.h"
92#include "util.h"
e9642be2 93
113cea80
DH
/* Outcome of a container run. NOTE(review): the consumers of this enum are outside the visible chunk;
 * presumably it tells the caller whether the payload exited for good or asked for a reboot — confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
98
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, also the initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST,     /* --link-journal=guest and try-guest, and -j */
} LinkJournal;
88213476
LP
105
106static char *arg_directory = NULL;
ec16945e 107static char *arg_template = NULL;
687d0825 108static char *arg_user = NULL;
9444b1f2 109static sd_id128_t arg_uuid = {};
7027ff61 110static char *arg_machine = NULL;
c74e630d
LP
111static const char *arg_selinux_context = NULL;
112static const char *arg_selinux_apifs_context = NULL;
9444b1f2 113static const char *arg_slice = NULL;
ff01d048 114static bool arg_private_network = false;
bc2f673e 115static bool arg_read_only = false;
0f0dbc46 116static bool arg_boot = false;
ec16945e 117static bool arg_ephemeral = false;
57fb9fb5 118static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 119static bool arg_link_journal_try = false;
5076f0cc
LP
120static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 142 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
5a8af538
LP
147static CustomMount *arg_custom_mounts = NULL;
148static unsigned arg_n_custom_mounts = 0;
f4889f65 149static char **arg_setenv = NULL;
284c0b91 150static bool arg_quiet = false;
8a96d94e 151static bool arg_share_system = false;
eb91eb18 152static bool arg_register = true;
89f7c846 153static bool arg_keep_unit = false;
aa28aefe 154static char **arg_network_interfaces = NULL;
c74e630d 155static char **arg_network_macvlan = NULL;
4bbfe7ad 156static char **arg_network_ipvlan = NULL;
69c79d3c 157static bool arg_network_veth = false;
f757855e 158static char *arg_network_bridge = NULL;
050f7277 159static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 160static char *arg_image = NULL;
f757855e 161static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 162static ExposePort *arg_expose_ports = NULL;
f36933fe 163static char **arg_property = NULL;
6dac160c
LP
164static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
165static bool arg_userns = false;
c6c8f6e2 166static int arg_kill_signal = 0;
efdb0237 167static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
168static SettingsMask arg_settings_mask = 0;
169static int arg_settings_trusted = -1;
170static char **arg_parameters = NULL;
88213476 171
601185b4 172static void help(void) {
88213476
LP
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
175 " -h --help Show this help\n"
176 " --version Print version string\n"
69c79d3c 177 " -q --quiet Do not show status information\n"
1b9e5b12 178 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 186 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 187 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 188 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 189 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
69c79d3c
LP
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
c74e630d
LP
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
4bbfe7ad
TG
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
0dfaa006 202 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 203 " and container\n"
ab046dde 204 " --network-bridge=INTERFACE\n"
32457153 205 " Add a virtual ethernet connection between host\n"
ab046dde
TG
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
6d0b55c2 208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 209 " Expose a container IP port on the host\n"
82adf6af
LP
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
a8828ed9
DW
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 223 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
a8828ed9 226 " the container\n"
5e5bfa6e
EY
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
06c17c39 229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
284c0b91 235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 236 " --share-system Share system namespaces with host\n"
eb91eb18 237 " --register=BOOLEAN Register container as machine\n"
89f7c846 238 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 239 " the service unit nspawn is running in\n"
6d0b55c2 240 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 242 , program_invocation_short_name);
88213476
LP
243}
244
5a8af538
LP
245
246static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
825d5287
RM
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
5a8af538
LP
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
14bcf25c 271 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277}
278
efdb0237
LP
279static int detect_unified_cgroup_hierarchy(void) {
280 const char *e;
281 int r;
282
283 /* Allow the user to control whether the unified hierarchy is used */
284 e = getenv("UNIFIED_CGROUP_HIERARCHY");
285 if (e) {
286 r = parse_boolean(e);
287 if (r < 0)
288 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
289
290 arg_unified_cgroup_hierarchy = r;
291 return 0;
292 }
293
294 /* Otherwise inherit the default from the host system */
295 r = cg_unified();
296 if (r < 0)
297 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
298
299 arg_unified_cgroup_hierarchy = r;
300 return 0;
301}
302
88213476
LP
303static int parse_argv(int argc, char *argv[]) {
304
a41fe3a2 305 enum {
acbeb427
ZJS
306 ARG_VERSION = 0x100,
307 ARG_PRIVATE_NETWORK,
bc2f673e 308 ARG_UUID,
5076f0cc 309 ARG_READ_ONLY,
57fb9fb5 310 ARG_CAPABILITY,
420c7379 311 ARG_DROP_CAPABILITY,
17fe0523
LP
312 ARG_LINK_JOURNAL,
313 ARG_BIND,
f4889f65 314 ARG_BIND_RO,
06c17c39 315 ARG_TMPFS,
5a8af538
LP
316 ARG_OVERLAY,
317 ARG_OVERLAY_RO,
f4889f65 318 ARG_SETENV,
eb91eb18 319 ARG_SHARE_SYSTEM,
89f7c846 320 ARG_REGISTER,
aa28aefe 321 ARG_KEEP_UNIT,
69c79d3c 322 ARG_NETWORK_INTERFACE,
c74e630d 323 ARG_NETWORK_MACVLAN,
4bbfe7ad 324 ARG_NETWORK_IPVLAN,
ab046dde 325 ARG_NETWORK_BRIDGE,
6afc95b7 326 ARG_PERSONALITY,
4d9f07b4 327 ARG_VOLATILE,
ec16945e 328 ARG_TEMPLATE,
f36933fe 329 ARG_PROPERTY,
6dac160c 330 ARG_PRIVATE_USERS,
c6c8f6e2 331 ARG_KILL_SIGNAL,
f757855e 332 ARG_SETTINGS,
a41fe3a2
LP
333 };
334
88213476 335 static const struct option options[] = {
aa28aefe
LP
336 { "help", no_argument, NULL, 'h' },
337 { "version", no_argument, NULL, ARG_VERSION },
338 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
339 { "template", required_argument, NULL, ARG_TEMPLATE },
340 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
341 { "user", required_argument, NULL, 'u' },
342 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
343 { "boot", no_argument, NULL, 'b' },
344 { "uuid", required_argument, NULL, ARG_UUID },
345 { "read-only", no_argument, NULL, ARG_READ_ONLY },
346 { "capability", required_argument, NULL, ARG_CAPABILITY },
347 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
348 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
349 { "bind", required_argument, NULL, ARG_BIND },
350 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 351 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
352 { "overlay", required_argument, NULL, ARG_OVERLAY },
353 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
354 { "machine", required_argument, NULL, 'M' },
355 { "slice", required_argument, NULL, 'S' },
356 { "setenv", required_argument, NULL, ARG_SETENV },
357 { "selinux-context", required_argument, NULL, 'Z' },
358 { "selinux-apifs-context", required_argument, NULL, 'L' },
359 { "quiet", no_argument, NULL, 'q' },
360 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
361 { "register", required_argument, NULL, ARG_REGISTER },
362 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
363 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 364 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 365 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 366 { "network-veth", no_argument, NULL, 'n' },
ab046dde 367 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 368 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 369 { "image", required_argument, NULL, 'i' },
4d9f07b4 370 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 371 { "port", required_argument, NULL, 'p' },
f36933fe 372 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 373 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 374 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 375 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 376 {}
88213476
LP
377 };
378
9444b1f2 379 int c, r;
a42c8b54 380 uint64_t plus = 0, minus = 0;
f757855e 381 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
382
383 assert(argc >= 0);
384 assert(argv);
385
0dfaa006 386 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
387
388 switch (c) {
389
390 case 'h':
601185b4
ZJS
391 help();
392 return 0;
88213476 393
acbeb427 394 case ARG_VERSION:
3f6fd1ba 395 return version();
acbeb427 396
88213476 397 case 'D':
0f03c2a4 398 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 399 if (r < 0)
0f03c2a4 400 return r;
ec16945e
LP
401 break;
402
403 case ARG_TEMPLATE:
0f03c2a4 404 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 405 if (r < 0)
0f03c2a4 406 return r;
88213476
LP
407 break;
408
1b9e5b12 409 case 'i':
0f03c2a4 410 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 411 if (r < 0)
0f03c2a4 412 return r;
ec16945e
LP
413 break;
414
415 case 'x':
416 arg_ephemeral = true;
1b9e5b12
LP
417 break;
418
687d0825 419 case 'u':
2fc09a9c
DM
420 r = free_and_strdup(&arg_user, optarg);
421 if (r < 0)
7027ff61 422 return log_oom();
687d0825 423
f757855e 424 arg_settings_mask |= SETTING_USER;
687d0825
MV
425 break;
426
ab046dde 427 case ARG_NETWORK_BRIDGE:
f757855e
LP
428 r = free_and_strdup(&arg_network_bridge, optarg);
429 if (r < 0)
430 return log_oom();
ab046dde
TG
431
432 /* fall through */
433
0dfaa006 434 case 'n':
69c79d3c
LP
435 arg_network_veth = true;
436 arg_private_network = true;
f757855e 437 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
438 break;
439
aa28aefe 440 case ARG_NETWORK_INTERFACE:
c74e630d
LP
441 if (strv_extend(&arg_network_interfaces, optarg) < 0)
442 return log_oom();
443
444 arg_private_network = true;
f757855e 445 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
446 break;
447
448 case ARG_NETWORK_MACVLAN:
449 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
450 return log_oom();
451
4bbfe7ad 452 arg_private_network = true;
f757855e 453 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
454 break;
455
456 case ARG_NETWORK_IPVLAN:
457 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
458 return log_oom();
459
aa28aefe
LP
460 /* fall through */
461
ff01d048
LP
462 case ARG_PRIVATE_NETWORK:
463 arg_private_network = true;
f757855e 464 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
465 break;
466
0f0dbc46
LP
467 case 'b':
468 arg_boot = true;
f757855e 469 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
470 break;
471
144f0fc0 472 case ARG_UUID:
9444b1f2
LP
473 r = sd_id128_from_string(optarg, &arg_uuid);
474 if (r < 0) {
aa96c6cb 475 log_error("Invalid UUID: %s", optarg);
9444b1f2 476 return r;
aa96c6cb 477 }
f757855e
LP
478
479 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 480 break;
aa96c6cb 481
9444b1f2 482 case 'S':
c74e630d 483 arg_slice = optarg;
144f0fc0
LP
484 break;
485
7027ff61 486 case 'M':
c1521918 487 if (isempty(optarg))
97b11eed 488 arg_machine = mfree(arg_machine);
c1521918 489 else {
0c3c4284 490 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
491 log_error("Invalid machine name: %s", optarg);
492 return -EINVAL;
493 }
7027ff61 494
0c3c4284
LP
495 r = free_and_strdup(&arg_machine, optarg);
496 if (r < 0)
eb91eb18
LP
497 return log_oom();
498
499 break;
500 }
7027ff61 501
82adf6af
LP
502 case 'Z':
503 arg_selinux_context = optarg;
a8828ed9
DW
504 break;
505
82adf6af
LP
506 case 'L':
507 arg_selinux_apifs_context = optarg;
a8828ed9
DW
508 break;
509
bc2f673e
LP
510 case ARG_READ_ONLY:
511 arg_read_only = true;
f757855e 512 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
513 break;
514
420c7379
LP
515 case ARG_CAPABILITY:
516 case ARG_DROP_CAPABILITY: {
a2a5291b 517 const char *state, *word;
5076f0cc
LP
518 size_t length;
519
520 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 521 _cleanup_free_ char *t;
5076f0cc
LP
522
523 t = strndup(word, length);
0d0f0c50
SL
524 if (!t)
525 return log_oom();
5076f0cc 526
39ed67d1
LP
527 if (streq(t, "all")) {
528 if (c == ARG_CAPABILITY)
a42c8b54 529 plus = (uint64_t) -1;
39ed67d1 530 else
a42c8b54 531 minus = (uint64_t) -1;
39ed67d1 532 } else {
2822da4f
LP
533 int cap;
534
535 cap = capability_from_name(t);
536 if (cap < 0) {
39ed67d1
LP
537 log_error("Failed to parse capability %s.", t);
538 return -EINVAL;
539 }
540
541 if (c == ARG_CAPABILITY)
a42c8b54 542 plus |= 1ULL << (uint64_t) cap;
39ed67d1 543 else
a42c8b54 544 minus |= 1ULL << (uint64_t) cap;
5076f0cc 545 }
5076f0cc
LP
546 }
547
f757855e 548 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
549 break;
550 }
551
57fb9fb5
LP
552 case 'j':
553 arg_link_journal = LINK_GUEST;
574edc90 554 arg_link_journal_try = true;
57fb9fb5
LP
555 break;
556
557 case ARG_LINK_JOURNAL:
53e438e3 558 if (streq(optarg, "auto")) {
57fb9fb5 559 arg_link_journal = LINK_AUTO;
53e438e3
LP
560 arg_link_journal_try = false;
561 } else if (streq(optarg, "no")) {
57fb9fb5 562 arg_link_journal = LINK_NO;
53e438e3
LP
563 arg_link_journal_try = false;
564 } else if (streq(optarg, "guest")) {
57fb9fb5 565 arg_link_journal = LINK_GUEST;
53e438e3
LP
566 arg_link_journal_try = false;
567 } else if (streq(optarg, "host")) {
57fb9fb5 568 arg_link_journal = LINK_HOST;
53e438e3
LP
569 arg_link_journal_try = false;
570 } else if (streq(optarg, "try-guest")) {
574edc90
MP
571 arg_link_journal = LINK_GUEST;
572 arg_link_journal_try = true;
573 } else if (streq(optarg, "try-host")) {
574 arg_link_journal = LINK_HOST;
575 arg_link_journal_try = true;
576 } else {
57fb9fb5
LP
577 log_error("Failed to parse link journal mode %s", optarg);
578 return -EINVAL;
579 }
580
581 break;
582
17fe0523 583 case ARG_BIND:
f757855e
LP
584 case ARG_BIND_RO:
585 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
586 if (r < 0)
587 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 588
f757855e 589 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 590 break;
06c17c39 591
f757855e
LP
592 case ARG_TMPFS:
593 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
594 if (r < 0)
595 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 596
f757855e 597 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 598 break;
5a8af538
LP
599
600 case ARG_OVERLAY:
601 case ARG_OVERLAY_RO: {
602 _cleanup_free_ char *upper = NULL, *destination = NULL;
603 _cleanup_strv_free_ char **lower = NULL;
604 CustomMount *m;
605 unsigned n = 0;
606 char **i;
607
62f9f39a
RM
608 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
609 if (r == -ENOMEM)
06c17c39 610 return log_oom();
62f9f39a
RM
611 else if (r < 0) {
612 log_error("Invalid overlay specification: %s", optarg);
613 return r;
614 }
06c17c39 615
5a8af538
LP
616 STRV_FOREACH(i, lower) {
617 if (!path_is_absolute(*i)) {
618 log_error("Overlay path %s is not absolute.", *i);
619 return -EINVAL;
620 }
621
622 n++;
623 }
624
625 if (n < 2) {
626 log_error("--overlay= needs at least two colon-separated directories specified.");
627 return -EINVAL;
628 }
629
630 if (n == 2) {
631 /* If two parameters are specified,
632 * the first one is the lower, the
633 * second one the upper directory. And
af86c440
ZJS
634 * we'll also define the destination
635 * mount point the same as the upper. */
5a8af538
LP
636 upper = lower[1];
637 lower[1] = NULL;
638
639 destination = strdup(upper);
640 if (!destination)
641 return log_oom();
642
643 } else {
644 upper = lower[n - 2];
645 destination = lower[n - 1];
646 lower[n - 2] = NULL;
647 }
648
f757855e 649 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
650 if (!m)
651 return log_oom();
652
653 m->destination = destination;
654 m->source = upper;
655 m->lower = lower;
656 m->read_only = c == ARG_OVERLAY_RO;
657
658 upper = destination = NULL;
659 lower = NULL;
06c17c39 660
f757855e 661 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
662 break;
663 }
664
f4889f65
LP
665 case ARG_SETENV: {
666 char **n;
667
668 if (!env_assignment_is_valid(optarg)) {
669 log_error("Environment variable assignment '%s' is not valid.", optarg);
670 return -EINVAL;
671 }
672
673 n = strv_env_set(arg_setenv, optarg);
674 if (!n)
675 return log_oom();
676
677 strv_free(arg_setenv);
678 arg_setenv = n;
f757855e
LP
679
680 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
681 break;
682 }
683
284c0b91
LP
684 case 'q':
685 arg_quiet = true;
686 break;
687
8a96d94e
LP
688 case ARG_SHARE_SYSTEM:
689 arg_share_system = true;
690 break;
691
eb91eb18
LP
692 case ARG_REGISTER:
693 r = parse_boolean(optarg);
694 if (r < 0) {
695 log_error("Failed to parse --register= argument: %s", optarg);
696 return r;
697 }
698
699 arg_register = r;
700 break;
701
89f7c846
LP
702 case ARG_KEEP_UNIT:
703 arg_keep_unit = true;
704 break;
705
6afc95b7
LP
706 case ARG_PERSONALITY:
707
ac45f971 708 arg_personality = personality_from_string(optarg);
050f7277 709 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
710 log_error("Unknown or unsupported personality '%s'.", optarg);
711 return -EINVAL;
712 }
713
f757855e 714 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
715 break;
716
4d9f07b4
LP
717 case ARG_VOLATILE:
718
719 if (!optarg)
f757855e 720 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 721 else {
f757855e 722 VolatileMode m;
4d9f07b4 723
f757855e
LP
724 m = volatile_mode_from_string(optarg);
725 if (m < 0) {
726 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 727 return -EINVAL;
f757855e
LP
728 } else
729 arg_volatile_mode = m;
6d0b55c2
LP
730 }
731
f757855e
LP
732 arg_settings_mask |= SETTING_VOLATILE_MODE;
733 break;
6d0b55c2 734
f757855e
LP
735 case 'p':
736 r = expose_port_parse(&arg_expose_ports, optarg);
737 if (r == -EEXIST)
738 return log_error_errno(r, "Duplicate port specification: %s", optarg);
739 if (r < 0)
740 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 741
f757855e 742 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 743 break;
6d0b55c2 744
f36933fe
LP
745 case ARG_PROPERTY:
746 if (strv_extend(&arg_property, optarg) < 0)
747 return log_oom();
748
749 break;
750
6dac160c
LP
751 case ARG_PRIVATE_USERS:
752 if (optarg) {
753 _cleanup_free_ char *buffer = NULL;
754 const char *range, *shift;
755
756 range = strchr(optarg, ':');
757 if (range) {
758 buffer = strndup(optarg, range - optarg);
759 if (!buffer)
760 return log_oom();
761 shift = buffer;
762
763 range++;
764 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
765 log_error("Failed to parse UID range: %s", range);
766 return -EINVAL;
767 }
768 } else
769 shift = optarg;
770
771 if (parse_uid(shift, &arg_uid_shift) < 0) {
772 log_error("Failed to parse UID: %s", optarg);
773 return -EINVAL;
774 }
775 }
776
777 arg_userns = true;
778 break;
779
c6c8f6e2
LP
780 case ARG_KILL_SIGNAL:
781 arg_kill_signal = signal_from_string_try_harder(optarg);
782 if (arg_kill_signal < 0) {
783 log_error("Cannot parse signal: %s", optarg);
784 return -EINVAL;
785 }
786
f757855e
LP
787 arg_settings_mask |= SETTING_KILL_SIGNAL;
788 break;
789
790 case ARG_SETTINGS:
791
792 /* no → do not read files
793 * yes → read files, do not override cmdline, trust only subset
794 * override → read files, override cmdline, trust only subset
795 * trusted → read files, do not override cmdline, trust all
796 */
797
798 r = parse_boolean(optarg);
799 if (r < 0) {
800 if (streq(optarg, "trusted")) {
801 mask_all_settings = false;
802 mask_no_settings = false;
803 arg_settings_trusted = true;
804
805 } else if (streq(optarg, "override")) {
806 mask_all_settings = false;
807 mask_no_settings = true;
808 arg_settings_trusted = -1;
809 } else
810 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
811 } else if (r > 0) {
812 /* yes */
813 mask_all_settings = false;
814 mask_no_settings = false;
815 arg_settings_trusted = -1;
816 } else {
817 /* no */
818 mask_all_settings = true;
819 mask_no_settings = false;
820 arg_settings_trusted = false;
821 }
822
c6c8f6e2
LP
823 break;
824
88213476
LP
825 case '?':
826 return -EINVAL;
827
828 default:
eb9da376 829 assert_not_reached("Unhandled option");
88213476 830 }
88213476 831
eb91eb18
LP
832 if (arg_share_system)
833 arg_register = false;
834
835 if (arg_boot && arg_share_system) {
836 log_error("--boot and --share-system may not be combined.");
837 return -EINVAL;
838 }
839
89f7c846
LP
840 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
841 log_error("--keep-unit may not be used when invoked from a user session.");
842 return -EINVAL;
843 }
844
1b9e5b12
LP
845 if (arg_directory && arg_image) {
846 log_error("--directory= and --image= may not be combined.");
847 return -EINVAL;
848 }
849
ec16945e
LP
850 if (arg_template && arg_image) {
851 log_error("--template= and --image= may not be combined.");
852 return -EINVAL;
853 }
854
855 if (arg_template && !(arg_directory || arg_machine)) {
856 log_error("--template= needs --directory= or --machine=.");
857 return -EINVAL;
858 }
859
860 if (arg_ephemeral && arg_template) {
861 log_error("--ephemeral and --template= may not be combined.");
862 return -EINVAL;
863 }
864
865 if (arg_ephemeral && arg_image) {
866 log_error("--ephemeral and --image= may not be combined.");
867 return -EINVAL;
868 }
869
df9a75e4
LP
870 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
871 log_error("--ephemeral and --link-journal= may not be combined.");
872 return -EINVAL;
873 }
874
f757855e
LP
875 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
876 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
877
878 if (argc > optind) {
879 arg_parameters = strv_copy(argv + optind);
880 if (!arg_parameters)
881 return log_oom();
882
883 arg_settings_mask |= SETTING_BOOT;
884 }
885
886 /* Load all settings from .nspawn files */
887 if (mask_no_settings)
888 arg_settings_mask = 0;
889
890 /* Don't load any settings from .nspawn files */
891 if (mask_all_settings)
892 arg_settings_mask = _SETTINGS_MASK_ALL;
893
894 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
895
896 r = detect_unified_cgroup_hierarchy();
897 if (r < 0)
898 return r;
899
900 return 1;
901}
902
903static int verify_arguments(void) {
904
905 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
906 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
907 return -EINVAL;
908 }
909
6d0b55c2
LP
910 if (arg_expose_ports && !arg_private_network) {
911 log_error("Cannot use --port= without private networking.");
912 return -EINVAL;
913 }
914
c6c8f6e2
LP
915 if (arg_boot && arg_kill_signal <= 0)
916 arg_kill_signal = SIGRTMIN+3;
917
f757855e 918 return 0;
88213476
LP
919}
920
03cfe0d5
LP
921static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
922 assert(p);
923
924 if (!arg_userns)
925 return 0;
926
927 if (uid == UID_INVALID && gid == GID_INVALID)
928 return 0;
929
930 if (uid != UID_INVALID) {
931 uid += arg_uid_shift;
932
933 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
934 return -EOVERFLOW;
935 }
936
937 if (gid != GID_INVALID) {
938 gid += (gid_t) arg_uid_shift;
939
940 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
941 return -EOVERFLOW;
942 }
943
944 if (lchown(p, uid, gid) < 0)
945 return -errno;
b12afc8c
LP
946
947 return 0;
948}
949
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it was newly created, hands
 * it over to uid/gid via userns_lchown().
 *
 * A directory that already exists is left untouched and counts as success (0). Any other mkdir()
 * failure is returned as -errno. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
962
e58a1277 963static int setup_timezone(const char *dest) {
03cfe0d5
LP
964 _cleanup_free_ char *p = NULL, *q = NULL;
965 const char *where, *check, *what;
d4036145
LP
966 char *z, *y;
967 int r;
f8440af5 968
e58a1277
LP
969 assert(dest);
970
971 /* Fix the timezone, if possible */
d4036145
LP
972 r = readlink_malloc("/etc/localtime", &p);
973 if (r < 0) {
974 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
975 return 0;
976 }
977
978 z = path_startswith(p, "../usr/share/zoneinfo/");
979 if (!z)
980 z = path_startswith(p, "/usr/share/zoneinfo/");
981 if (!z) {
982 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
983 return 0;
984 }
985
03cfe0d5 986 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
987 r = readlink_malloc(where, &q);
988 if (r >= 0) {
989 y = path_startswith(q, "../usr/share/zoneinfo/");
990 if (!y)
991 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 992
d4036145
LP
993 /* Already pointing to the right place? Then do nothing .. */
994 if (y && streq(y, z))
995 return 0;
996 }
997
03cfe0d5
LP
998 check = strjoina("/usr/share/zoneinfo/", z);
999 check = prefix_root(dest, check);
1000 if (laccess(check, F_OK) < 0) {
d4036145
LP
1001 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1002 return 0;
1003 }
68fb0892 1004
79d80fc1
TG
1005 r = unlink(where);
1006 if (r < 0 && errno != ENOENT) {
56f64d95 1007 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1008 return 0;
1009 }
4d9f07b4 1010
03cfe0d5 1011 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1012 if (symlink(what, where) < 0) {
56f64d95 1013 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1014 return 0;
1015 }
e58a1277 1016
03cfe0d5
LP
1017 r = userns_lchown(where, 0, 0);
1018 if (r < 0)
1019 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1020
e58a1277 1021 return 0;
88213476
LP
1022}
1023
2547bb41 1024static int setup_resolv_conf(const char *dest) {
03cfe0d5 1025 const char *where = NULL;
79d80fc1 1026 int r;
2547bb41
LP
1027
1028 assert(dest);
1029
1030 if (arg_private_network)
1031 return 0;
1032
1033 /* Fix resolv.conf, if possible */
03cfe0d5 1034 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1035
f2068bcc 1036 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1037 if (r < 0) {
68a313c5
LP
1038 /* If the file already exists as symlink, let's
1039 * suppress the warning, under the assumption that
1040 * resolved or something similar runs inside and the
1041 * symlink points there.
1042 *
1043 * If the disk image is read-only, there's also no
1044 * point in complaining.
1045 */
1046 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1047 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1048 return 0;
1049 }
2547bb41 1050
03cfe0d5
LP
1051 r = userns_lchown(where, 0, 0);
1052 if (r < 0)
1053 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1054
2547bb41
LP
1055 return 0;
1056}
1057
9f24adc2 1058static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1059 assert(s);
9f24adc2
LP
1060
1061 snprintf(s, 37,
1062 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1063 SD_ID128_FORMAT_VAL(id));
1064
1065 return s;
1066}
1067
04bc4a3f 1068static int setup_boot_id(const char *dest) {
03cfe0d5 1069 const char *from, *to;
39883f62 1070 sd_id128_t rnd = {};
04bc4a3f
LP
1071 char as_uuid[37];
1072 int r;
1073
eb91eb18
LP
1074 if (arg_share_system)
1075 return 0;
1076
04bc4a3f
LP
1077 /* Generate a new randomized boot ID, so that each boot-up of
1078 * the container gets a new one */
1079
03cfe0d5
LP
1080 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1081 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1082
1083 r = sd_id128_randomize(&rnd);
f647962d
MS
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1086
9f24adc2 1087 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1088
4c1fc3e4 1089 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1090 if (r < 0)
1091 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1092
03cfe0d5
LP
1093 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1094 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1095 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1096 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1097
1098 unlink(from);
04bc4a3f
LP
1099 return r;
1100}
1101
e58a1277 1102static int copy_devnodes(const char *dest) {
88213476
LP
1103
1104 static const char devnodes[] =
1105 "null\0"
1106 "zero\0"
1107 "full\0"
1108 "random\0"
1109 "urandom\0"
85614d66
TG
1110 "tty\0"
1111 "net/tun\0";
88213476
LP
1112
1113 const char *d;
e58a1277 1114 int r = 0;
7fd1b19b 1115 _cleanup_umask_ mode_t u;
a258bf26
LP
1116
1117 assert(dest);
124640f1
LP
1118
1119 u = umask(0000);
88213476 1120
03cfe0d5
LP
1121 /* Create /dev/net, so that we can create /dev/net/tun in it */
1122 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1123 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1124
88213476 1125 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1126 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1127 struct stat st;
88213476 1128
7f112f50 1129 from = strappend("/dev/", d);
03cfe0d5 1130 to = prefix_root(dest, from);
88213476
LP
1131
1132 if (stat(from, &st) < 0) {
1133
4a62c710
MS
1134 if (errno != ENOENT)
1135 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1136
a258bf26 1137 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1138
03cfe0d5 1139 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1140 return -EIO;
a258bf26 1141
85614d66 1142 } else {
81f5049b
AC
1143 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144 if (errno != EPERM)
1145 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1146
1147 /* Some systems abusively restrict mknod but
1148 * allow bind mounts. */
1149 r = touch(to);
1150 if (r < 0)
1151 return log_error_errno(r, "touch (%s) failed: %m", to);
1152 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1153 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1154 }
6278cf60 1155
03cfe0d5
LP
1156 r = userns_lchown(to, 0, 0);
1157 if (r < 0)
1158 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1159 }
88213476
LP
1160 }
1161
e58a1277
LP
1162 return r;
1163}
88213476 1164
03cfe0d5
LP
1165static int setup_pts(const char *dest) {
1166 _cleanup_free_ char *options = NULL;
1167 const char *p;
1168
1169#ifdef HAVE_SELINUX
1170 if (arg_selinux_apifs_context)
1171 (void) asprintf(&options,
3dce8915 1172 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1173 arg_uid_shift + TTY_GID,
1174 arg_selinux_apifs_context);
1175 else
1176#endif
1177 (void) asprintf(&options,
3dce8915 1178 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1179 arg_uid_shift + TTY_GID);
f2d88580 1180
03cfe0d5 1181 if (!options)
f2d88580
LP
1182 return log_oom();
1183
03cfe0d5 1184 /* Mount /dev/pts itself */
cc9fce65 1185 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1186 if (mkdir(p, 0755) < 0)
1187 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1188 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1189 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1190 if (userns_lchown(p, 0, 0) < 0)
1191 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1192
1193 /* Create /dev/ptmx symlink */
1194 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1195 if (symlink("pts/ptmx", p) < 0)
1196 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1197 if (userns_lchown(p, 0, 0) < 0)
1198 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1199
03cfe0d5
LP
1200 /* And fix /dev/pts/ptmx ownership */
1201 p = prefix_roota(dest, "/dev/pts/ptmx");
1202 if (userns_lchown(p, 0, 0) < 0)
1203 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1204
f2d88580
LP
1205 return 0;
1206}
1207
e58a1277 1208static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1209 _cleanup_umask_ mode_t u;
1210 const char *to;
e58a1277 1211 int r;
e58a1277
LP
1212
1213 assert(dest);
1214 assert(console);
1215
1216 u = umask(0000);
1217
03cfe0d5 1218 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1219 if (r < 0)
1220 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1221
a258bf26
LP
1222 /* We need to bind mount the right tty to /dev/console since
1223 * ptys can only exist on pts file systems. To have something
81f5049b 1224 * to bind mount things on we create a empty regular file. */
a258bf26 1225
03cfe0d5 1226 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1227 r = touch(to);
1228 if (r < 0)
1229 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1230
4543768d 1231 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1232 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1233
25ea79fe 1234 return 0;
e58a1277
LP
1235}
1236
1237static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1238 const char *from, *to;
7fd1b19b 1239 _cleanup_umask_ mode_t u;
d9603714 1240 int fd, r;
e58a1277 1241
e58a1277 1242 assert(kmsg_socket >= 0);
a258bf26 1243
e58a1277 1244 u = umask(0000);
a258bf26 1245
03cfe0d5 1246 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1247 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1248 * on the reading side behave very similar to /proc/kmsg,
1249 * their writing side behaves differently from /dev/kmsg in
1250 * that writing blocks when nothing is reading. In order to
1251 * avoid any problems with containers deadlocking due to this
1252 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1253 from = prefix_roota(dest, "/run/kmsg");
1254 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1255
4a62c710 1256 if (mkfifo(from, 0600) < 0)
03cfe0d5 1257 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1258 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1259 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1260
1261 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1262 if (fd < 0)
1263 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1264
e58a1277
LP
1265 /* Store away the fd in the socket, so that it stays open as
1266 * long as we run the child */
3ee897d6 1267 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1268 safe_close(fd);
e58a1277 1269
d9603714
DH
1270 if (r < 0)
1271 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1272
03cfe0d5
LP
1273 /* And now make the FIFO unavailable as /run/kmsg... */
1274 (void) unlink(from);
1275
25ea79fe 1276 return 0;
88213476
LP
1277}
1278
1c4baffc 1279static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1280 union in_addr_union *exposed = userdata;
1281
1282 assert(rtnl);
1283 assert(m);
1284 assert(exposed);
1285
7a8f6325 1286 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1287 return 0;
1288}
1289
3a74cea5 1290static int setup_hostname(void) {
3a74cea5 1291
eb91eb18
LP
1292 if (arg_share_system)
1293 return 0;
1294
605f81a8 1295 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1296 return -errno;
3a74cea5 1297
7027ff61 1298 return 0;
3a74cea5
LP
1299}
1300
57fb9fb5 1301static int setup_journal(const char *directory) {
4d680aee 1302 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1303 _cleanup_free_ char *b = NULL, *d = NULL;
1304 const char *etc_machine_id, *p, *q;
27407a01 1305 char *id;
57fb9fb5
LP
1306 int r;
1307
df9a75e4
LP
1308 /* Don't link journals in ephemeral mode */
1309 if (arg_ephemeral)
1310 return 0;
1311
03cfe0d5 1312 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1313
03cfe0d5 1314 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1315 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1316 return 0;
f647962d 1317 else if (r < 0)
03cfe0d5 1318 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1319
27407a01
ZJS
1320 id = strstrip(b);
1321 if (isempty(id) && arg_link_journal == LINK_AUTO)
1322 return 0;
57fb9fb5 1323
27407a01
ZJS
1324 /* Verify validity */
1325 r = sd_id128_from_string(id, &machine_id);
f647962d 1326 if (r < 0)
03cfe0d5 1327 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1328
4d680aee 1329 r = sd_id128_get_machine(&this_id);
f647962d
MS
1330 if (r < 0)
1331 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1332
1333 if (sd_id128_equal(machine_id, this_id)) {
1334 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335 "Host and machine ids are equal (%s): refusing to link journals", id);
1336 if (arg_link_journal == LINK_AUTO)
1337 return 0;
df9a75e4 1338 return -EEXIST;
4d680aee
ZJS
1339 }
1340
1341 if (arg_link_journal == LINK_NO)
1342 return 0;
1343
03cfe0d5
LP
1344 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1345 if (r < 0)
1346 return log_error_errno(r, "Failed to create /var: %m");
1347
1348 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1349 if (r < 0)
1350 return log_error_errno(r, "Failed to create /var/log: %m");
1351
1352 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1353 if (r < 0)
1354 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1355
1356 p = strjoina("/var/log/journal/", id);
1357 q = prefix_roota(directory, p);
27407a01 1358
e26d6ce5 1359 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1360 if (arg_link_journal != LINK_AUTO) {
1361 log_error("%s: already a mount point, refusing to use for journal", p);
1362 return -EEXIST;
1363 }
1364
1365 return 0;
57fb9fb5
LP
1366 }
1367
e26d6ce5 1368 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1369 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1370 log_error("%s: already a mount point, refusing to use for journal", q);
1371 return -EEXIST;
57fb9fb5
LP
1372 }
1373
27407a01 1374 return 0;
57fb9fb5
LP
1375 }
1376
1377 r = readlink_and_make_absolute(p, &d);
1378 if (r >= 0) {
1379 if ((arg_link_journal == LINK_GUEST ||
1380 arg_link_journal == LINK_AUTO) &&
1381 path_equal(d, q)) {
1382
03cfe0d5 1383 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1384 if (r < 0)
56f64d95 1385 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1386 return 0;
57fb9fb5
LP
1387 }
1388
4a62c710
MS
1389 if (unlink(p) < 0)
1390 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1391 } else if (r == -EINVAL) {
1392
1393 if (arg_link_journal == LINK_GUEST &&
1394 rmdir(p) < 0) {
1395
27407a01
ZJS
1396 if (errno == ENOTDIR) {
1397 log_error("%s already exists and is neither a symlink nor a directory", p);
1398 return r;
1399 } else {
56f64d95 1400 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1401 return -errno;
57fb9fb5 1402 }
57fb9fb5
LP
1403 }
1404 } else if (r != -ENOENT) {
56f64d95 1405 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1406 return r;
57fb9fb5
LP
1407 }
1408
1409 if (arg_link_journal == LINK_GUEST) {
1410
1411 if (symlink(q, p) < 0) {
574edc90 1412 if (arg_link_journal_try) {
56f64d95 1413 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1414 return 0;
1415 } else {
56f64d95 1416 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1417 return -errno;
1418 }
57fb9fb5
LP
1419 }
1420
03cfe0d5 1421 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1422 if (r < 0)
56f64d95 1423 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1424 return 0;
57fb9fb5
LP
1425 }
1426
1427 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1428 /* don't create parents here -- if the host doesn't have
1429 * permanent journal set up, don't force it here */
1430 r = mkdir(p, 0755);
57fb9fb5 1431 if (r < 0) {
574edc90 1432 if (arg_link_journal_try) {
56f64d95 1433 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1434 return 0;
1435 } else {
56f64d95 1436 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1437 return r;
1438 }
57fb9fb5
LP
1439 }
1440
27407a01
ZJS
1441 } else if (access(p, F_OK) < 0)
1442 return 0;
57fb9fb5 1443
cdb2b9d0
LP
1444 if (dir_is_empty(q) == 0)
1445 log_warning("%s is not empty, proceeding anyway.", q);
1446
03cfe0d5 1447 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 1448 if (r < 0) {
56f64d95 1449 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1450 return r;
57fb9fb5
LP
1451 }
1452
4543768d 1453 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1454 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1455
27407a01 1456 return 0;
57fb9fb5
LP
1457}
1458
88213476 1459static int drop_capabilities(void) {
5076f0cc 1460 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1461}
1462
db999e0f
LP
1463static int reset_audit_loginuid(void) {
1464 _cleanup_free_ char *p = NULL;
1465 int r;
1466
1467 if (arg_share_system)
1468 return 0;
1469
1470 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1471 if (r == -ENOENT)
db999e0f 1472 return 0;
f647962d
MS
1473 if (r < 0)
1474 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1475
1476 /* Already reset? */
1477 if (streq(p, "4294967295"))
1478 return 0;
1479
ad118bda 1480 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1481 if (r < 0) {
10a87006
LP
1482 log_error_errno(r,
1483 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1484 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1485 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1486 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1487 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1488
db999e0f 1489 sleep(5);
77b6e194 1490 }
db999e0f
LP
1491
1492 return 0;
77b6e194
LP
1493}
1494
/* Installs the container's seccomp filter (no-op when built without
 * HAVE_SECCOMP). The filter allows everything by default and then
 *
 *   - makes a fixed list of syscalls fail with EPERM, each one only if the
 *     capability it is paired with is not in arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success or a negative errno-style code. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated); k++) {
                /* Leave the syscall alone if its capability is retained */
                if (arg_retain & (1ULL << gated[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated[k].syscall_num, 0);

                /* -EFAULT means unknown syscall, which is skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Not fatal: carry on without a filter */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1589
785890ac
LP
1590static int setup_propagate(const char *root) {
1591 const char *p, *q;
1592
1593 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1594 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1595 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1596 (void) mkdir_p(p, 0600);
1597
03cfe0d5
LP
1598 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1599 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1600
1601 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1602 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1603
1604 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1605 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1606
03cfe0d5 1607 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1608 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1609 return log_error_errno(errno, "Failed to install propagation bind mount.");
1610
1611 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1612 return log_error_errno(errno, "Failed to make propagation mount read-only");
1613
1614 return 0;
1615}
1616
1b9e5b12
LP
1617static int setup_image(char **device_path, int *loop_nr) {
1618 struct loop_info64 info = {
1619 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1620 };
1621 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1622 _cleanup_free_ char* loopdev = NULL;
1623 struct stat st;
1624 int r, nr;
1625
1626 assert(device_path);
1627 assert(loop_nr);
ec16945e 1628 assert(arg_image);
1b9e5b12
LP
1629
1630 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1631 if (fd < 0)
1632 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1633
4a62c710
MS
1634 if (fstat(fd, &st) < 0)
1635 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1636
1637 if (S_ISBLK(st.st_mode)) {
1638 char *p;
1639
1640 p = strdup(arg_image);
1641 if (!p)
1642 return log_oom();
1643
1644 *device_path = p;
1645
1646 *loop_nr = -1;
1647
1648 r = fd;
1649 fd = -1;
1650
1651 return r;
1652 }
1653
1654 if (!S_ISREG(st.st_mode)) {
56f64d95 1655 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
1656 return -EINVAL;
1657 }
1658
1659 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1660 if (control < 0)
1661 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1662
1663 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1664 if (nr < 0)
1665 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1666
1667 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1668 return log_oom();
1669
1670 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1671 if (loop < 0)
1672 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1673
4a62c710
MS
1674 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1675 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1676
1677 if (arg_read_only)
1678 info.lo_flags |= LO_FLAGS_READ_ONLY;
1679
4a62c710
MS
1680 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1681 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1682
1683 *device_path = loopdev;
1684 loopdev = NULL;
1685
1686 *loop_nr = nr;
1687
1688 r = loop;
1689 loop = -1;
1690
1691 return r;
1692}
1693
ada4799a
LP
/* Explanatory text appended to the error messages about unusable partition
 * tables; describes which disk image layouts are accepted. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"    \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"        \
        "to be bootable with systemd-nspawn."
1700
1b9e5b12
LP
1701static int dissect_image(
1702 int fd,
727fd4fd
LP
1703 char **root_device, bool *root_device_rw,
1704 char **home_device, bool *home_device_rw,
1705 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1706 bool *secondary) {
1707
1708#ifdef HAVE_BLKID
01dc33ce
ZJS
1709 int home_nr = -1, srv_nr = -1;
1710#ifdef GPT_ROOT_NATIVE
1711 int root_nr = -1;
1712#endif
1713#ifdef GPT_ROOT_SECONDARY
1714 int secondary_root_nr = -1;
1715#endif
f6c51a81 1716 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1717 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1718 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1719 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1720 _cleanup_udev_unref_ struct udev *udev = NULL;
1721 struct udev_list_entry *first, *item;
f6c51a81 1722 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1723 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1724 const char *pttype = NULL;
1725 blkid_partlist pl;
1726 struct stat st;
c09ef2e4 1727 unsigned i;
1b9e5b12
LP
1728 int r;
1729
1730 assert(fd >= 0);
1731 assert(root_device);
1732 assert(home_device);
1733 assert(srv_device);
1734 assert(secondary);
ec16945e 1735 assert(arg_image);
1b9e5b12
LP
1736
1737 b = blkid_new_probe();
1738 if (!b)
1739 return log_oom();
1740
1741 errno = 0;
1742 r = blkid_probe_set_device(b, fd, 0, 0);
1743 if (r != 0) {
1744 if (errno == 0)
1745 return log_oom();
1746
56f64d95 1747 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1748 return -errno;
1749 }
1750
1751 blkid_probe_enable_partitions(b, 1);
1752 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1753
1754 errno = 0;
1755 r = blkid_do_safeprobe(b);
1756 if (r == -2 || r == 1) {
ada4799a
LP
1757 log_error("Failed to identify any partition table on\n"
1758 " %s\n"
1759 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1760 return -EINVAL;
1761 } else if (r != 0) {
1762 if (errno == 0)
1763 errno = EIO;
56f64d95 1764 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1765 return -errno;
1766 }
1767
48861960 1768 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1769
1770 is_gpt = streq_ptr(pttype, "gpt");
1771 is_mbr = streq_ptr(pttype, "dos");
1772
1773 if (!is_gpt && !is_mbr) {
1774 log_error("No GPT or MBR partition table discovered on\n"
1775 " %s\n"
1776 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1777 return -EINVAL;
1778 }
1779
1780 errno = 0;
1781 pl = blkid_probe_get_partitions(b);
1782 if (!pl) {
1783 if (errno == 0)
1784 return log_oom();
1785
1786 log_error("Failed to list partitions of %s", arg_image);
1787 return -errno;
1788 }
1789
1790 udev = udev_new();
1791 if (!udev)
1792 return log_oom();
1793
4a62c710
MS
1794 if (fstat(fd, &st) < 0)
1795 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1796
c09ef2e4
LP
1797 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1798 if (!d)
1b9e5b12
LP
1799 return log_oom();
1800
c09ef2e4
LP
1801 for (i = 0;; i++) {
1802 int n, m;
1b9e5b12 1803
c09ef2e4
LP
1804 if (i >= 10) {
1805 log_error("Kernel partitions never appeared.");
1806 return -ENXIO;
1807 }
1808
1809 e = udev_enumerate_new(udev);
1810 if (!e)
1811 return log_oom();
1812
1813 r = udev_enumerate_add_match_parent(e, d);
1814 if (r < 0)
1815 return log_oom();
1816
1817 r = udev_enumerate_scan_devices(e);
1818 if (r < 0)
1819 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1820
1821 /* Count the partitions enumerated by the kernel */
1822 n = 0;
1823 first = udev_enumerate_get_list_entry(e);
1824 udev_list_entry_foreach(item, first)
1825 n++;
1826
1827 /* Count the partitions enumerated by blkid */
1828 m = blkid_partlist_numof_partitions(pl);
1829 if (n == m + 1)
1830 break;
1831 if (n > m + 1) {
1832 log_error("blkid and kernel partition list do not match.");
1833 return -EIO;
1834 }
1835 if (n < m + 1) {
1836 unsigned j;
1837
1838 /* The kernel has probed fewer partitions than
1839 * blkid? Maybe the kernel prober is still
1840 * running or it got EBUSY because udev
1841 * already opened the device. Let's reprobe
1842 * the device, which is a synchronous call
1843 * that waits until probing is complete. */
1844
1845 for (j = 0; j < 20; j++) {
1846
1847 r = ioctl(fd, BLKRRPART, 0);
1848 if (r < 0)
1849 r = -errno;
1850 if (r >= 0 || r != -EBUSY)
1851 break;
1852
1853 /* If something else has the device
1854 * open, such as an udev rule, the
1855 * ioctl will return EBUSY. Since
1856 * there's no way to wait until it
1857 * isn't busy anymore, let's just wait
1858 * a bit, and try again.
1859 *
1860 * This is really something they
1861 * should fix in the kernel! */
1862
1863 usleep(50 * USEC_PER_MSEC);
1864 }
1865
1866 if (r < 0)
1867 return log_error_errno(r, "Failed to reread partition table: %m");
1868 }
1869
1870 e = udev_enumerate_unref(e);
1871 }
1b9e5b12
LP
1872
1873 first = udev_enumerate_get_list_entry(e);
1874 udev_list_entry_foreach(item, first) {
1875 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1876 const char *node;
727fd4fd 1877 unsigned long long flags;
1b9e5b12
LP
1878 blkid_partition pp;
1879 dev_t qn;
1880 int nr;
1881
1882 errno = 0;
1883 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1884 if (!q) {
1885 if (!errno)
1886 errno = ENOMEM;
1887
56f64d95 1888 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1889 return -errno;
1890 }
1891
1892 qn = udev_device_get_devnum(q);
1893 if (major(qn) == 0)
1894 continue;
1895
1896 if (st.st_rdev == qn)
1897 continue;
1898
1899 node = udev_device_get_devnode(q);
1900 if (!node)
1901 continue;
1902
1903 pp = blkid_partlist_devno_to_partition(pl, qn);
1904 if (!pp)
1905 continue;
1906
727fd4fd 1907 flags = blkid_partition_get_flags(pp);
727fd4fd 1908
1b9e5b12
LP
1909 nr = blkid_partition_get_partno(pp);
1910 if (nr < 0)
1911 continue;
1912
ada4799a
LP
1913 if (is_gpt) {
1914 sd_id128_t type_id;
1915 const char *stype;
1b9e5b12 1916
f6c51a81
LP
1917 if (flags & GPT_FLAG_NO_AUTO)
1918 continue;
1919
ada4799a
LP
1920 stype = blkid_partition_get_type_string(pp);
1921 if (!stype)
1922 continue;
1b9e5b12 1923
ada4799a 1924 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1925 continue;
1926
ada4799a 1927 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1928
ada4799a
LP
1929 if (home && nr >= home_nr)
1930 continue;
1b9e5b12 1931
ada4799a
LP
1932 home_nr = nr;
1933 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1934
ada4799a
LP
1935 r = free_and_strdup(&home, node);
1936 if (r < 0)
1937 return log_oom();
727fd4fd 1938
ada4799a
LP
1939 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1940
1941 if (srv && nr >= srv_nr)
1942 continue;
1943
1944 srv_nr = nr;
1945 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1946
1947 r = free_and_strdup(&srv, node);
1948 if (r < 0)
1949 return log_oom();
1950 }
1b9e5b12 1951#ifdef GPT_ROOT_NATIVE
ada4799a 1952 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1953
ada4799a
LP
1954 if (root && nr >= root_nr)
1955 continue;
1b9e5b12 1956
ada4799a
LP
1957 root_nr = nr;
1958 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1959
ada4799a
LP
1960 r = free_and_strdup(&root, node);
1961 if (r < 0)
1962 return log_oom();
1963 }
1b9e5b12
LP
1964#endif
1965#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1966 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1967
1968 if (secondary_root && nr >= secondary_root_nr)
1969 continue;
1970
1971 secondary_root_nr = nr;
1972 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1973
1974 r = free_and_strdup(&secondary_root, node);
1975 if (r < 0)
1976 return log_oom();
1977 }
1978#endif
f6c51a81
LP
1979 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
1980
1981 if (generic)
1982 multiple_generic = true;
1983 else {
1984 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
1985
1986 r = free_and_strdup(&generic, node);
1987 if (r < 0)
1988 return log_oom();
1989 }
1990 }
ada4799a
LP
1991
1992 } else if (is_mbr) {
1993 int type;
1b9e5b12 1994
f6c51a81
LP
1995 if (flags != 0x80) /* Bootable flag */
1996 continue;
1997
ada4799a
LP
1998 type = blkid_partition_get_type(pp);
1999 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2000 continue;
2001
f6c51a81
LP
2002 if (generic)
2003 multiple_generic = true;
2004 else {
2005 generic_rw = true;
727fd4fd 2006
f6c51a81
LP
2007 r = free_and_strdup(&root, node);
2008 if (r < 0)
2009 return log_oom();
2010 }
1b9e5b12 2011 }
1b9e5b12
LP
2012 }
2013
1b9e5b12
LP
2014 if (root) {
2015 *root_device = root;
2016 root = NULL;
727fd4fd
LP
2017
2018 *root_device_rw = root_rw;
1b9e5b12
LP
2019 *secondary = false;
2020 } else if (secondary_root) {
2021 *root_device = secondary_root;
2022 secondary_root = NULL;
727fd4fd
LP
2023
2024 *root_device_rw = secondary_root_rw;
1b9e5b12 2025 *secondary = true;
f6c51a81
LP
2026 } else if (generic) {
2027
2028 /* There were no partitions with precise meanings
2029 * around, but we found generic partitions. In this
2030 * case, if there's only one, we can go ahead and boot
2031 * it, otherwise we bail out, because we really cannot
2032 * make any sense of it. */
2033
2034 if (multiple_generic) {
2035 log_error("Identified multiple bootable Linux partitions on\n"
2036 " %s\n"
2037 PARTITION_TABLE_BLURB, arg_image);
2038 return -EINVAL;
2039 }
2040
2041 *root_device = generic;
2042 generic = NULL;
2043
2044 *root_device_rw = generic_rw;
2045 *secondary = false;
2046 } else {
2047 log_error("Failed to identify root partition in disk image\n"
2048 " %s\n"
2049 PARTITION_TABLE_BLURB, arg_image);
2050 return -EINVAL;
1b9e5b12
LP
2051 }
2052
2053 if (home) {
2054 *home_device = home;
2055 home = NULL;
727fd4fd
LP
2056
2057 *home_device_rw = home_rw;
1b9e5b12
LP
2058 }
2059
2060 if (srv) {
2061 *srv_device = srv;
2062 srv = NULL;
727fd4fd
LP
2063
2064 *srv_device_rw = srv_rw;
1b9e5b12
LP
2065 }
2066
2067 return 0;
2068#else
2069 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2070 return -EOPNOTSUPP;
1b9e5b12
LP
2071#endif
2072}
2073
/* Probes the file system type of block device 'what' with libblkid and
 * mounts it on 'where' (with 'directory' appended, if that is non-NULL).
 *
 * The mount is read-only if 'rw' is false or if arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only mode overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. In both
                 * cases probing worked, but gave us no usable type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Everything else (-1) is a real probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, so that it
                 * cannot be affected by the log call */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2137
727fd4fd
LP
2138static int mount_devices(
2139 const char *where,
2140 const char *root_device, bool root_device_rw,
2141 const char *home_device, bool home_device_rw,
2142 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2143 int r;
2144
2145 assert(where);
2146
2147 if (root_device) {
727fd4fd 2148 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2149 if (r < 0)
2150 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2151 }
2152
2153 if (home_device) {
727fd4fd 2154 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2155 if (r < 0)
2156 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2157 }
2158
2159 if (srv_device) {
727fd4fd 2160 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2161 if (r < 0)
2162 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2163 }
2164
2165 return 0;
2166}
2167
2168static void loop_remove(int nr, int *image_fd) {
2169 _cleanup_close_ int control = -1;
e8c8ddcc 2170 int r;
1b9e5b12
LP
2171
2172 if (nr < 0)
2173 return;
2174
2175 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2176 r = ioctl(*image_fd, LOOP_CLR_FD);
2177 if (r < 0)
5e4074aa 2178 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2179 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2180 }
2181
2182 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2183 if (control < 0) {
56f64d95 2184 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2185 return;
e8c8ddcc 2186 }
1b9e5b12 2187
e8c8ddcc
TG
2188 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2189 if (r < 0)
5e4074aa 2190 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2191}
2192
113cea80 2193/*
6d416b9c
LS
2194 * Return values:
2195 * < 0 : wait_for_terminate() failed to get the state of the
2196 * container, the container was terminated by a signal, or
2197 * failed for an unknown reason. No change is made to the
2198 * container argument.
2199 * > 0 : The program executed in the container terminated with an
2200 * error. The exit code of the program executed in the
919699ec
LP
2201 * container is returned. The container argument has been set
2202 * to CONTAINER_TERMINATED.
6d416b9c
LS
2203 * 0 : The container is being rebooted, has been shut down or exited
2204 * successfully. The container argument has been set to either
2205 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2206 *
6d416b9c
LS
2207 * That is, success is indicated by a return value of zero, and an
2208 * error is indicated by a non-zero value.
113cea80
DH
2209 */
2210static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2211 siginfo_t status;
919699ec 2212 int r;
113cea80
DH
2213
2214 r = wait_for_terminate(pid, &status);
f647962d
MS
2215 if (r < 0)
2216 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2217
2218 switch (status.si_code) {
fddbb89c 2219
113cea80 2220 case CLD_EXITED:
919699ec
LP
2221 if (status.si_status == 0) {
2222 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2223
fddbb89c 2224 } else
919699ec 2225 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2226
919699ec
LP
2227 *container = CONTAINER_TERMINATED;
2228 return status.si_status;
113cea80
DH
2229
2230 case CLD_KILLED:
2231 if (status.si_status == SIGINT) {
113cea80 2232
919699ec 2233 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2234 *container = CONTAINER_TERMINATED;
919699ec
LP
2235 return 0;
2236
113cea80 2237 } else if (status.si_status == SIGHUP) {
113cea80 2238
919699ec 2239 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2240 *container = CONTAINER_REBOOTED;
919699ec 2241 return 0;
113cea80 2242 }
919699ec 2243
113cea80
DH
2244 /* CLD_KILLED fallthrough */
2245
2246 case CLD_DUMPED:
fddbb89c 2247 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2248 return -EIO;
113cea80
DH
2249
2250 default:
fddbb89c 2251 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2252 return -EIO;
113cea80
DH
2253 }
2254
2255 return r;
2256}
2257
023fb90b
LP
2258static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2259 pid_t pid;
2260
2261 pid = PTR_TO_UINT32(userdata);
2262 if (pid > 0) {
c6c8f6e2 2263 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2264 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2265 sd_event_source_set_userdata(s, NULL);
2266 return 0;
2267 }
2268 }
2269
2270 sd_event_exit(sd_event_source_get_event(s), 0);
2271 return 0;
2272}
2273
ec16945e 2274static int determine_names(void) {
1b9cebf6 2275 int r;
ec16945e 2276
c1521918
LP
2277 if (arg_template && !arg_directory && arg_machine) {
2278
2279 /* If --template= was specified then we should not
2280 * search for a machine, but instead create a new one
2281 * in /var/lib/machine. */
2282
2283 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2284 if (!arg_directory)
2285 return log_oom();
2286 }
2287
ec16945e 2288 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2289 if (arg_machine) {
2290 _cleanup_(image_unrefp) Image *i = NULL;
2291
2292 r = image_find(arg_machine, &i);
2293 if (r < 0)
2294 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2295 else if (r == 0) {
2296 log_error("No image for machine '%s': %m", arg_machine);
2297 return -ENOENT;
2298 }
2299
aceac2f0 2300 if (i->type == IMAGE_RAW)
0f03c2a4 2301 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2302 else
0f03c2a4 2303 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2304 if (r < 0)
2305 return log_error_errno(r, "Invalid image directory: %m");
2306
aee327b8
LP
2307 if (!arg_ephemeral)
2308 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2309 } else
ec16945e
LP
2310 arg_directory = get_current_dir_name();
2311
1b9cebf6
LP
2312 if (!arg_directory && !arg_machine) {
2313 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2314 return -EINVAL;
2315 }
2316 }
2317
2318 if (!arg_machine) {
b9ba4dab
LP
2319 if (arg_directory && path_equal(arg_directory, "/"))
2320 arg_machine = gethostname_malloc();
2321 else
2322 arg_machine = strdup(basename(arg_image ?: arg_directory));
2323
ec16945e
LP
2324 if (!arg_machine)
2325 return log_oom();
2326
ae691c1d 2327 hostname_cleanup(arg_machine);
ec16945e
LP
2328 if (!machine_name_is_valid(arg_machine)) {
2329 log_error("Failed to determine machine name automatically, please use -M.");
2330 return -EINVAL;
2331 }
b9ba4dab
LP
2332
2333 if (arg_ephemeral) {
2334 char *b;
2335
2336 /* Add a random suffix when this is an
2337 * ephemeral machine, so that we can run many
2338 * instances at once without manually having
2339 * to specify -M each time. */
2340
2341 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2342 return log_oom();
2343
2344 free(arg_machine);
2345 arg_machine = b;
2346 }
ec16945e
LP
2347 }
2348
2349 return 0;
2350}
2351
03cfe0d5 2352static int determine_uid_shift(const char *directory) {
6dac160c
LP
2353 int r;
2354
03cfe0d5
LP
2355 if (!arg_userns) {
2356 arg_uid_shift = 0;
6dac160c 2357 return 0;
03cfe0d5 2358 }
6dac160c
LP
2359
2360 if (arg_uid_shift == UID_INVALID) {
2361 struct stat st;
2362
03cfe0d5 2363 r = stat(directory, &st);
6dac160c 2364 if (r < 0)
03cfe0d5 2365 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2366
2367 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2368
2369 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2370 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2371 return -EINVAL;
2372 }
2373
2374 arg_uid_range = UINT32_C(0x10000);
2375 }
2376
2377 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2378 log_error("UID base too high for UID range.");
2379 return -EINVAL;
2380 }
2381
2382 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2383 return 0;
2384}
2385
03cfe0d5
LP
2386static int inner_child(
2387 Barrier *barrier,
2388 const char *directory,
2389 bool secondary,
2390 int kmsg_socket,
2391 int rtnl_socket,
f757855e 2392 FDSet *fds) {
69c79d3c 2393
03cfe0d5
LP
2394 _cleanup_free_ char *home = NULL;
2395 unsigned n_env = 2;
2396 const char *envp[] = {
2397 "PATH=" DEFAULT_PATH_SPLIT_USR,
2398 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2399 NULL, /* TERM */
2400 NULL, /* HOME */
2401 NULL, /* USER */
2402 NULL, /* LOGNAME */
2403 NULL, /* container_uuid */
2404 NULL, /* LISTEN_FDS */
2405 NULL, /* LISTEN_PID */
2406 NULL
2407 };
88213476 2408
2371271c 2409 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2410 int r;
88213476 2411
03cfe0d5
LP
2412 assert(barrier);
2413 assert(directory);
2414 assert(kmsg_socket >= 0);
88213476 2415
efdb0237
LP
2416 cg_unified_flush();
2417
03cfe0d5
LP
2418 if (arg_userns) {
2419 /* Tell the parent, that it now can write the UID map. */
2420 (void) barrier_place(barrier); /* #1 */
7027ff61 2421
03cfe0d5
LP
2422 /* Wait until the parent wrote the UID map */
2423 if (!barrier_place_and_sync(barrier)) { /* #2 */
2424 log_error("Parent died too early");
2425 return -ESRCH;
2426 }
88213476
LP
2427 }
2428
d1678248 2429 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2430 if (r < 0)
2431 return r;
2432
d8fc6a00
LP
2433 r = mount_sysfs(NULL);
2434 if (r < 0)
2435 return r;
2436
03cfe0d5
LP
2437 /* Wait until we are cgroup-ified, so that we
2438 * can mount the right cgroup path writable */
2439 if (!barrier_place_and_sync(barrier)) { /* #3 */
2440 log_error("Parent died too early");
2441 return -ESRCH;
88213476
LP
2442 }
2443
e83bebef 2444 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2445 if (r < 0)
2446 return r;
ec16945e 2447
03cfe0d5
LP
2448 r = reset_uid_gid();
2449 if (r < 0)
2450 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2451
03cfe0d5
LP
2452 r = setup_boot_id(NULL);
2453 if (r < 0)
2454 return r;
ec16945e 2455
03cfe0d5
LP
2456 r = setup_kmsg(NULL, kmsg_socket);
2457 if (r < 0)
2458 return r;
2459 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2460
03cfe0d5 2461 umask(0022);
30535c16 2462
03cfe0d5
LP
2463 if (setsid() < 0)
2464 return log_error_errno(errno, "setsid() failed: %m");
2465
2466 if (arg_private_network)
2467 loopback_setup();
2468
7a8f6325
LP
2469 if (arg_expose_ports) {
2470 r = expose_port_send_rtnl(rtnl_socket);
2471 if (r < 0)
2472 return r;
2473 rtnl_socket = safe_close(rtnl_socket);
2474 }
03cfe0d5
LP
2475
2476 if (drop_capabilities() < 0)
2477 return log_error_errno(errno, "drop_capabilities() failed: %m");
2478
2479 setup_hostname();
2480
050f7277 2481 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2482 if (personality(arg_personality) < 0)
2483 return log_error_errno(errno, "personality() failed: %m");
2484 } else if (secondary) {
2485 if (personality(PER_LINUX32) < 0)
2486 return log_error_errno(errno, "personality() failed: %m");
2487 }
2488
2489#ifdef HAVE_SELINUX
2490 if (arg_selinux_context)
2491 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2492 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2493#endif
2494
ee645080 2495 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2496 if (r < 0)
2497 return r;
2498
2499 envp[n_env] = strv_find_prefix(environ, "TERM=");
2500 if (envp[n_env])
2501 n_env ++;
2502
2503 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2504 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2505 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2506 return log_oom();
2507
2508 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2509 char as_uuid[37];
2510
2511 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2512 return log_oom();
2513 }
2514
2515 if (fdset_size(fds) > 0) {
2516 r = fdset_cloexec(fds, false);
2517 if (r < 0)
2518 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2519
2520 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2521 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2522 return log_oom();
2523 }
2524
2371271c
TG
2525 env_use = strv_env_merge(2, envp, arg_setenv);
2526 if (!env_use)
2527 return log_oom();
03cfe0d5
LP
2528
2529 /* Let the parent know that we are ready and
2530 * wait until the parent is ready with the
2531 * setup, too... */
2532 if (!barrier_place_and_sync(barrier)) { /* #4 */
2533 log_error("Parent died too early");
2534 return -ESRCH;
2535 }
2536
2537 /* Now, explicitly close the log, so that we
2538 * then can close all remaining fds. Closing
2539 * the log explicitly first has the benefit
2540 * that the logging subsystem knows about it,
2541 * and is thus ready to be reopened should we
2542 * need it again. Note that the other fds
2543 * closed here are at least the locking and
2544 * barrier fds. */
2545 log_close();
2546 (void) fdset_close_others(fds);
2547
2548 if (arg_boot) {
2549 char **a;
2550 size_t m;
2551
2552 /* Automatically search for the init system */
2553
f757855e 2554 m = 1 + strv_length(arg_parameters);
03cfe0d5 2555 a = newa(char*, m + 1);
f757855e
LP
2556 if (strv_isempty(arg_parameters))
2557 a[1] = NULL;
2558 else
2559 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2560
2561 a[0] = (char*) "/usr/lib/systemd/systemd";
2562 execve(a[0], a, env_use);
2563
2564 a[0] = (char*) "/lib/systemd/systemd";
2565 execve(a[0], a, env_use);
2566
2567 a[0] = (char*) "/sbin/init";
2568 execve(a[0], a, env_use);
f757855e
LP
2569 } else if (!strv_isempty(arg_parameters))
2570 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2571 else {
f757855e 2572 chdir(home ?: "/root");
03cfe0d5
LP
2573 execle("/bin/bash", "-bash", NULL, env_use);
2574 execle("/bin/sh", "-sh", NULL, env_use);
2575 }
2576
2577 (void) log_open();
2578 return log_error_errno(errno, "execv() failed: %m");
2579}
2580
2581static int outer_child(
2582 Barrier *barrier,
2583 const char *directory,
2584 const char *console,
2585 const char *root_device, bool root_device_rw,
2586 const char *home_device, bool home_device_rw,
2587 const char *srv_device, bool srv_device_rw,
2588 bool interactive,
2589 bool secondary,
2590 int pid_socket,
2591 int kmsg_socket,
2592 int rtnl_socket,
825d5287 2593 int uid_shift_socket,
f757855e 2594 FDSet *fds) {
03cfe0d5
LP
2595
2596 pid_t pid;
2597 ssize_t l;
2598 int r;
2599
2600 assert(barrier);
2601 assert(directory);
2602 assert(console);
2603 assert(pid_socket >= 0);
2604 assert(kmsg_socket >= 0);
2605
efdb0237
LP
2606 cg_unified_flush();
2607
03cfe0d5
LP
2608 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2609 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2610
2611 if (interactive) {
2612 close_nointr(STDIN_FILENO);
2613 close_nointr(STDOUT_FILENO);
2614 close_nointr(STDERR_FILENO);
2615
2616 r = open_terminal(console, O_RDWR);
2617 if (r != STDIN_FILENO) {
2618 if (r >= 0) {
2619 safe_close(r);
2620 r = -EINVAL;
2621 }
2622
2623 return log_error_errno(r, "Failed to open console: %m");
2624 }
2625
2626 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2627 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2628 return log_error_errno(errno, "Failed to duplicate console: %m");
2629 }
2630
2631 r = reset_audit_loginuid();
2632 if (r < 0)
2633 return r;
2634
2635 /* Mark everything as slave, so that we still
2636 * receive mounts from the real root, but don't
2637 * propagate mounts to the real root. */
2638 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2639 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2640
2641 r = mount_devices(directory,
2642 root_device, root_device_rw,
2643 home_device, home_device_rw,
2644 srv_device, srv_device_rw);
2645 if (r < 0)
2646 return r;
2647
391567f4
LP
2648 r = determine_uid_shift(directory);
2649 if (r < 0)
2650 return r;
2651
825d5287
RM
2652 if (arg_userns) {
2653 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2654 if (l < 0)
2655 return log_error_errno(errno, "Failed to send UID shift: %m");
2656 if (l != sizeof(arg_uid_shift)) {
2657 log_error("Short write while sending UID shift.");
2658 return -EIO;
2659 }
2660 }
2661
03cfe0d5
LP
2662 /* Turn directory into bind mount */
2663 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2664 return log_error_errno(errno, "Failed to make bind mount: %m");
2665
e83bebef 2666 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2667 if (r < 0)
2668 return r;
2669
e83bebef 2670 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2671 if (r < 0)
2672 return r;
2673
03cfe0d5
LP
2674 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2675 if (r < 0)
2676 return r;
2677
03cfe0d5
LP
2678 if (arg_read_only) {
2679 r = bind_remount_recursive(directory, true);
2680 if (r < 0)
2681 return log_error_errno(r, "Failed to make tree read-only: %m");
2682 }
2683
d1678248 2684 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2685 if (r < 0)
2686 return r;
2687
07fa00f9
LP
2688 r = copy_devnodes(directory);
2689 if (r < 0)
03cfe0d5
LP
2690 return r;
2691
2692 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2693
07fa00f9
LP
2694 r = setup_pts(directory);
2695 if (r < 0)
03cfe0d5
LP
2696 return r;
2697
2698 r = setup_propagate(directory);
2699 if (r < 0)
2700 return r;
2701
2702 r = setup_dev_console(directory, console);
2703 if (r < 0)
2704 return r;
2705
2706 r = setup_seccomp();
2707 if (r < 0)
2708 return r;
2709
2710 r = setup_timezone(directory);
2711 if (r < 0)
2712 return r;
2713
2714 r = setup_resolv_conf(directory);
2715 if (r < 0)
2716 return r;
2717
2718 r = setup_journal(directory);
2719 if (r < 0)
2720 return r;
2721
e83bebef 2722 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2723 if (r < 0)
2724 return r;
2725
e83bebef 2726 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2727 if (r < 0)
2728 return r;
2729
2730 r = mount_move_root(directory);
2731 if (r < 0)
2732 return log_error_errno(r, "Failed to move root directory: %m");
2733
2734 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2735 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2736 (arg_private_network ? CLONE_NEWNET : 0) |
2737 (arg_userns ? CLONE_NEWUSER : 0),
2738 NULL);
2739 if (pid < 0)
2740 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2741 if (pid == 0) {
2742 pid_socket = safe_close(pid_socket);
825d5287 2743 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2744
2745 /* The inner child has all namespaces that are
2746 * requested, so that we all are owned by the user if
2747 * user namespaces are turned on. */
2748
f757855e 2749 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2750 if (r < 0)
2751 _exit(EXIT_FAILURE);
2752
2753 _exit(EXIT_SUCCESS);
2754 }
2755
2756 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2757 if (l < 0)
2758 return log_error_errno(errno, "Failed to send PID: %m");
2759 if (l != sizeof(pid)) {
2760 log_error("Short write while sending PID.");
2761 return -EIO;
2762 }
2763
2764 pid_socket = safe_close(pid_socket);
327e26d6
KN
2765 kmsg_socket = safe_close(kmsg_socket);
2766 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2767
2768 return 0;
2769}
2770
2771static int setup_uid_map(pid_t pid) {
2772 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2773 int r;
2774
2775 assert(pid > 1);
2776
2777 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2778 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2779 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2780 if (r < 0)
2781 return log_error_errno(r, "Failed to write UID map: %m");
2782
2783 /* We always assign the same UID and GID ranges */
2784 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2785 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2786 if (r < 0)
2787 return log_error_errno(r, "Failed to write GID map: %m");
2788
2789 return 0;
2790}
2791
f757855e
LP
2792static int load_settings(void) {
2793 _cleanup_(settings_freep) Settings *settings = NULL;
2794 _cleanup_fclose_ FILE *f = NULL;
2795 _cleanup_free_ char *p = NULL;
2796 const char *fn, *i;
2797 int r;
2798
2799 /* If all settings are masked, there's no point in looking for
2800 * the settings file */
2801 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2802 return 0;
2803
2804 fn = strjoina(arg_machine, ".nspawn");
2805
2806 /* We first look in the admin's directories in /etc and /run */
2807 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2808 _cleanup_free_ char *j = NULL;
2809
2810 j = strjoin(i, "/", fn, NULL);
2811 if (!j)
2812 return log_oom();
2813
2814 f = fopen(j, "re");
2815 if (f) {
2816 p = j;
2817 j = NULL;
2818
2819 /* By default we trust configuration from /etc and /run */
2820 if (arg_settings_trusted < 0)
2821 arg_settings_trusted = true;
2822
2823 break;
2824 }
2825
2826 if (errno != ENOENT)
2827 return log_error_errno(errno, "Failed to open %s: %m", j);
2828 }
2829
2830 if (!f) {
2831 /* After that, let's look for a file next to the
2832 * actual image we shall boot. */
2833
2834 if (arg_image) {
2835 p = file_in_same_dir(arg_image, fn);
2836 if (!p)
2837 return log_oom();
2838 } else if (arg_directory) {
2839 p = file_in_same_dir(arg_directory, fn);
2840 if (!p)
2841 return log_oom();
2842 }
2843
2844 if (p) {
2845 f = fopen(p, "re");
2846 if (!f && errno != ENOENT)
2847 return log_error_errno(errno, "Failed to open %s: %m", p);
2848
2849 /* By default we do not trust configuration from /var/lib/machines */
2850 if (arg_settings_trusted < 0)
2851 arg_settings_trusted = false;
2852 }
2853 }
2854
2855 if (!f)
2856 return 0;
2857
2858 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2859
2860 r = settings_load(f, p, &settings);
2861 if (r < 0)
2862 return r;
2863
2864 /* Copy over bits from the settings, unless they have been
2865 * explicitly masked by command line switches. */
2866
2867 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2868 settings->boot >= 0) {
2869 arg_boot = settings->boot;
2870
2871 strv_free(arg_parameters);
2872 arg_parameters = settings->parameters;
2873 settings->parameters = NULL;
2874 }
2875
2876 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2877 settings->environment) {
2878 strv_free(arg_setenv);
2879 arg_setenv = settings->environment;
2880 settings->environment = NULL;
2881 }
2882
2883 if ((arg_settings_mask & SETTING_USER) == 0 &&
2884 settings->user) {
2885 free(arg_user);
2886 arg_user = settings->user;
2887 settings->user = NULL;
2888 }
2889
2890 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2891 uint64_t plus;
f757855e 2892
0e265674
LP
2893 plus = settings->capability;
2894 if (settings_private_network(settings))
2895 plus |= (1ULL << CAP_NET_ADMIN);
2896
2897 if (!arg_settings_trusted && plus != 0) {
2898 if (settings->capability != 0)
2899 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2900 } else
2901 arg_retain |= plus;
f757855e
LP
2902
2903 arg_retain &= ~settings->drop_capability;
2904 }
2905
2906 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2907 settings->kill_signal > 0)
2908 arg_kill_signal = settings->kill_signal;
2909
2910 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2911 settings->personality != PERSONALITY_INVALID)
2912 arg_personality = settings->personality;
2913
2914 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2915 !sd_id128_is_null(settings->machine_id)) {
2916
2917 if (!arg_settings_trusted)
2918 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2919 else
2920 arg_uuid = settings->machine_id;
2921 }
2922
2923 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2924 settings->read_only >= 0)
2925 arg_read_only = settings->read_only;
2926
2927 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2928 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2929 arg_volatile_mode = settings->volatile_mode;
2930
2931 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2932 settings->n_custom_mounts > 0) {
2933
2934 if (!arg_settings_trusted)
2935 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2936 else {
2937 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2938 arg_custom_mounts = settings->custom_mounts;
2939 arg_n_custom_mounts = settings->n_custom_mounts;
2940
2941 settings->custom_mounts = NULL;
2942 settings->n_custom_mounts = 0;
2943 }
2944 }
2945
2946 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2947 (settings->private_network >= 0 ||
2948 settings->network_veth >= 0 ||
2949 settings->network_bridge ||
2950 settings->network_interfaces ||
2951 settings->network_macvlan ||
2952 settings->network_ipvlan)) {
2953
2954 if (!arg_settings_trusted)
2955 log_warning("Ignoring network settings, file %s is not trusted.", p);
2956 else {
0e265674
LP
2957 arg_network_veth = settings_private_network(settings);
2958 arg_private_network = settings_private_network(settings);
2959
f757855e
LP
2960 strv_free(arg_network_interfaces);
2961 arg_network_interfaces = settings->network_interfaces;
2962 settings->network_interfaces = NULL;
2963
2964 strv_free(arg_network_macvlan);
2965 arg_network_macvlan = settings->network_macvlan;
2966 settings->network_macvlan = NULL;
2967
2968 strv_free(arg_network_ipvlan);
2969 arg_network_ipvlan = settings->network_ipvlan;
2970 settings->network_ipvlan = NULL;
2971
2972 free(arg_network_bridge);
2973 arg_network_bridge = settings->network_bridge;
2974 settings->network_bridge = NULL;
f757855e
LP
2975 }
2976 }
2977
2978 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2979 settings->expose_ports) {
2980
2981 if (!arg_settings_trusted)
2982 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2983 else {
2984 expose_port_free_all(arg_expose_ports);
2985 arg_expose_ports = settings->expose_ports;
2986 settings->expose_ports = NULL;
2987 }
2988 }
2989
2990 return 0;
2991}
2992
03cfe0d5
LP
2993int main(int argc, char *argv[]) {
2994
2995 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2996 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2997 _cleanup_close_ int master = -1, image_fd = -1;
2998 _cleanup_fdset_free_ FDSet *fds = NULL;
2999 int r, n_fd_passed, loop_nr = -1;
3000 char veth_name[IFNAMSIZ];
3001 bool secondary = false, remove_subvol = false;
72c0a2c2 3002 sigset_t mask_chld;
03cfe0d5
LP
3003 pid_t pid = 0;
3004 int ret = EXIT_SUCCESS;
3005 union in_addr_union exposed = {};
3006 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3007 bool interactive;
3008
3009 log_parse_environment();
3010 log_open();
3011
3012 r = parse_argv(argc, argv);
3013 if (r <= 0)
3014 goto finish;
3015
03cfe0d5
LP
3016 if (geteuid() != 0) {
3017 log_error("Need to be root.");
3018 r = -EPERM;
3019 goto finish;
3020 }
f757855e
LP
3021 r = determine_names();
3022 if (r < 0)
3023 goto finish;
3024
3025 r = load_settings();
3026 if (r < 0)
3027 goto finish;
3028
3029 r = verify_arguments();
3030 if (r < 0)
3031 goto finish;
03cfe0d5
LP
3032
3033 n_fd_passed = sd_listen_fds(false);
3034 if (n_fd_passed > 0) {
3035 r = fdset_new_listen_fds(&fds, false);
3036 if (r < 0) {
3037 log_error_errno(r, "Failed to collect file descriptors: %m");
3038 goto finish;
3039 }
3040 }
3041
3042 if (arg_directory) {
3043 assert(!arg_image);
3044
3045 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3046 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3047 r = -EINVAL;
3048 goto finish;
3049 }
3050
3051 if (arg_ephemeral) {
3052 _cleanup_free_ char *np = NULL;
3053
3054 /* If the specified path is a mount point we
3055 * generate the new snapshot immediately
3056 * inside it under a random name. However if
3057 * the specified is not a mount point we
3058 * create the new snapshot in the parent
3059 * directory, just next to it. */
e26d6ce5 3060 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3061 if (r < 0) {
3062 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3063 goto finish;
3064 }
3065 if (r > 0)
770b5ce4 3066 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3067 else
770b5ce4 3068 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3069 if (r < 0) {
3070 log_error_errno(r, "Failed to generate name for snapshot: %m");
3071 goto finish;
3072 }
3073
3074 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3075 if (r < 0) {
3076 log_error_errno(r, "Failed to lock %s: %m", np);
3077 goto finish;
3078 }
3079
5bcd08db 3080 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3081 if (r < 0) {
3082 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3083 goto finish;
ec16945e
LP
3084 }
3085
3086 free(arg_directory);
3087 arg_directory = np;
8a16a7b4 3088 np = NULL;
ec16945e
LP
3089
3090 remove_subvol = true;
30535c16
LP
3091
3092 } else {
3093 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3094 if (r == -EBUSY) {
3095 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3096 goto finish;
3097 }
3098 if (r < 0) {
3099 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3100 return r;
3101 }
3102
3103 if (arg_template) {
5bcd08db 3104 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3105 if (r == -EEXIST) {
3106 if (!arg_quiet)
3107 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3108 } else if (r < 0) {
83521414 3109 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3110 goto finish;
3111 } else {
3112 if (!arg_quiet)
3113 log_info("Populated %s from template %s.", arg_directory, arg_template);
3114 }
3115 }
ec16945e
LP
3116 }
3117
1b9e5b12
LP
3118 if (arg_boot) {
3119 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3120 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3121 r = -EINVAL;
1b9e5b12
LP
3122 goto finish;
3123 }
3124 } else {
3125 const char *p;
3126
16fb773e
LP
3127 p = strjoina(arg_directory, "/usr/");
3128 if (laccess(p, F_OK) < 0) {
3129 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3130 r = -EINVAL;
1b9e5b12 3131 goto finish;
1b9e5b12
LP
3132 }
3133 }
ec16945e 3134
6b9132a9 3135 } else {
1b9e5b12 3136 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3137
ec16945e
LP
3138 assert(arg_image);
3139 assert(!arg_template);
3140
30535c16
LP
3141 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3142 if (r == -EBUSY) {
3143 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3144 goto finish;
3145 }
3146 if (r < 0) {
3147 r = log_error_errno(r, "Failed to create image lock: %m");
3148 goto finish;
3149 }
3150
1b9e5b12 3151 if (!mkdtemp(template)) {
56f64d95 3152 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3153 r = -errno;
6b9132a9 3154 goto finish;
1b9e5b12 3155 }
6b9132a9 3156
1b9e5b12
LP
3157 arg_directory = strdup(template);
3158 if (!arg_directory) {
3159 r = log_oom();
3160 goto finish;
6b9132a9 3161 }
88213476 3162
1b9e5b12
LP
3163 image_fd = setup_image(&device_path, &loop_nr);
3164 if (image_fd < 0) {
3165 r = image_fd;
842f3b0f
LP
3166 goto finish;
3167 }
1b9e5b12 3168
4d9f07b4
LP
3169 r = dissect_image(image_fd,
3170 &root_device, &root_device_rw,
3171 &home_device, &home_device_rw,
3172 &srv_device, &srv_device_rw,
3173 &secondary);
1b9e5b12
LP
3174 if (r < 0)
3175 goto finish;
842f3b0f 3176 }
842f3b0f 3177
5a8af538
LP
3178 r = custom_mounts_prepare();
3179 if (r < 0)
3180 goto finish;
3181
03cfe0d5
LP
3182 interactive =
3183 isatty(STDIN_FILENO) > 0 &&
3184 isatty(STDOUT_FILENO) > 0;
9c857b9d 3185
db7feb7e
LP
3186 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3187 if (master < 0) {
ec16945e 3188 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3189 goto finish;
3190 }
3191
611b312b
LP
3192 r = ptsname_malloc(master, &console);
3193 if (r < 0) {
3194 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3195 goto finish;
3196 }
3197
a258bf26 3198 if (unlockpt(master) < 0) {
ec16945e 3199 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3200 goto finish;
3201 }
3202
9c857b9d
LP
3203 if (!arg_quiet)
3204 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3205 arg_machine, arg_image ?: arg_directory);
3206
72c0a2c2 3207 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3208
023fb90b
LP
3209 assert_se(sigemptyset(&mask_chld) == 0);
3210 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3211
03cfe0d5
LP
3212 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3213 r = log_error_errno(errno, "Failed to become subreaper: %m");
3214 goto finish;
3215 }
3216
d87be9b0 3217 for (;;) {
825d5287
RM
3218 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3219 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3220 ContainerStatus container_status;
7566e267 3221 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3222 static const struct sigaction sa = {
189d5bac 3223 .sa_handler = nop_signal_handler,
e866af3a
DH
3224 .sa_flags = SA_NOCLDSTOP,
3225 };
03cfe0d5
LP
3226 int ifi = 0;
3227 ssize_t l;
dbb60d69
LP
3228 _cleanup_event_unref_ sd_event *event = NULL;
3229 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3230 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3231 char last_char = 0;
e866af3a 3232
7566e267 3233 r = barrier_create(&barrier);
a2da110b 3234 if (r < 0) {
da927ba9 3235 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3236 goto finish;
3237 }
3238
4610de50 3239 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3240 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3241 goto finish;
3242 }
3243
4610de50 3244 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3245 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3246 goto finish;
3247 }
3248
4610de50 3249 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3250 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3251 goto finish;
3252 }
3253
825d5287 3254 if (arg_userns)
4610de50 3255 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3256 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3257 goto finish;
3258 }
3259
e866af3a
DH
3260 /* Child can be killed before execv(), so handle SIGCHLD
3261 * in order to interrupt parent's blocking calls and
3262 * give it a chance to call wait() and terminate. */
3263 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3264 if (r < 0) {
ec16945e 3265 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3266 goto finish;
3267 }
3268
e866af3a
DH
3269 r = sigaction(SIGCHLD, &sa, NULL);
3270 if (r < 0) {
ec16945e 3271 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3272 goto finish;
3273 }
3274
03cfe0d5 3275 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3276 if (pid < 0) {
3277 if (errno == EINVAL)
ec16945e 3278 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3279 else
ec16945e 3280 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3281
d87be9b0
LP
3282 goto finish;
3283 }
a258bf26 3284
d87be9b0 3285 if (pid == 0) {
03cfe0d5 3286 /* The outer child only has a file system namespace. */
a2da110b
DH
3287 barrier_set_role(&barrier, BARRIER_CHILD);
3288
03e334a1 3289 master = safe_close(master);
a258bf26 3290
03e334a1 3291 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3292 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3293 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3294 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3295
ce30c8dc
LP
3296 (void) reset_all_signal_handlers();
3297 (void) reset_signal_mask();
f5c1b9ee 3298
03cfe0d5
LP
3299 r = outer_child(&barrier,
3300 arg_directory,
3301 console,
3302 root_device, root_device_rw,
3303 home_device, home_device_rw,
3304 srv_device, srv_device_rw,
3305 interactive,
3306 secondary,
3307 pid_socket_pair[1],
3308 kmsg_socket_pair[1],
3309 rtnl_socket_pair[1],
825d5287 3310 uid_shift_socket_pair[1],
f757855e 3311 fds);
0cb9fbcd 3312 if (r < 0)
a2da110b 3313 _exit(EXIT_FAILURE);
d87be9b0 3314
03cfe0d5 3315 _exit(EXIT_SUCCESS);
da5b3bad 3316 }
88213476 3317
a2da110b 3318 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3319
2feceb5e 3320 fds = fdset_free(fds);
842f3b0f 3321
6d0b55c2
LP
3322 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3323 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3324 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3325 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3326
03cfe0d5
LP
3327 /* Wait for the outer child. */
3328 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3329 if (r < 0)
3330 goto finish;
3331 if (r != 0) {
3332 r = -EIO;
3333 goto finish;
3334 }
3335 pid = 0;
6dac160c 3336
03cfe0d5
LP
3337 /* And now retrieve the PID of the inner child. */
3338 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3339 if (l < 0) {
3340 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3341 goto finish;
3342 }
3343 if (l != sizeof(pid)) {
76d44882 3344 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3345 r = EIO;
3346 goto finish;
3347 }
354bfd2b 3348
03cfe0d5 3349 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3350
03cfe0d5
LP
3351 if (arg_userns) {
3352 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3353 log_error("Child died too early.");
3354 r = -ESRCH;
840295fc 3355 goto finish;
03cfe0d5 3356 }
ab046dde 3357
825d5287
RM
3358 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3359 if (l < 0) {
3360 r = log_error_errno(errno, "Failed to read UID shift: %m");
3361 goto finish;
3362 }
3363 if (l != sizeof(arg_uid_shift)) {
76d44882 3364 log_error("Short read while reading UID shift.");
825d5287
RM
3365 r = EIO;
3366 goto finish;
3367 }
3368
03cfe0d5 3369 r = setup_uid_map(pid);
840295fc
LP
3370 if (r < 0)
3371 goto finish;
ab046dde 3372
03cfe0d5
LP
3373 (void) barrier_place(&barrier); /* #2 */
3374 }
c74e630d 3375
9a2a5625 3376 if (arg_private_network) {
4bbfe7ad 3377
9a2a5625
LP
3378 r = move_network_interfaces(pid, arg_network_interfaces);
3379 if (r < 0)
3380 goto finish;
5aa4bb6b 3381
9a2a5625
LP
3382 if (arg_network_veth) {
3383 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3384 if (r < 0)
3385 goto finish;
3386 else if (r > 0)
3387 ifi = r;
6dac160c 3388
9a2a5625
LP
3389 if (arg_network_bridge) {
3390 r = setup_bridge(veth_name, arg_network_bridge);
3391 if (r < 0)
3392 goto finish;
3393 if (r > 0)
3394 ifi = r;
3395 }
3396 }
6dac160c 3397
9a2a5625
LP
3398 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3399 if (r < 0)
3400 goto finish;
3401
3402 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3403 if (r < 0)
3404 goto finish;
3405 }
6dac160c 3406
b7103bc5
LP
3407 if (arg_register) {
3408 r = register_machine(
3409 arg_machine,
3410 pid,
3411 arg_directory,
3412 arg_uuid,
3413 ifi,
3414 arg_slice,
3415 arg_custom_mounts, arg_n_custom_mounts,
3416 arg_kill_signal,
3417 arg_property,
3418 arg_keep_unit);
3419 if (r < 0)
3420 goto finish;
3421 }
6dac160c 3422
34829a32 3423 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3424 if (r < 0)
3425 goto finish;
3426
34829a32
LP
3427 if (arg_keep_unit) {
3428 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3429 if (r < 0)
3430 goto finish;
3431 }
efdb0237 3432
34829a32 3433 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3434 if (r < 0)
3435 goto finish;
6dac160c 3436
03cfe0d5
LP
3437 /* Notify the child that the parent is ready with all
3438 * its setup (including cgroup-ification), and that
3439 * the child can now hand over control to the code to
3440 * run inside the container. */
3441 (void) barrier_place(&barrier); /* #3 */
6dac160c 3442
03cfe0d5
LP
3443 /* Block SIGCHLD here, before notifying child.
3444 * process_pty() will handle it with the other signals. */
3445 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3446
03cfe0d5
LP
3447 /* Reset signal to default */
3448 r = default_signals(SIGCHLD, -1);
3449 if (r < 0) {
3450 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3451 goto finish;
3452 }
e866af3a 3453
03cfe0d5 3454 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3455 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3456 log_error("Child died too early.");
03cfe0d5
LP
3457 r = -ESRCH;
3458 goto finish;
3459 }
b12afc8c 3460
03cfe0d5
LP
3461 sd_notifyf(false,
3462 "READY=1\n"
3463 "STATUS=Container running.\n"
3464 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3465
03cfe0d5
LP
3466 r = sd_event_new(&event);
3467 if (r < 0) {
3468 log_error_errno(r, "Failed to get default event source: %m");
3469 goto finish;
3470 }
88213476 3471
03cfe0d5
LP
3472 if (arg_kill_signal > 0) {
3473 /* Try to kill the init system on SIGINT or SIGTERM */
3474 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3475 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3476 } else {
3477 /* Immediately exit */
3478 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3479 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3480 }
023fb90b 3481
03cfe0d5
LP
3482 /* simply exit on sigchld */
3483 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3484
03cfe0d5 3485 if (arg_expose_ports) {
7a8f6325 3486 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3487 if (r < 0)
3488 goto finish;
023fb90b 3489
7a8f6325 3490 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3491 }
023fb90b 3492
03cfe0d5 3493 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3494
ae3dde80 3495 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3496 if (r < 0) {
3497 log_error_errno(r, "Failed to create PTY forwarder: %m");
3498 goto finish;
3499 }
023fb90b 3500
03cfe0d5
LP
3501 r = sd_event_loop(event);
3502 if (r < 0) {
3503 log_error_errno(r, "Failed to run event loop: %m");
3504 goto finish;
3505 }
6d0b55c2 3506
03cfe0d5 3507 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3508
03cfe0d5 3509 forward = pty_forward_free(forward);
6d0b55c2 3510
03cfe0d5
LP
3511 if (!arg_quiet && last_char != '\n')
3512 putc('\n', stdout);
04d39279 3513
03cfe0d5 3514 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3515 if (arg_register && !arg_keep_unit)
3516 terminate_machine(pid);
1f0cd86b 3517
840295fc 3518 /* Normally redundant, but better safe than sorry */
04d39279 3519 kill(pid, SIGKILL);
a258bf26 3520
113cea80 3521 r = wait_for_container(pid, &container_status);
04d39279
LP
3522 pid = 0;
3523
ec16945e 3524 if (r < 0)
ce9f1527
LP
3525 /* We failed to wait for the container, or the
3526 * container exited abnormally */
ec16945e
LP
3527 goto finish;
3528 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3529 /* The container exited with a non-zero
3530 * status, or with zero status and no reboot
3531 * was requested. */
ec16945e 3532 ret = r;
d87be9b0 3533 break;
ec16945e 3534 }
88213476 3535
113cea80 3536 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3537
3538 if (arg_keep_unit) {
3539 /* Special handling if we are running as a
3540 * service: instead of simply restarting the
3541 * machine we want to restart the entire
3542 * service, so let's inform systemd about this
3543 * with the special exit code 133. The service
3544 * file uses RestartForceExitStatus=133 so
3545 * that this results in a full nspawn
3546 * restart. This is necessary since we might
3547 * have cgroup parameters set we want to have
3548 * flushed out. */
ec16945e
LP
3549 ret = 133;
3550 r = 0;
ce38dbc8
LP
3551 break;
3552 }
6d0b55c2 3553
7a8f6325 3554 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3555 }
88213476
LP
3556
3557finish:
af4ec430
LP
3558 sd_notify(false,
3559 "STOPPING=1\n"
3560 "STATUS=Terminating...");
3561
9444b1f2
LP
3562 if (pid > 0)
3563 kill(pid, SIGKILL);
88213476 3564
503546da
LP
3565 /* Try to flush whatever is still queued in the pty */
3566 if (master >= 0)
59f448cf 3567 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3568
03cfe0d5
LP
3569 loop_remove(loop_nr, &image_fd);
3570
ec16945e
LP
3571 if (remove_subvol && arg_directory) {
3572 int k;
3573
5bcd08db 3574 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3575 if (k < 0)
3576 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3577 }
3578
785890ac
LP
3579 if (arg_machine) {
3580 const char *p;
3581
63c372cb 3582 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3583 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3584 }
3585
7a8f6325 3586 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3587
04d391da 3588 free(arg_directory);
ec16945e
LP
3589 free(arg_template);
3590 free(arg_image);
7027ff61 3591 free(arg_machine);
c74e630d
LP
3592 free(arg_user);
3593 strv_free(arg_setenv);
f757855e 3594 free(arg_network_bridge);
c74e630d
LP
3595 strv_free(arg_network_interfaces);
3596 strv_free(arg_network_macvlan);
4bbfe7ad 3597 strv_free(arg_network_ipvlan);
f757855e
LP
3598 strv_free(arg_parameters);
3599 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3600 expose_port_free_all(arg_expose_ports);
6d0b55c2 3601
ec16945e 3602 return r < 0 ? EXIT_FAILURE : ret;
88213476 3603}