]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: rework how we determine private networking settings
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e
LP
48
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
8fe0087e
LP
53#include "cap-list.h"
54#include "capability.h"
04d391da 55#include "cgroup-util.h"
8fe0087e 56#include "copy.h"
4fc9982c 57#include "dev-setup.h"
8fe0087e
LP
58#include "env-util.h"
59#include "event-util.h"
842f3b0f 60#include "fdset.h"
a5c32cff 61#include "fileio.h"
8fe0087e 62#include "formats-util.h"
1b9e5b12 63#include "gpt.h"
8fe0087e
LP
64#include "hostname-util.h"
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
71#include "netlink-util.h"
72#include "path-util.h"
0b452006 73#include "process-util.h"
8fe0087e
LP
74#include "ptyfwd.h"
75#include "random-util.h"
76#include "rm-rf.h"
e9642be2
LP
77#ifdef HAVE_SECCOMP
78#include "seccomp-util.h"
79#endif
8fe0087e
LP
80#include "signal-util.h"
81#include "strv.h"
82#include "terminal-util.h"
83#include "udev-util.h"
84#include "util.h"
e9642be2 85
db3b1ded
LP
86#include "nspawn-cgroup.h"
87#include "nspawn-expose-ports.h"
e83bebef 88#include "nspawn-mount.h"
9a2a5625 89#include "nspawn-network.h"
b7103bc5 90#include "nspawn-register.h"
db3b1ded 91#include "nspawn-settings.h"
ee645080 92#include "nspawn-setuid.h"
6d0b55c2 93
113cea80
DH
/* Outcome of a container run; the two values distinguish a terminated
 * container from a rebooted one. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
98
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). The
 * "try-" variants of the option map to LINK_GUEST/LINK_HOST plus the
 * separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto (the default) */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3   /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
88213476
LP
105
106static char *arg_directory = NULL;
ec16945e 107static char *arg_template = NULL;
687d0825 108static char *arg_user = NULL;
9444b1f2 109static sd_id128_t arg_uuid = {};
7027ff61 110static char *arg_machine = NULL;
c74e630d
LP
111static const char *arg_selinux_context = NULL;
112static const char *arg_selinux_apifs_context = NULL;
9444b1f2 113static const char *arg_slice = NULL;
ff01d048 114static bool arg_private_network = false;
bc2f673e 115static bool arg_read_only = false;
0f0dbc46 116static bool arg_boot = false;
ec16945e 117static bool arg_ephemeral = false;
57fb9fb5 118static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 119static bool arg_link_journal_try = false;
5076f0cc
LP
120static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 142 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
5a8af538
LP
147static CustomMount *arg_custom_mounts = NULL;
148static unsigned arg_n_custom_mounts = 0;
f4889f65 149static char **arg_setenv = NULL;
284c0b91 150static bool arg_quiet = false;
8a96d94e 151static bool arg_share_system = false;
eb91eb18 152static bool arg_register = true;
89f7c846 153static bool arg_keep_unit = false;
aa28aefe 154static char **arg_network_interfaces = NULL;
c74e630d 155static char **arg_network_macvlan = NULL;
4bbfe7ad 156static char **arg_network_ipvlan = NULL;
69c79d3c 157static bool arg_network_veth = false;
f757855e 158static char *arg_network_bridge = NULL;
050f7277 159static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 160static char *arg_image = NULL;
f757855e 161static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 162static ExposePort *arg_expose_ports = NULL;
f36933fe 163static char **arg_property = NULL;
6dac160c
LP
164static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
165static bool arg_userns = false;
c6c8f6e2 166static int arg_kill_signal = 0;
efdb0237 167static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
168static SettingsMask arg_settings_mask = 0;
169static int arg_settings_trusted = -1;
170static char **arg_parameters = NULL;
88213476 171
601185b4 172static void help(void) {
88213476
LP
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
175 " -h --help Show this help\n"
176 " --version Print version string\n"
69c79d3c 177 " -q --quiet Do not show status information\n"
1b9e5b12 178 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 186 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 187 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 188 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 189 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
69c79d3c
LP
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
c74e630d
LP
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
4bbfe7ad
TG
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
0dfaa006 202 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 203 " and container\n"
ab046dde 204 " --network-bridge=INTERFACE\n"
32457153 205 " Add a virtual ethernet connection between host\n"
ab046dde
TG
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
6d0b55c2 208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 209 " Expose a container IP port on the host\n"
82adf6af
LP
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
a8828ed9
DW
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 223 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
a8828ed9 226 " the container\n"
5e5bfa6e
EY
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
06c17c39 229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
284c0b91 235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 236 " --share-system Share system namespaces with host\n"
eb91eb18 237 " --register=BOOLEAN Register container as machine\n"
89f7c846 238 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 239 " the service unit nspawn is running in\n"
6d0b55c2 240 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 242 , program_invocation_short_name);
88213476
LP
243}
244
5a8af538
LP
245
246static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
825d5287
RM
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
5a8af538
LP
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
14bcf25c 271 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277}
278
ec16945e
LP
/* Replace *b with a cleaned-up absolute version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT)
 * it is merely made absolute relative to the current working directory.
 * The previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for a reason
 * other than ENOENT, or -ENOMEM if the absolute path cannot be built. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), so settle for an absolute one. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
299
efdb0237
LP
300static int detect_unified_cgroup_hierarchy(void) {
301 const char *e;
302 int r;
303
304 /* Allow the user to control whether the unified hierarchy is used */
305 e = getenv("UNIFIED_CGROUP_HIERARCHY");
306 if (e) {
307 r = parse_boolean(e);
308 if (r < 0)
309 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
310
311 arg_unified_cgroup_hierarchy = r;
312 return 0;
313 }
314
315 /* Otherwise inherit the default from the host system */
316 r = cg_unified();
317 if (r < 0)
318 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
319
320 arg_unified_cgroup_hierarchy = r;
321 return 0;
322}
323
88213476
LP
324static int parse_argv(int argc, char *argv[]) {
325
a41fe3a2 326 enum {
acbeb427
ZJS
327 ARG_VERSION = 0x100,
328 ARG_PRIVATE_NETWORK,
bc2f673e 329 ARG_UUID,
5076f0cc 330 ARG_READ_ONLY,
57fb9fb5 331 ARG_CAPABILITY,
420c7379 332 ARG_DROP_CAPABILITY,
17fe0523
LP
333 ARG_LINK_JOURNAL,
334 ARG_BIND,
f4889f65 335 ARG_BIND_RO,
06c17c39 336 ARG_TMPFS,
5a8af538
LP
337 ARG_OVERLAY,
338 ARG_OVERLAY_RO,
f4889f65 339 ARG_SETENV,
eb91eb18 340 ARG_SHARE_SYSTEM,
89f7c846 341 ARG_REGISTER,
aa28aefe 342 ARG_KEEP_UNIT,
69c79d3c 343 ARG_NETWORK_INTERFACE,
c74e630d 344 ARG_NETWORK_MACVLAN,
4bbfe7ad 345 ARG_NETWORK_IPVLAN,
ab046dde 346 ARG_NETWORK_BRIDGE,
6afc95b7 347 ARG_PERSONALITY,
4d9f07b4 348 ARG_VOLATILE,
ec16945e 349 ARG_TEMPLATE,
f36933fe 350 ARG_PROPERTY,
6dac160c 351 ARG_PRIVATE_USERS,
c6c8f6e2 352 ARG_KILL_SIGNAL,
f757855e 353 ARG_SETTINGS,
a41fe3a2
LP
354 };
355
88213476 356 static const struct option options[] = {
aa28aefe
LP
357 { "help", no_argument, NULL, 'h' },
358 { "version", no_argument, NULL, ARG_VERSION },
359 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
360 { "template", required_argument, NULL, ARG_TEMPLATE },
361 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
362 { "user", required_argument, NULL, 'u' },
363 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
364 { "boot", no_argument, NULL, 'b' },
365 { "uuid", required_argument, NULL, ARG_UUID },
366 { "read-only", no_argument, NULL, ARG_READ_ONLY },
367 { "capability", required_argument, NULL, ARG_CAPABILITY },
368 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
369 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
370 { "bind", required_argument, NULL, ARG_BIND },
371 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 372 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
373 { "overlay", required_argument, NULL, ARG_OVERLAY },
374 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
375 { "machine", required_argument, NULL, 'M' },
376 { "slice", required_argument, NULL, 'S' },
377 { "setenv", required_argument, NULL, ARG_SETENV },
378 { "selinux-context", required_argument, NULL, 'Z' },
379 { "selinux-apifs-context", required_argument, NULL, 'L' },
380 { "quiet", no_argument, NULL, 'q' },
381 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
382 { "register", required_argument, NULL, ARG_REGISTER },
383 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
384 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 385 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 386 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 387 { "network-veth", no_argument, NULL, 'n' },
ab046dde 388 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 389 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 390 { "image", required_argument, NULL, 'i' },
4d9f07b4 391 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 392 { "port", required_argument, NULL, 'p' },
f36933fe 393 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 394 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 395 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 396 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 397 {}
88213476
LP
398 };
399
9444b1f2 400 int c, r;
a42c8b54 401 uint64_t plus = 0, minus = 0;
f757855e 402 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
403
404 assert(argc >= 0);
405 assert(argv);
406
0dfaa006 407 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
408
409 switch (c) {
410
411 case 'h':
601185b4
ZJS
412 help();
413 return 0;
88213476 414
acbeb427 415 case ARG_VERSION:
3f6fd1ba 416 return version();
acbeb427 417
88213476 418 case 'D':
ec16945e
LP
419 r = set_sanitized_path(&arg_directory, optarg);
420 if (r < 0)
421 return log_error_errno(r, "Invalid root directory: %m");
422
423 break;
424
425 case ARG_TEMPLATE:
426 r = set_sanitized_path(&arg_template, optarg);
427 if (r < 0)
428 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
429
430 break;
431
1b9e5b12 432 case 'i':
ec16945e
LP
433 r = set_sanitized_path(&arg_image, optarg);
434 if (r < 0)
435 return log_error_errno(r, "Invalid image path: %m");
436
437 break;
438
439 case 'x':
440 arg_ephemeral = true;
1b9e5b12
LP
441 break;
442
687d0825 443 case 'u':
2fc09a9c
DM
444 r = free_and_strdup(&arg_user, optarg);
445 if (r < 0)
7027ff61 446 return log_oom();
687d0825 447
f757855e 448 arg_settings_mask |= SETTING_USER;
687d0825
MV
449 break;
450
ab046dde 451 case ARG_NETWORK_BRIDGE:
f757855e
LP
452 r = free_and_strdup(&arg_network_bridge, optarg);
453 if (r < 0)
454 return log_oom();
ab046dde
TG
455
456 /* fall through */
457
0dfaa006 458 case 'n':
69c79d3c
LP
459 arg_network_veth = true;
460 arg_private_network = true;
f757855e 461 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
462 break;
463
aa28aefe 464 case ARG_NETWORK_INTERFACE:
c74e630d
LP
465 if (strv_extend(&arg_network_interfaces, optarg) < 0)
466 return log_oom();
467
468 arg_private_network = true;
f757855e 469 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
470 break;
471
472 case ARG_NETWORK_MACVLAN:
473 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
474 return log_oom();
475
4bbfe7ad 476 arg_private_network = true;
f757855e 477 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
478 break;
479
480 case ARG_NETWORK_IPVLAN:
481 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
482 return log_oom();
483
aa28aefe
LP
484 /* fall through */
485
ff01d048
LP
486 case ARG_PRIVATE_NETWORK:
487 arg_private_network = true;
f757855e 488 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
489 break;
490
0f0dbc46
LP
491 case 'b':
492 arg_boot = true;
f757855e 493 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
494 break;
495
144f0fc0 496 case ARG_UUID:
9444b1f2
LP
497 r = sd_id128_from_string(optarg, &arg_uuid);
498 if (r < 0) {
aa96c6cb 499 log_error("Invalid UUID: %s", optarg);
9444b1f2 500 return r;
aa96c6cb 501 }
f757855e
LP
502
503 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 504 break;
aa96c6cb 505
9444b1f2 506 case 'S':
c74e630d 507 arg_slice = optarg;
144f0fc0
LP
508 break;
509
7027ff61 510 case 'M':
c1521918 511 if (isempty(optarg))
97b11eed 512 arg_machine = mfree(arg_machine);
c1521918 513 else {
0c3c4284 514 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
515 log_error("Invalid machine name: %s", optarg);
516 return -EINVAL;
517 }
7027ff61 518
0c3c4284
LP
519 r = free_and_strdup(&arg_machine, optarg);
520 if (r < 0)
eb91eb18
LP
521 return log_oom();
522
523 break;
524 }
7027ff61 525
82adf6af
LP
526 case 'Z':
527 arg_selinux_context = optarg;
a8828ed9
DW
528 break;
529
82adf6af
LP
530 case 'L':
531 arg_selinux_apifs_context = optarg;
a8828ed9
DW
532 break;
533
bc2f673e
LP
534 case ARG_READ_ONLY:
535 arg_read_only = true;
f757855e 536 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
537 break;
538
420c7379
LP
539 case ARG_CAPABILITY:
540 case ARG_DROP_CAPABILITY: {
a2a5291b 541 const char *state, *word;
5076f0cc
LP
542 size_t length;
543
544 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 545 _cleanup_free_ char *t;
5076f0cc
LP
546
547 t = strndup(word, length);
0d0f0c50
SL
548 if (!t)
549 return log_oom();
5076f0cc 550
39ed67d1
LP
551 if (streq(t, "all")) {
552 if (c == ARG_CAPABILITY)
a42c8b54 553 plus = (uint64_t) -1;
39ed67d1 554 else
a42c8b54 555 minus = (uint64_t) -1;
39ed67d1 556 } else {
2822da4f
LP
557 int cap;
558
559 cap = capability_from_name(t);
560 if (cap < 0) {
39ed67d1
LP
561 log_error("Failed to parse capability %s.", t);
562 return -EINVAL;
563 }
564
565 if (c == ARG_CAPABILITY)
a42c8b54 566 plus |= 1ULL << (uint64_t) cap;
39ed67d1 567 else
a42c8b54 568 minus |= 1ULL << (uint64_t) cap;
5076f0cc 569 }
5076f0cc
LP
570 }
571
f757855e 572 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
573 break;
574 }
575
57fb9fb5
LP
576 case 'j':
577 arg_link_journal = LINK_GUEST;
574edc90 578 arg_link_journal_try = true;
57fb9fb5
LP
579 break;
580
581 case ARG_LINK_JOURNAL:
53e438e3 582 if (streq(optarg, "auto")) {
57fb9fb5 583 arg_link_journal = LINK_AUTO;
53e438e3
LP
584 arg_link_journal_try = false;
585 } else if (streq(optarg, "no")) {
57fb9fb5 586 arg_link_journal = LINK_NO;
53e438e3
LP
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "guest")) {
57fb9fb5 589 arg_link_journal = LINK_GUEST;
53e438e3
LP
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "host")) {
57fb9fb5 592 arg_link_journal = LINK_HOST;
53e438e3
LP
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "try-guest")) {
574edc90
MP
595 arg_link_journal = LINK_GUEST;
596 arg_link_journal_try = true;
597 } else if (streq(optarg, "try-host")) {
598 arg_link_journal = LINK_HOST;
599 arg_link_journal_try = true;
600 } else {
57fb9fb5
LP
601 log_error("Failed to parse link journal mode %s", optarg);
602 return -EINVAL;
603 }
604
605 break;
606
17fe0523 607 case ARG_BIND:
f757855e
LP
608 case ARG_BIND_RO:
609 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 612
f757855e 613 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 614 break;
06c17c39 615
f757855e
LP
616 case ARG_TMPFS:
617 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
618 if (r < 0)
619 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 620
f757855e 621 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 622 break;
5a8af538
LP
623
624 case ARG_OVERLAY:
625 case ARG_OVERLAY_RO: {
626 _cleanup_free_ char *upper = NULL, *destination = NULL;
627 _cleanup_strv_free_ char **lower = NULL;
628 CustomMount *m;
629 unsigned n = 0;
630 char **i;
631
62f9f39a
RM
632 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
633 if (r == -ENOMEM)
06c17c39 634 return log_oom();
62f9f39a
RM
635 else if (r < 0) {
636 log_error("Invalid overlay specification: %s", optarg);
637 return r;
638 }
06c17c39 639
5a8af538
LP
640 STRV_FOREACH(i, lower) {
641 if (!path_is_absolute(*i)) {
642 log_error("Overlay path %s is not absolute.", *i);
643 return -EINVAL;
644 }
645
646 n++;
647 }
648
649 if (n < 2) {
650 log_error("--overlay= needs at least two colon-separated directories specified.");
651 return -EINVAL;
652 }
653
654 if (n == 2) {
655 /* If two parameters are specified,
656 * the first one is the lower, the
657 * second one the upper directory. And
af86c440
ZJS
658 * we'll also define the destination
659 * mount point the same as the upper. */
5a8af538
LP
660 upper = lower[1];
661 lower[1] = NULL;
662
663 destination = strdup(upper);
664 if (!destination)
665 return log_oom();
666
667 } else {
668 upper = lower[n - 2];
669 destination = lower[n - 1];
670 lower[n - 2] = NULL;
671 }
672
f757855e 673 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
674 if (!m)
675 return log_oom();
676
677 m->destination = destination;
678 m->source = upper;
679 m->lower = lower;
680 m->read_only = c == ARG_OVERLAY_RO;
681
682 upper = destination = NULL;
683 lower = NULL;
06c17c39 684
f757855e 685 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
686 break;
687 }
688
f4889f65
LP
689 case ARG_SETENV: {
690 char **n;
691
692 if (!env_assignment_is_valid(optarg)) {
693 log_error("Environment variable assignment '%s' is not valid.", optarg);
694 return -EINVAL;
695 }
696
697 n = strv_env_set(arg_setenv, optarg);
698 if (!n)
699 return log_oom();
700
701 strv_free(arg_setenv);
702 arg_setenv = n;
f757855e
LP
703
704 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
705 break;
706 }
707
284c0b91
LP
708 case 'q':
709 arg_quiet = true;
710 break;
711
8a96d94e
LP
712 case ARG_SHARE_SYSTEM:
713 arg_share_system = true;
714 break;
715
eb91eb18
LP
716 case ARG_REGISTER:
717 r = parse_boolean(optarg);
718 if (r < 0) {
719 log_error("Failed to parse --register= argument: %s", optarg);
720 return r;
721 }
722
723 arg_register = r;
724 break;
725
89f7c846
LP
726 case ARG_KEEP_UNIT:
727 arg_keep_unit = true;
728 break;
729
6afc95b7
LP
730 case ARG_PERSONALITY:
731
ac45f971 732 arg_personality = personality_from_string(optarg);
050f7277 733 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
734 log_error("Unknown or unsupported personality '%s'.", optarg);
735 return -EINVAL;
736 }
737
f757855e 738 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
739 break;
740
4d9f07b4
LP
741 case ARG_VOLATILE:
742
743 if (!optarg)
f757855e 744 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 745 else {
f757855e 746 VolatileMode m;
4d9f07b4 747
f757855e
LP
748 m = volatile_mode_from_string(optarg);
749 if (m < 0) {
750 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 751 return -EINVAL;
f757855e
LP
752 } else
753 arg_volatile_mode = m;
6d0b55c2
LP
754 }
755
f757855e
LP
756 arg_settings_mask |= SETTING_VOLATILE_MODE;
757 break;
6d0b55c2 758
f757855e
LP
759 case 'p':
760 r = expose_port_parse(&arg_expose_ports, optarg);
761 if (r == -EEXIST)
762 return log_error_errno(r, "Duplicate port specification: %s", optarg);
763 if (r < 0)
764 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 765
f757855e 766 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 767 break;
6d0b55c2 768
f36933fe
LP
769 case ARG_PROPERTY:
770 if (strv_extend(&arg_property, optarg) < 0)
771 return log_oom();
772
773 break;
774
6dac160c
LP
775 case ARG_PRIVATE_USERS:
776 if (optarg) {
777 _cleanup_free_ char *buffer = NULL;
778 const char *range, *shift;
779
780 range = strchr(optarg, ':');
781 if (range) {
782 buffer = strndup(optarg, range - optarg);
783 if (!buffer)
784 return log_oom();
785 shift = buffer;
786
787 range++;
788 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
789 log_error("Failed to parse UID range: %s", range);
790 return -EINVAL;
791 }
792 } else
793 shift = optarg;
794
795 if (parse_uid(shift, &arg_uid_shift) < 0) {
796 log_error("Failed to parse UID: %s", optarg);
797 return -EINVAL;
798 }
799 }
800
801 arg_userns = true;
802 break;
803
c6c8f6e2
LP
804 case ARG_KILL_SIGNAL:
805 arg_kill_signal = signal_from_string_try_harder(optarg);
806 if (arg_kill_signal < 0) {
807 log_error("Cannot parse signal: %s", optarg);
808 return -EINVAL;
809 }
810
f757855e
LP
811 arg_settings_mask |= SETTING_KILL_SIGNAL;
812 break;
813
814 case ARG_SETTINGS:
815
816 /* no → do not read files
817 * yes → read files, do not override cmdline, trust only subset
818 * override → read files, override cmdline, trust only subset
819 * trusted → read files, do not override cmdline, trust all
820 */
821
822 r = parse_boolean(optarg);
823 if (r < 0) {
824 if (streq(optarg, "trusted")) {
825 mask_all_settings = false;
826 mask_no_settings = false;
827 arg_settings_trusted = true;
828
829 } else if (streq(optarg, "override")) {
830 mask_all_settings = false;
831 mask_no_settings = true;
832 arg_settings_trusted = -1;
833 } else
834 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
835 } else if (r > 0) {
836 /* yes */
837 mask_all_settings = false;
838 mask_no_settings = false;
839 arg_settings_trusted = -1;
840 } else {
841 /* no */
842 mask_all_settings = true;
843 mask_no_settings = false;
844 arg_settings_trusted = false;
845 }
846
c6c8f6e2
LP
847 break;
848
88213476
LP
849 case '?':
850 return -EINVAL;
851
852 default:
eb9da376 853 assert_not_reached("Unhandled option");
88213476 854 }
88213476 855
eb91eb18
LP
856 if (arg_share_system)
857 arg_register = false;
858
859 if (arg_boot && arg_share_system) {
860 log_error("--boot and --share-system may not be combined.");
861 return -EINVAL;
862 }
863
89f7c846
LP
864 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
865 log_error("--keep-unit may not be used when invoked from a user session.");
866 return -EINVAL;
867 }
868
1b9e5b12
LP
869 if (arg_directory && arg_image) {
870 log_error("--directory= and --image= may not be combined.");
871 return -EINVAL;
872 }
873
ec16945e
LP
874 if (arg_template && arg_image) {
875 log_error("--template= and --image= may not be combined.");
876 return -EINVAL;
877 }
878
879 if (arg_template && !(arg_directory || arg_machine)) {
880 log_error("--template= needs --directory= or --machine=.");
881 return -EINVAL;
882 }
883
884 if (arg_ephemeral && arg_template) {
885 log_error("--ephemeral and --template= may not be combined.");
886 return -EINVAL;
887 }
888
889 if (arg_ephemeral && arg_image) {
890 log_error("--ephemeral and --image= may not be combined.");
891 return -EINVAL;
892 }
893
df9a75e4
LP
894 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
895 log_error("--ephemeral and --link-journal= may not be combined.");
896 return -EINVAL;
897 }
898
f757855e
LP
899 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
900 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
901
902 if (argc > optind) {
903 arg_parameters = strv_copy(argv + optind);
904 if (!arg_parameters)
905 return log_oom();
906
907 arg_settings_mask |= SETTING_BOOT;
908 }
909
910 /* Load all settings from .nspawn files */
911 if (mask_no_settings)
912 arg_settings_mask = 0;
913
914 /* Don't load any settings from .nspawn files */
915 if (mask_all_settings)
916 arg_settings_mask = _SETTINGS_MASK_ALL;
917
918 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
919
920 r = detect_unified_cgroup_hierarchy();
921 if (r < 0)
922 return r;
923
924 return 1;
925}
926
927static int verify_arguments(void) {
928
929 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
930 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
931 return -EINVAL;
932 }
933
6d0b55c2
LP
934 if (arg_expose_ports && !arg_private_network) {
935 log_error("Cannot use --port= without private networking.");
936 return -EINVAL;
937 }
938
c6c8f6e2
LP
939 if (arg_boot && arg_kill_signal <= 0)
940 arg_kill_signal = SIGRTMIN+3;
941
f757855e 942 return 0;
88213476
LP
943}
944
03cfe0d5
LP
945static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
946 assert(p);
947
948 if (!arg_userns)
949 return 0;
950
951 if (uid == UID_INVALID && gid == GID_INVALID)
952 return 0;
953
954 if (uid != UID_INVALID) {
955 uid += arg_uid_shift;
956
957 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
958 return -EOVERFLOW;
959 }
960
961 if (gid != GID_INVALID) {
962 gid += (gid_t) arg_uid_shift;
963
964 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
965 return -EOVERFLOW;
966 }
967
968 if (lchown(p, uid, gid) < 0)
969 return -errno;
b12afc8c
LP
970
971 return 0;
972}
973
03cfe0d5
LP
/* Create directory "path" below "root" with the given mode and hand it to
 * the (shifted) uid/gid via userns_lchown().
 *
 * An already existing directory counts as success and is left untouched,
 * including its ownership. Returns 0 on success, -errno from mkdir(), or
 * the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
986
e58a1277 987static int setup_timezone(const char *dest) {
03cfe0d5
LP
988 _cleanup_free_ char *p = NULL, *q = NULL;
989 const char *where, *check, *what;
d4036145
LP
990 char *z, *y;
991 int r;
f8440af5 992
e58a1277
LP
993 assert(dest);
994
995 /* Fix the timezone, if possible */
d4036145
LP
996 r = readlink_malloc("/etc/localtime", &p);
997 if (r < 0) {
998 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
999 return 0;
1000 }
1001
1002 z = path_startswith(p, "../usr/share/zoneinfo/");
1003 if (!z)
1004 z = path_startswith(p, "/usr/share/zoneinfo/");
1005 if (!z) {
1006 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1007 return 0;
1008 }
1009
03cfe0d5 1010 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1011 r = readlink_malloc(where, &q);
1012 if (r >= 0) {
1013 y = path_startswith(q, "../usr/share/zoneinfo/");
1014 if (!y)
1015 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1016
d4036145
LP
1017 /* Already pointing to the right place? Then do nothing .. */
1018 if (y && streq(y, z))
1019 return 0;
1020 }
1021
03cfe0d5
LP
1022 check = strjoina("/usr/share/zoneinfo/", z);
1023 check = prefix_root(dest, check);
1024 if (laccess(check, F_OK) < 0) {
d4036145
LP
1025 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1026 return 0;
1027 }
68fb0892 1028
79d80fc1
TG
1029 r = unlink(where);
1030 if (r < 0 && errno != ENOENT) {
56f64d95 1031 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1032 return 0;
1033 }
4d9f07b4 1034
03cfe0d5 1035 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1036 if (symlink(what, where) < 0) {
56f64d95 1037 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1038 return 0;
1039 }
e58a1277 1040
03cfe0d5
LP
1041 r = userns_lchown(where, 0, 0);
1042 if (r < 0)
1043 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1044
e58a1277 1045 return 0;
88213476
LP
1046}
1047
2547bb41 1048static int setup_resolv_conf(const char *dest) {
03cfe0d5 1049 const char *where = NULL;
79d80fc1 1050 int r;
2547bb41
LP
1051
1052 assert(dest);
1053
1054 if (arg_private_network)
1055 return 0;
1056
1057 /* Fix resolv.conf, if possible */
03cfe0d5 1058 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1059
f2068bcc 1060 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1061 if (r < 0) {
68a313c5
LP
1062 /* If the file already exists as symlink, let's
1063 * suppress the warning, under the assumption that
1064 * resolved or something similar runs inside and the
1065 * symlink points there.
1066 *
1067 * If the disk image is read-only, there's also no
1068 * point in complaining.
1069 */
1070 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1071 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1072 return 0;
1073 }
2547bb41 1074
03cfe0d5
LP
1075 r = userns_lchown(where, 0, 0);
1076 if (r < 0)
1077 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1078
2547bb41
LP
1079 return 0;
1080}
1081
9f24adc2 1082static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1083 assert(s);
9f24adc2
LP
1084
1085 snprintf(s, 37,
1086 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1087 SD_ID128_FORMAT_VAL(id));
1088
1089 return s;
1090}
1091
04bc4a3f 1092static int setup_boot_id(const char *dest) {
03cfe0d5 1093 const char *from, *to;
39883f62 1094 sd_id128_t rnd = {};
04bc4a3f
LP
1095 char as_uuid[37];
1096 int r;
1097
eb91eb18
LP
1098 if (arg_share_system)
1099 return 0;
1100
04bc4a3f
LP
1101 /* Generate a new randomized boot ID, so that each boot-up of
1102 * the container gets a new one */
1103
03cfe0d5
LP
1104 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1105 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1106
1107 r = sd_id128_randomize(&rnd);
f647962d
MS
1108 if (r < 0)
1109 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1110
9f24adc2 1111 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1112
4c1fc3e4 1113 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1114 if (r < 0)
1115 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1116
03cfe0d5
LP
1117 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1118 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1119 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1120 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1121
1122 unlink(from);
04bc4a3f
LP
1123 return r;
1124}
1125
e58a1277 1126static int copy_devnodes(const char *dest) {
88213476
LP
1127
1128 static const char devnodes[] =
1129 "null\0"
1130 "zero\0"
1131 "full\0"
1132 "random\0"
1133 "urandom\0"
85614d66
TG
1134 "tty\0"
1135 "net/tun\0";
88213476
LP
1136
1137 const char *d;
e58a1277 1138 int r = 0;
7fd1b19b 1139 _cleanup_umask_ mode_t u;
a258bf26
LP
1140
1141 assert(dest);
124640f1
LP
1142
1143 u = umask(0000);
88213476 1144
03cfe0d5
LP
1145 /* Create /dev/net, so that we can create /dev/net/tun in it */
1146 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1147 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1148
88213476 1149 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1150 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1151 struct stat st;
88213476 1152
7f112f50 1153 from = strappend("/dev/", d);
03cfe0d5 1154 to = prefix_root(dest, from);
88213476
LP
1155
1156 if (stat(from, &st) < 0) {
1157
4a62c710
MS
1158 if (errno != ENOENT)
1159 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1160
a258bf26 1161 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1162
03cfe0d5 1163 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1164 return -EIO;
a258bf26 1165
85614d66 1166 } else {
81f5049b
AC
1167 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1168 if (errno != EPERM)
1169 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1170
1171 /* Some systems abusively restrict mknod but
1172 * allow bind mounts. */
1173 r = touch(to);
1174 if (r < 0)
1175 return log_error_errno(r, "touch (%s) failed: %m", to);
1176 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1177 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1178 }
6278cf60 1179
03cfe0d5
LP
1180 r = userns_lchown(to, 0, 0);
1181 if (r < 0)
1182 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1183 }
88213476
LP
1184 }
1185
e58a1277
LP
1186 return r;
1187}
88213476 1188
03cfe0d5
LP
1189static int setup_pts(const char *dest) {
1190 _cleanup_free_ char *options = NULL;
1191 const char *p;
1192
1193#ifdef HAVE_SELINUX
1194 if (arg_selinux_apifs_context)
1195 (void) asprintf(&options,
3dce8915 1196 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1197 arg_uid_shift + TTY_GID,
1198 arg_selinux_apifs_context);
1199 else
1200#endif
1201 (void) asprintf(&options,
3dce8915 1202 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1203 arg_uid_shift + TTY_GID);
f2d88580 1204
03cfe0d5 1205 if (!options)
f2d88580
LP
1206 return log_oom();
1207
03cfe0d5 1208 /* Mount /dev/pts itself */
cc9fce65 1209 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1210 if (mkdir(p, 0755) < 0)
1211 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1212 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1213 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1214 if (userns_lchown(p, 0, 0) < 0)
1215 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1216
1217 /* Create /dev/ptmx symlink */
1218 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1219 if (symlink("pts/ptmx", p) < 0)
1220 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1221 if (userns_lchown(p, 0, 0) < 0)
1222 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1223
03cfe0d5
LP
1224 /* And fix /dev/pts/ptmx ownership */
1225 p = prefix_roota(dest, "/dev/pts/ptmx");
1226 if (userns_lchown(p, 0, 0) < 0)
1227 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1228
f2d88580
LP
1229 return 0;
1230}
1231
e58a1277 1232static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1233 _cleanup_umask_ mode_t u;
1234 const char *to;
e58a1277 1235 int r;
e58a1277
LP
1236
1237 assert(dest);
1238 assert(console);
1239
1240 u = umask(0000);
1241
03cfe0d5 1242 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1245
a258bf26
LP
1246 /* We need to bind mount the right tty to /dev/console since
1247 * ptys can only exist on pts file systems. To have something
81f5049b 1248 * to bind mount things on we create a empty regular file. */
a258bf26 1249
03cfe0d5 1250 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1251 r = touch(to);
1252 if (r < 0)
1253 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1254
4543768d 1255 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1256 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1257
25ea79fe 1258 return 0;
e58a1277
LP
1259}
1260
1261static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1262 const char *from, *to;
7fd1b19b 1263 _cleanup_umask_ mode_t u;
d9603714 1264 int fd, r;
e58a1277 1265
e58a1277 1266 assert(kmsg_socket >= 0);
a258bf26 1267
e58a1277 1268 u = umask(0000);
a258bf26 1269
03cfe0d5 1270 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1271 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1272 * on the reading side behave very similar to /proc/kmsg,
1273 * their writing side behaves differently from /dev/kmsg in
1274 * that writing blocks when nothing is reading. In order to
1275 * avoid any problems with containers deadlocking due to this
1276 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1277 from = prefix_roota(dest, "/run/kmsg");
1278 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1279
4a62c710 1280 if (mkfifo(from, 0600) < 0)
03cfe0d5 1281 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1282 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1283 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1284
1285 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1286 if (fd < 0)
1287 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1288
e58a1277
LP
1289 /* Store away the fd in the socket, so that it stays open as
1290 * long as we run the child */
3ee897d6 1291 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1292 safe_close(fd);
e58a1277 1293
d9603714
DH
1294 if (r < 0)
1295 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1296
03cfe0d5
LP
1297 /* And now make the FIFO unavailable as /run/kmsg... */
1298 (void) unlink(from);
1299
25ea79fe 1300 return 0;
88213476
LP
1301}
1302
1c4baffc 1303static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1304 union in_addr_union *exposed = userdata;
1305
1306 assert(rtnl);
1307 assert(m);
1308 assert(exposed);
1309
7a8f6325 1310 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1311 return 0;
1312}
1313
3a74cea5 1314static int setup_hostname(void) {
3a74cea5 1315
eb91eb18
LP
1316 if (arg_share_system)
1317 return 0;
1318
605f81a8 1319 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1320 return -errno;
3a74cea5 1321
7027ff61 1322 return 0;
3a74cea5
LP
1323}
1324
57fb9fb5 1325static int setup_journal(const char *directory) {
4d680aee 1326 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1327 _cleanup_free_ char *b = NULL, *d = NULL;
1328 const char *etc_machine_id, *p, *q;
27407a01 1329 char *id;
57fb9fb5
LP
1330 int r;
1331
df9a75e4
LP
1332 /* Don't link journals in ephemeral mode */
1333 if (arg_ephemeral)
1334 return 0;
1335
03cfe0d5 1336 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1337
03cfe0d5 1338 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1339 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1340 return 0;
f647962d 1341 else if (r < 0)
03cfe0d5 1342 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1343
27407a01
ZJS
1344 id = strstrip(b);
1345 if (isempty(id) && arg_link_journal == LINK_AUTO)
1346 return 0;
57fb9fb5 1347
27407a01
ZJS
1348 /* Verify validity */
1349 r = sd_id128_from_string(id, &machine_id);
f647962d 1350 if (r < 0)
03cfe0d5 1351 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1352
4d680aee 1353 r = sd_id128_get_machine(&this_id);
f647962d
MS
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1356
1357 if (sd_id128_equal(machine_id, this_id)) {
1358 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1359 "Host and machine ids are equal (%s): refusing to link journals", id);
1360 if (arg_link_journal == LINK_AUTO)
1361 return 0;
df9a75e4 1362 return -EEXIST;
4d680aee
ZJS
1363 }
1364
1365 if (arg_link_journal == LINK_NO)
1366 return 0;
1367
03cfe0d5
LP
1368 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1369 if (r < 0)
1370 return log_error_errno(r, "Failed to create /var: %m");
1371
1372 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to create /var/log: %m");
1375
1376 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1377 if (r < 0)
1378 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1379
1380 p = strjoina("/var/log/journal/", id);
1381 q = prefix_roota(directory, p);
27407a01 1382
e26d6ce5 1383 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1384 if (arg_link_journal != LINK_AUTO) {
1385 log_error("%s: already a mount point, refusing to use for journal", p);
1386 return -EEXIST;
1387 }
1388
1389 return 0;
57fb9fb5
LP
1390 }
1391
e26d6ce5 1392 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1393 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1394 log_error("%s: already a mount point, refusing to use for journal", q);
1395 return -EEXIST;
57fb9fb5
LP
1396 }
1397
27407a01 1398 return 0;
57fb9fb5
LP
1399 }
1400
1401 r = readlink_and_make_absolute(p, &d);
1402 if (r >= 0) {
1403 if ((arg_link_journal == LINK_GUEST ||
1404 arg_link_journal == LINK_AUTO) &&
1405 path_equal(d, q)) {
1406
03cfe0d5 1407 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1408 if (r < 0)
56f64d95 1409 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1410 return 0;
57fb9fb5
LP
1411 }
1412
4a62c710
MS
1413 if (unlink(p) < 0)
1414 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1415 } else if (r == -EINVAL) {
1416
1417 if (arg_link_journal == LINK_GUEST &&
1418 rmdir(p) < 0) {
1419
27407a01
ZJS
1420 if (errno == ENOTDIR) {
1421 log_error("%s already exists and is neither a symlink nor a directory", p);
1422 return r;
1423 } else {
56f64d95 1424 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1425 return -errno;
57fb9fb5 1426 }
57fb9fb5
LP
1427 }
1428 } else if (r != -ENOENT) {
56f64d95 1429 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1430 return r;
57fb9fb5
LP
1431 }
1432
1433 if (arg_link_journal == LINK_GUEST) {
1434
1435 if (symlink(q, p) < 0) {
574edc90 1436 if (arg_link_journal_try) {
56f64d95 1437 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1438 return 0;
1439 } else {
56f64d95 1440 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1441 return -errno;
1442 }
57fb9fb5
LP
1443 }
1444
03cfe0d5 1445 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1446 if (r < 0)
56f64d95 1447 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1448 return 0;
57fb9fb5
LP
1449 }
1450
1451 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1452 /* don't create parents here -- if the host doesn't have
1453 * permanent journal set up, don't force it here */
1454 r = mkdir(p, 0755);
57fb9fb5 1455 if (r < 0) {
574edc90 1456 if (arg_link_journal_try) {
56f64d95 1457 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1458 return 0;
1459 } else {
56f64d95 1460 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1461 return r;
1462 }
57fb9fb5
LP
1463 }
1464
27407a01
ZJS
1465 } else if (access(p, F_OK) < 0)
1466 return 0;
57fb9fb5 1467
cdb2b9d0
LP
1468 if (dir_is_empty(q) == 0)
1469 log_warning("%s is not empty, proceeding anyway.", q);
1470
03cfe0d5 1471 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 1472 if (r < 0) {
56f64d95 1473 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1474 return r;
57fb9fb5
LP
1475 }
1476
4543768d 1477 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1478 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1479
27407a01 1480 return 0;
57fb9fb5
LP
1481}
1482
88213476 1483static int drop_capabilities(void) {
5076f0cc 1484 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1485}
1486
db999e0f
LP
1487static int reset_audit_loginuid(void) {
1488 _cleanup_free_ char *p = NULL;
1489 int r;
1490
1491 if (arg_share_system)
1492 return 0;
1493
1494 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1495 if (r == -ENOENT)
db999e0f 1496 return 0;
f647962d
MS
1497 if (r < 0)
1498 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1499
1500 /* Already reset? */
1501 if (streq(p, "4294967295"))
1502 return 0;
1503
ad118bda 1504 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1505 if (r < 0) {
10a87006
LP
1506 log_error_errno(r,
1507 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1508 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1509 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1510 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1511 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1512
db999e0f 1513 sleep(5);
77b6e194 1514 }
db999e0f
LP
1515
1516 return 0;
77b6e194
LP
1517}
1518
/* Builds and loads a seccomp filter that allows everything by default, but
 *
 *   - makes the system calls from the table below fail with EPERM, unless the capability listed next to them
 *     is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, if seccomp_load() reports -EINVAL, or if built without HAVE_SECCOMP; a negative
 * errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned idx;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {
                uint64_t cap_bit = 1ULL << blacklist[idx].capability;

                /* If the capability is retained, the system call stays available */
                if (arg_retain & cap_bit)
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);
                if (r < 0 && r != -EFAULT) { /* -EFAULT: unknown syscall, which is fine */
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit hookup fails when running inside of one.
         * We don't care, and simply turn off creation of audit sockets. The rule below makes
         * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT, which audit userspace takes as
         * indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1613
785890ac
LP
1614static int setup_propagate(const char *root) {
1615 const char *p, *q;
1616
1617 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1618 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1619 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1620 (void) mkdir_p(p, 0600);
1621
03cfe0d5
LP
1622 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1623 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1624
1625 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1626 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1627
1628 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1629 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1630
03cfe0d5 1631 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1632 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1633 return log_error_errno(errno, "Failed to install propagation bind mount.");
1634
1635 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1636 return log_error_errno(errno, "Failed to make propagation mount read-only");
1637
1638 return 0;
1639}
1640
1b9e5b12
LP
1641static int setup_image(char **device_path, int *loop_nr) {
1642 struct loop_info64 info = {
1643 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1644 };
1645 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1646 _cleanup_free_ char* loopdev = NULL;
1647 struct stat st;
1648 int r, nr;
1649
1650 assert(device_path);
1651 assert(loop_nr);
ec16945e 1652 assert(arg_image);
1b9e5b12
LP
1653
1654 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1655 if (fd < 0)
1656 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1657
4a62c710
MS
1658 if (fstat(fd, &st) < 0)
1659 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1660
1661 if (S_ISBLK(st.st_mode)) {
1662 char *p;
1663
1664 p = strdup(arg_image);
1665 if (!p)
1666 return log_oom();
1667
1668 *device_path = p;
1669
1670 *loop_nr = -1;
1671
1672 r = fd;
1673 fd = -1;
1674
1675 return r;
1676 }
1677
1678 if (!S_ISREG(st.st_mode)) {
56f64d95 1679 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
1680 return -EINVAL;
1681 }
1682
1683 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1684 if (control < 0)
1685 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1686
1687 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1688 if (nr < 0)
1689 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1690
1691 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1692 return log_oom();
1693
1694 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1695 if (loop < 0)
1696 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1697
4a62c710
MS
1698 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1699 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1700
1701 if (arg_read_only)
1702 info.lo_flags |= LO_FLAGS_READ_ONLY;
1703
4a62c710
MS
1704 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1705 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1706
1707 *device_path = loopdev;
1708 loopdev = NULL;
1709
1710 *loop_nr = nr;
1711
1712 r = loop;
1713 loop = -1;
1714
1715 return r;
1716}
1717
ada4799a
LP
/* Explanation of the supported disk image layouts, appended to the error messages logged when the partition
 * table of an image cannot be used. */
#define PARTITION_TABLE_BLURB                                                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n"             \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                         \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                              \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                 \
        "to be bootable with systemd-nspawn."
1724
1b9e5b12
LP
1725static int dissect_image(
1726 int fd,
727fd4fd
LP
1727 char **root_device, bool *root_device_rw,
1728 char **home_device, bool *home_device_rw,
1729 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1730 bool *secondary) {
1731
1732#ifdef HAVE_BLKID
01dc33ce
ZJS
1733 int home_nr = -1, srv_nr = -1;
1734#ifdef GPT_ROOT_NATIVE
1735 int root_nr = -1;
1736#endif
1737#ifdef GPT_ROOT_SECONDARY
1738 int secondary_root_nr = -1;
1739#endif
f6c51a81 1740 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1741 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1742 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1743 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1744 _cleanup_udev_unref_ struct udev *udev = NULL;
1745 struct udev_list_entry *first, *item;
f6c51a81 1746 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1747 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1748 const char *pttype = NULL;
1749 blkid_partlist pl;
1750 struct stat st;
c09ef2e4 1751 unsigned i;
1b9e5b12
LP
1752 int r;
1753
1754 assert(fd >= 0);
1755 assert(root_device);
1756 assert(home_device);
1757 assert(srv_device);
1758 assert(secondary);
ec16945e 1759 assert(arg_image);
1b9e5b12
LP
1760
1761 b = blkid_new_probe();
1762 if (!b)
1763 return log_oom();
1764
1765 errno = 0;
1766 r = blkid_probe_set_device(b, fd, 0, 0);
1767 if (r != 0) {
1768 if (errno == 0)
1769 return log_oom();
1770
56f64d95 1771 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1772 return -errno;
1773 }
1774
1775 blkid_probe_enable_partitions(b, 1);
1776 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1777
1778 errno = 0;
1779 r = blkid_do_safeprobe(b);
1780 if (r == -2 || r == 1) {
ada4799a
LP
1781 log_error("Failed to identify any partition table on\n"
1782 " %s\n"
1783 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1784 return -EINVAL;
1785 } else if (r != 0) {
1786 if (errno == 0)
1787 errno = EIO;
56f64d95 1788 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1789 return -errno;
1790 }
1791
48861960 1792 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1793
1794 is_gpt = streq_ptr(pttype, "gpt");
1795 is_mbr = streq_ptr(pttype, "dos");
1796
1797 if (!is_gpt && !is_mbr) {
1798 log_error("No GPT or MBR partition table discovered on\n"
1799 " %s\n"
1800 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1801 return -EINVAL;
1802 }
1803
1804 errno = 0;
1805 pl = blkid_probe_get_partitions(b);
1806 if (!pl) {
1807 if (errno == 0)
1808 return log_oom();
1809
1810 log_error("Failed to list partitions of %s", arg_image);
1811 return -errno;
1812 }
1813
1814 udev = udev_new();
1815 if (!udev)
1816 return log_oom();
1817
4a62c710
MS
1818 if (fstat(fd, &st) < 0)
1819 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1820
c09ef2e4
LP
1821 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1822 if (!d)
1b9e5b12
LP
1823 return log_oom();
1824
c09ef2e4
LP
1825 for (i = 0;; i++) {
1826 int n, m;
1b9e5b12 1827
c09ef2e4
LP
1828 if (i >= 10) {
1829 log_error("Kernel partitions never appeared.");
1830 return -ENXIO;
1831 }
1832
1833 e = udev_enumerate_new(udev);
1834 if (!e)
1835 return log_oom();
1836
1837 r = udev_enumerate_add_match_parent(e, d);
1838 if (r < 0)
1839 return log_oom();
1840
1841 r = udev_enumerate_scan_devices(e);
1842 if (r < 0)
1843 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1844
1845 /* Count the partitions enumerated by the kernel */
1846 n = 0;
1847 first = udev_enumerate_get_list_entry(e);
1848 udev_list_entry_foreach(item, first)
1849 n++;
1850
1851 /* Count the partitions enumerated by blkid */
1852 m = blkid_partlist_numof_partitions(pl);
1853 if (n == m + 1)
1854 break;
1855 if (n > m + 1) {
1856 log_error("blkid and kernel partition list do not match.");
1857 return -EIO;
1858 }
1859 if (n < m + 1) {
1860 unsigned j;
1861
1862 /* The kernel has probed fewer partitions than
1863 * blkid? Maybe the kernel prober is still
1864 * running or it got EBUSY because udev
1865 * already opened the device. Let's reprobe
1866 * the device, which is a synchronous call
1867 * that waits until probing is complete. */
1868
1869 for (j = 0; j < 20; j++) {
1870
1871 r = ioctl(fd, BLKRRPART, 0);
1872 if (r < 0)
1873 r = -errno;
1874 if (r >= 0 || r != -EBUSY)
1875 break;
1876
1877 /* If something else has the device
1878 * open, such as an udev rule, the
1879 * ioctl will return EBUSY. Since
1880 * there's no way to wait until it
1881 * isn't busy anymore, let's just wait
1882 * a bit, and try again.
1883 *
1884 * This is really something they
1885 * should fix in the kernel! */
1886
1887 usleep(50 * USEC_PER_MSEC);
1888 }
1889
1890 if (r < 0)
1891 return log_error_errno(r, "Failed to reread partition table: %m");
1892 }
1893
1894 e = udev_enumerate_unref(e);
1895 }
1b9e5b12
LP
1896
1897 first = udev_enumerate_get_list_entry(e);
1898 udev_list_entry_foreach(item, first) {
1899 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1900 const char *node;
727fd4fd 1901 unsigned long long flags;
1b9e5b12
LP
1902 blkid_partition pp;
1903 dev_t qn;
1904 int nr;
1905
1906 errno = 0;
1907 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1908 if (!q) {
1909 if (!errno)
1910 errno = ENOMEM;
1911
56f64d95 1912 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1913 return -errno;
1914 }
1915
1916 qn = udev_device_get_devnum(q);
1917 if (major(qn) == 0)
1918 continue;
1919
1920 if (st.st_rdev == qn)
1921 continue;
1922
1923 node = udev_device_get_devnode(q);
1924 if (!node)
1925 continue;
1926
1927 pp = blkid_partlist_devno_to_partition(pl, qn);
1928 if (!pp)
1929 continue;
1930
727fd4fd 1931 flags = blkid_partition_get_flags(pp);
727fd4fd 1932
1b9e5b12
LP
1933 nr = blkid_partition_get_partno(pp);
1934 if (nr < 0)
1935 continue;
1936
ada4799a
LP
1937 if (is_gpt) {
1938 sd_id128_t type_id;
1939 const char *stype;
1b9e5b12 1940
f6c51a81
LP
1941 if (flags & GPT_FLAG_NO_AUTO)
1942 continue;
1943
ada4799a
LP
1944 stype = blkid_partition_get_type_string(pp);
1945 if (!stype)
1946 continue;
1b9e5b12 1947
ada4799a 1948 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1949 continue;
1950
ada4799a 1951 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1952
ada4799a
LP
1953 if (home && nr >= home_nr)
1954 continue;
1b9e5b12 1955
ada4799a
LP
1956 home_nr = nr;
1957 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1958
ada4799a
LP
1959 r = free_and_strdup(&home, node);
1960 if (r < 0)
1961 return log_oom();
727fd4fd 1962
ada4799a
LP
1963 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1964
1965 if (srv && nr >= srv_nr)
1966 continue;
1967
1968 srv_nr = nr;
1969 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1970
1971 r = free_and_strdup(&srv, node);
1972 if (r < 0)
1973 return log_oom();
1974 }
1b9e5b12 1975#ifdef GPT_ROOT_NATIVE
ada4799a 1976 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1977
ada4799a
LP
1978 if (root && nr >= root_nr)
1979 continue;
1b9e5b12 1980
ada4799a
LP
1981 root_nr = nr;
1982 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1983
ada4799a
LP
1984 r = free_and_strdup(&root, node);
1985 if (r < 0)
1986 return log_oom();
1987 }
1b9e5b12
LP
1988#endif
1989#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1990 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1991
1992 if (secondary_root && nr >= secondary_root_nr)
1993 continue;
1994
1995 secondary_root_nr = nr;
1996 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1997
1998 r = free_and_strdup(&secondary_root, node);
1999 if (r < 0)
2000 return log_oom();
2001 }
2002#endif
f6c51a81
LP
2003 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2004
2005 if (generic)
2006 multiple_generic = true;
2007 else {
2008 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2009
2010 r = free_and_strdup(&generic, node);
2011 if (r < 0)
2012 return log_oom();
2013 }
2014 }
ada4799a
LP
2015
2016 } else if (is_mbr) {
2017 int type;
1b9e5b12 2018
f6c51a81
LP
2019 if (flags != 0x80) /* Bootable flag */
2020 continue;
2021
ada4799a
LP
2022 type = blkid_partition_get_type(pp);
2023 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2024 continue;
2025
f6c51a81
LP
2026 if (generic)
2027 multiple_generic = true;
2028 else {
2029 generic_rw = true;
727fd4fd 2030
f6c51a81
LP
2031 r = free_and_strdup(&root, node);
2032 if (r < 0)
2033 return log_oom();
2034 }
1b9e5b12 2035 }
1b9e5b12
LP
2036 }
2037
1b9e5b12
LP
2038 if (root) {
2039 *root_device = root;
2040 root = NULL;
727fd4fd
LP
2041
2042 *root_device_rw = root_rw;
1b9e5b12
LP
2043 *secondary = false;
2044 } else if (secondary_root) {
2045 *root_device = secondary_root;
2046 secondary_root = NULL;
727fd4fd
LP
2047
2048 *root_device_rw = secondary_root_rw;
1b9e5b12 2049 *secondary = true;
f6c51a81
LP
2050 } else if (generic) {
2051
2052 /* There were no partitions with precise meanings
2053 * around, but we found generic partitions. In this
2054 * case, if there's only one, we can go ahead and boot
2055 * it, otherwise we bail out, because we really cannot
2056 * make any sense of it. */
2057
2058 if (multiple_generic) {
2059 log_error("Identified multiple bootable Linux partitions on\n"
2060 " %s\n"
2061 PARTITION_TABLE_BLURB, arg_image);
2062 return -EINVAL;
2063 }
2064
2065 *root_device = generic;
2066 generic = NULL;
2067
2068 *root_device_rw = generic_rw;
2069 *secondary = false;
2070 } else {
2071 log_error("Failed to identify root partition in disk image\n"
2072 " %s\n"
2073 PARTITION_TABLE_BLURB, arg_image);
2074 return -EINVAL;
1b9e5b12
LP
2075 }
2076
2077 if (home) {
2078 *home_device = home;
2079 home = NULL;
727fd4fd
LP
2080
2081 *home_device_rw = home_rw;
1b9e5b12
LP
2082 }
2083
2084 if (srv) {
2085 *srv_device = srv;
2086 srv = NULL;
727fd4fd
LP
2087
2088 *srv_device_rw = srv_rw;
1b9e5b12
LP
2089 }
2090
2091 return 0;
2092#else
2093 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2094 return -EOPNOTSUPP;
1b9e5b12
LP
2095#endif
2096}
2097
/* Probes the file system type of the block device node 'what' with libblkid
 * and mounts it on 'where', or on 'where' with 'directory' appended if
 * 'directory' is non-NULL. The mount is done read-only unless 'rw' is true
 * and arg_read_only is unset.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support compiled in this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first and
         * treat "still zero" as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Return what the logging call hands back instead of
                 * re-reading errno, which logging may have modified. */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. In both cases
                 * there is no usable file system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Any other non-zero value (notably -1) is a probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Latch the error code before logging can touch errno */
                r = errno > 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2161
727fd4fd
LP
2162static int mount_devices(
2163 const char *where,
2164 const char *root_device, bool root_device_rw,
2165 const char *home_device, bool home_device_rw,
2166 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2167 int r;
2168
2169 assert(where);
2170
2171 if (root_device) {
727fd4fd 2172 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2175 }
2176
2177 if (home_device) {
727fd4fd 2178 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2181 }
2182
2183 if (srv_device) {
727fd4fd 2184 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2185 if (r < 0)
2186 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2187 }
2188
2189 return 0;
2190}
2191
2192static void loop_remove(int nr, int *image_fd) {
2193 _cleanup_close_ int control = -1;
e8c8ddcc 2194 int r;
1b9e5b12
LP
2195
2196 if (nr < 0)
2197 return;
2198
2199 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2200 r = ioctl(*image_fd, LOOP_CLR_FD);
2201 if (r < 0)
5e4074aa 2202 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2203 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2204 }
2205
2206 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2207 if (control < 0) {
56f64d95 2208 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2209 return;
e8c8ddcc 2210 }
1b9e5b12 2211
e8c8ddcc
TG
2212 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2213 if (r < 0)
5e4074aa 2214 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2215}
2216
113cea80 2217/*
6d416b9c
LS
2218 * Return values:
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
919699ec
LP
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
6d416b9c
LS
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2230 *
6d416b9c
LS
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
113cea80
DH
2233 */
2234static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2235 siginfo_t status;
919699ec 2236 int r;
113cea80
DH
2237
2238 r = wait_for_terminate(pid, &status);
f647962d
MS
2239 if (r < 0)
2240 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2241
2242 switch (status.si_code) {
fddbb89c 2243
113cea80 2244 case CLD_EXITED:
919699ec
LP
2245 if (status.si_status == 0) {
2246 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2247
fddbb89c 2248 } else
919699ec 2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2250
919699ec
LP
2251 *container = CONTAINER_TERMINATED;
2252 return status.si_status;
113cea80
DH
2253
2254 case CLD_KILLED:
2255 if (status.si_status == SIGINT) {
113cea80 2256
919699ec 2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2258 *container = CONTAINER_TERMINATED;
919699ec
LP
2259 return 0;
2260
113cea80 2261 } else if (status.si_status == SIGHUP) {
113cea80 2262
919699ec 2263 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2264 *container = CONTAINER_REBOOTED;
919699ec 2265 return 0;
113cea80 2266 }
919699ec 2267
113cea80
DH
2268 /* CLD_KILLED fallthrough */
2269
2270 case CLD_DUMPED:
fddbb89c 2271 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2272 return -EIO;
113cea80
DH
2273
2274 default:
fddbb89c 2275 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2276 return -EIO;
113cea80
DH
2277 }
2278
2279 return r;
2280}
2281
023fb90b
LP
2282static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2283 pid_t pid;
2284
2285 pid = PTR_TO_UINT32(userdata);
2286 if (pid > 0) {
c6c8f6e2 2287 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s, NULL);
2290 return 0;
2291 }
2292 }
2293
2294 sd_event_exit(sd_event_source_get_event(s), 0);
2295 return 0;
2296}
2297
ec16945e 2298static int determine_names(void) {
1b9cebf6 2299 int r;
ec16945e 2300
c1521918
LP
2301 if (arg_template && !arg_directory && arg_machine) {
2302
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2306
2307 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2308 if (!arg_directory)
2309 return log_oom();
2310 }
2311
ec16945e 2312 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2313 if (arg_machine) {
2314 _cleanup_(image_unrefp) Image *i = NULL;
2315
2316 r = image_find(arg_machine, &i);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2319 else if (r == 0) {
2320 log_error("No image for machine '%s': %m", arg_machine);
2321 return -ENOENT;
2322 }
2323
aceac2f0 2324 if (i->type == IMAGE_RAW)
1b9cebf6
LP
2325 r = set_sanitized_path(&arg_image, i->path);
2326 else
2327 r = set_sanitized_path(&arg_directory, i->path);
2328 if (r < 0)
2329 return log_error_errno(r, "Invalid image directory: %m");
2330
aee327b8
LP
2331 if (!arg_ephemeral)
2332 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2333 } else
ec16945e
LP
2334 arg_directory = get_current_dir_name();
2335
1b9cebf6
LP
2336 if (!arg_directory && !arg_machine) {
2337 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2338 return -EINVAL;
2339 }
2340 }
2341
2342 if (!arg_machine) {
b9ba4dab
LP
2343 if (arg_directory && path_equal(arg_directory, "/"))
2344 arg_machine = gethostname_malloc();
2345 else
2346 arg_machine = strdup(basename(arg_image ?: arg_directory));
2347
ec16945e
LP
2348 if (!arg_machine)
2349 return log_oom();
2350
ae691c1d 2351 hostname_cleanup(arg_machine);
ec16945e
LP
2352 if (!machine_name_is_valid(arg_machine)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2354 return -EINVAL;
2355 }
b9ba4dab
LP
2356
2357 if (arg_ephemeral) {
2358 char *b;
2359
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2364
2365 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2366 return log_oom();
2367
2368 free(arg_machine);
2369 arg_machine = b;
2370 }
ec16945e
LP
2371 }
2372
2373 return 0;
2374}
2375
03cfe0d5 2376static int determine_uid_shift(const char *directory) {
6dac160c
LP
2377 int r;
2378
03cfe0d5
LP
2379 if (!arg_userns) {
2380 arg_uid_shift = 0;
6dac160c 2381 return 0;
03cfe0d5 2382 }
6dac160c
LP
2383
2384 if (arg_uid_shift == UID_INVALID) {
2385 struct stat st;
2386
03cfe0d5 2387 r = stat(directory, &st);
6dac160c 2388 if (r < 0)
03cfe0d5 2389 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2390
2391 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2392
2393 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2394 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2395 return -EINVAL;
2396 }
2397
2398 arg_uid_range = UINT32_C(0x10000);
2399 }
2400
2401 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2402 log_error("UID base too high for UID range.");
2403 return -EINVAL;
2404 }
2405
2406 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2407 return 0;
2408}
2409
03cfe0d5
LP
2410static int inner_child(
2411 Barrier *barrier,
2412 const char *directory,
2413 bool secondary,
2414 int kmsg_socket,
2415 int rtnl_socket,
f757855e 2416 FDSet *fds) {
69c79d3c 2417
03cfe0d5
LP
2418 _cleanup_free_ char *home = NULL;
2419 unsigned n_env = 2;
2420 const char *envp[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR,
2422 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2423 NULL, /* TERM */
2424 NULL, /* HOME */
2425 NULL, /* USER */
2426 NULL, /* LOGNAME */
2427 NULL, /* container_uuid */
2428 NULL, /* LISTEN_FDS */
2429 NULL, /* LISTEN_PID */
2430 NULL
2431 };
88213476 2432
2371271c 2433 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2434 int r;
88213476 2435
03cfe0d5
LP
2436 assert(barrier);
2437 assert(directory);
2438 assert(kmsg_socket >= 0);
88213476 2439
efdb0237
LP
2440 cg_unified_flush();
2441
03cfe0d5
LP
2442 if (arg_userns) {
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier); /* #1 */
7027ff61 2445
03cfe0d5
LP
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier)) { /* #2 */
2448 log_error("Parent died too early");
2449 return -ESRCH;
2450 }
88213476
LP
2451 }
2452
d1678248 2453 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2454 if (r < 0)
2455 return r;
2456
d8fc6a00
LP
2457 r = mount_sysfs(NULL);
2458 if (r < 0)
2459 return r;
2460
03cfe0d5
LP
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier)) { /* #3 */
2464 log_error("Parent died too early");
2465 return -ESRCH;
88213476
LP
2466 }
2467
e83bebef 2468 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2469 if (r < 0)
2470 return r;
ec16945e 2471
03cfe0d5
LP
2472 r = reset_uid_gid();
2473 if (r < 0)
2474 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2475
03cfe0d5
LP
2476 r = setup_boot_id(NULL);
2477 if (r < 0)
2478 return r;
ec16945e 2479
03cfe0d5
LP
2480 r = setup_kmsg(NULL, kmsg_socket);
2481 if (r < 0)
2482 return r;
2483 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2484
03cfe0d5 2485 umask(0022);
30535c16 2486
03cfe0d5
LP
2487 if (setsid() < 0)
2488 return log_error_errno(errno, "setsid() failed: %m");
2489
2490 if (arg_private_network)
2491 loopback_setup();
2492
7a8f6325
LP
2493 if (arg_expose_ports) {
2494 r = expose_port_send_rtnl(rtnl_socket);
2495 if (r < 0)
2496 return r;
2497 rtnl_socket = safe_close(rtnl_socket);
2498 }
03cfe0d5
LP
2499
2500 if (drop_capabilities() < 0)
2501 return log_error_errno(errno, "drop_capabilities() failed: %m");
2502
2503 setup_hostname();
2504
050f7277 2505 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2506 if (personality(arg_personality) < 0)
2507 return log_error_errno(errno, "personality() failed: %m");
2508 } else if (secondary) {
2509 if (personality(PER_LINUX32) < 0)
2510 return log_error_errno(errno, "personality() failed: %m");
2511 }
2512
2513#ifdef HAVE_SELINUX
2514 if (arg_selinux_context)
2515 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2516 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2517#endif
2518
ee645080 2519 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2520 if (r < 0)
2521 return r;
2522
2523 envp[n_env] = strv_find_prefix(environ, "TERM=");
2524 if (envp[n_env])
2525 n_env ++;
2526
2527 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2528 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2529 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2530 return log_oom();
2531
2532 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2533 char as_uuid[37];
2534
2535 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2536 return log_oom();
2537 }
2538
2539 if (fdset_size(fds) > 0) {
2540 r = fdset_cloexec(fds, false);
2541 if (r < 0)
2542 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2543
2544 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2545 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2546 return log_oom();
2547 }
2548
2371271c
TG
2549 env_use = strv_env_merge(2, envp, arg_setenv);
2550 if (!env_use)
2551 return log_oom();
03cfe0d5
LP
2552
2553 /* Let the parent know that we are ready and
2554 * wait until the parent is ready with the
2555 * setup, too... */
2556 if (!barrier_place_and_sync(barrier)) { /* #4 */
2557 log_error("Parent died too early");
2558 return -ESRCH;
2559 }
2560
2561 /* Now, explicitly close the log, so that we
2562 * then can close all remaining fds. Closing
2563 * the log explicitly first has the benefit
2564 * that the logging subsystem knows about it,
2565 * and is thus ready to be reopened should we
2566 * need it again. Note that the other fds
2567 * closed here are at least the locking and
2568 * barrier fds. */
2569 log_close();
2570 (void) fdset_close_others(fds);
2571
2572 if (arg_boot) {
2573 char **a;
2574 size_t m;
2575
2576 /* Automatically search for the init system */
2577
f757855e 2578 m = 1 + strv_length(arg_parameters);
03cfe0d5 2579 a = newa(char*, m + 1);
f757855e
LP
2580 if (strv_isempty(arg_parameters))
2581 a[1] = NULL;
2582 else
2583 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2584
2585 a[0] = (char*) "/usr/lib/systemd/systemd";
2586 execve(a[0], a, env_use);
2587
2588 a[0] = (char*) "/lib/systemd/systemd";
2589 execve(a[0], a, env_use);
2590
2591 a[0] = (char*) "/sbin/init";
2592 execve(a[0], a, env_use);
f757855e
LP
2593 } else if (!strv_isempty(arg_parameters))
2594 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2595 else {
f757855e 2596 chdir(home ?: "/root");
03cfe0d5
LP
2597 execle("/bin/bash", "-bash", NULL, env_use);
2598 execle("/bin/sh", "-sh", NULL, env_use);
2599 }
2600
2601 (void) log_open();
2602 return log_error_errno(errno, "execv() failed: %m");
2603}
2604
2605static int outer_child(
2606 Barrier *barrier,
2607 const char *directory,
2608 const char *console,
2609 const char *root_device, bool root_device_rw,
2610 const char *home_device, bool home_device_rw,
2611 const char *srv_device, bool srv_device_rw,
2612 bool interactive,
2613 bool secondary,
2614 int pid_socket,
2615 int kmsg_socket,
2616 int rtnl_socket,
825d5287 2617 int uid_shift_socket,
f757855e 2618 FDSet *fds) {
03cfe0d5
LP
2619
2620 pid_t pid;
2621 ssize_t l;
2622 int r;
2623
2624 assert(barrier);
2625 assert(directory);
2626 assert(console);
2627 assert(pid_socket >= 0);
2628 assert(kmsg_socket >= 0);
2629
efdb0237
LP
2630 cg_unified_flush();
2631
03cfe0d5
LP
2632 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2633 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2634
2635 if (interactive) {
2636 close_nointr(STDIN_FILENO);
2637 close_nointr(STDOUT_FILENO);
2638 close_nointr(STDERR_FILENO);
2639
2640 r = open_terminal(console, O_RDWR);
2641 if (r != STDIN_FILENO) {
2642 if (r >= 0) {
2643 safe_close(r);
2644 r = -EINVAL;
2645 }
2646
2647 return log_error_errno(r, "Failed to open console: %m");
2648 }
2649
2650 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2651 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2652 return log_error_errno(errno, "Failed to duplicate console: %m");
2653 }
2654
2655 r = reset_audit_loginuid();
2656 if (r < 0)
2657 return r;
2658
2659 /* Mark everything as slave, so that we still
2660 * receive mounts from the real root, but don't
2661 * propagate mounts to the real root. */
2662 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2663 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2664
2665 r = mount_devices(directory,
2666 root_device, root_device_rw,
2667 home_device, home_device_rw,
2668 srv_device, srv_device_rw);
2669 if (r < 0)
2670 return r;
2671
391567f4
LP
2672 r = determine_uid_shift(directory);
2673 if (r < 0)
2674 return r;
2675
825d5287
RM
2676 if (arg_userns) {
2677 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2678 if (l < 0)
2679 return log_error_errno(errno, "Failed to send UID shift: %m");
2680 if (l != sizeof(arg_uid_shift)) {
2681 log_error("Short write while sending UID shift.");
2682 return -EIO;
2683 }
2684 }
2685
03cfe0d5
LP
2686 /* Turn directory into bind mount */
2687 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2688 return log_error_errno(errno, "Failed to make bind mount: %m");
2689
e83bebef 2690 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2691 if (r < 0)
2692 return r;
2693
e83bebef 2694 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2695 if (r < 0)
2696 return r;
2697
03cfe0d5
LP
2698 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2699 if (r < 0)
2700 return r;
2701
03cfe0d5
LP
2702 if (arg_read_only) {
2703 r = bind_remount_recursive(directory, true);
2704 if (r < 0)
2705 return log_error_errno(r, "Failed to make tree read-only: %m");
2706 }
2707
d1678248 2708 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2709 if (r < 0)
2710 return r;
2711
07fa00f9
LP
2712 r = copy_devnodes(directory);
2713 if (r < 0)
03cfe0d5
LP
2714 return r;
2715
2716 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2717
07fa00f9
LP
2718 r = setup_pts(directory);
2719 if (r < 0)
03cfe0d5
LP
2720 return r;
2721
2722 r = setup_propagate(directory);
2723 if (r < 0)
2724 return r;
2725
2726 r = setup_dev_console(directory, console);
2727 if (r < 0)
2728 return r;
2729
2730 r = setup_seccomp();
2731 if (r < 0)
2732 return r;
2733
2734 r = setup_timezone(directory);
2735 if (r < 0)
2736 return r;
2737
2738 r = setup_resolv_conf(directory);
2739 if (r < 0)
2740 return r;
2741
2742 r = setup_journal(directory);
2743 if (r < 0)
2744 return r;
2745
e83bebef 2746 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2747 if (r < 0)
2748 return r;
2749
e83bebef 2750 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2751 if (r < 0)
2752 return r;
2753
2754 r = mount_move_root(directory);
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to move root directory: %m");
2757
2758 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2759 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2760 (arg_private_network ? CLONE_NEWNET : 0) |
2761 (arg_userns ? CLONE_NEWUSER : 0),
2762 NULL);
2763 if (pid < 0)
2764 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2765 if (pid == 0) {
2766 pid_socket = safe_close(pid_socket);
825d5287 2767 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2768
2769 /* The inner child has all namespaces that are
2770 * requested, so that we all are owned by the user if
2771 * user namespaces are turned on. */
2772
f757855e 2773 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2774 if (r < 0)
2775 _exit(EXIT_FAILURE);
2776
2777 _exit(EXIT_SUCCESS);
2778 }
2779
2780 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2781 if (l < 0)
2782 return log_error_errno(errno, "Failed to send PID: %m");
2783 if (l != sizeof(pid)) {
2784 log_error("Short write while sending PID.");
2785 return -EIO;
2786 }
2787
2788 pid_socket = safe_close(pid_socket);
327e26d6
KN
2789 kmsg_socket = safe_close(kmsg_socket);
2790 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2791
2792 return 0;
2793}
2794
2795static int setup_uid_map(pid_t pid) {
2796 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2797 int r;
2798
2799 assert(pid > 1);
2800
2801 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2802 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2803 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to write UID map: %m");
2806
2807 /* We always assign the same UID and GID ranges */
2808 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2809 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write GID map: %m");
2812
2813 return 0;
2814}
2815
f757855e
LP
2816static int load_settings(void) {
2817 _cleanup_(settings_freep) Settings *settings = NULL;
2818 _cleanup_fclose_ FILE *f = NULL;
2819 _cleanup_free_ char *p = NULL;
2820 const char *fn, *i;
2821 int r;
2822
2823 /* If all settings are masked, there's no point in looking for
2824 * the settings file */
2825 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2826 return 0;
2827
2828 fn = strjoina(arg_machine, ".nspawn");
2829
2830 /* We first look in the admin's directories in /etc and /run */
2831 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2832 _cleanup_free_ char *j = NULL;
2833
2834 j = strjoin(i, "/", fn, NULL);
2835 if (!j)
2836 return log_oom();
2837
2838 f = fopen(j, "re");
2839 if (f) {
2840 p = j;
2841 j = NULL;
2842
2843 /* By default we trust configuration from /etc and /run */
2844 if (arg_settings_trusted < 0)
2845 arg_settings_trusted = true;
2846
2847 break;
2848 }
2849
2850 if (errno != ENOENT)
2851 return log_error_errno(errno, "Failed to open %s: %m", j);
2852 }
2853
2854 if (!f) {
2855 /* After that, let's look for a file next to the
2856 * actual image we shall boot. */
2857
2858 if (arg_image) {
2859 p = file_in_same_dir(arg_image, fn);
2860 if (!p)
2861 return log_oom();
2862 } else if (arg_directory) {
2863 p = file_in_same_dir(arg_directory, fn);
2864 if (!p)
2865 return log_oom();
2866 }
2867
2868 if (p) {
2869 f = fopen(p, "re");
2870 if (!f && errno != ENOENT)
2871 return log_error_errno(errno, "Failed to open %s: %m", p);
2872
2873 /* By default we do not trust configuration from /var/lib/machines */
2874 if (arg_settings_trusted < 0)
2875 arg_settings_trusted = false;
2876 }
2877 }
2878
2879 if (!f)
2880 return 0;
2881
2882 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2883
2884 r = settings_load(f, p, &settings);
2885 if (r < 0)
2886 return r;
2887
2888 /* Copy over bits from the settings, unless they have been
2889 * explicitly masked by command line switches. */
2890
2891 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2892 settings->boot >= 0) {
2893 arg_boot = settings->boot;
2894
2895 strv_free(arg_parameters);
2896 arg_parameters = settings->parameters;
2897 settings->parameters = NULL;
2898 }
2899
2900 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2901 settings->environment) {
2902 strv_free(arg_setenv);
2903 arg_setenv = settings->environment;
2904 settings->environment = NULL;
2905 }
2906
2907 if ((arg_settings_mask & SETTING_USER) == 0 &&
2908 settings->user) {
2909 free(arg_user);
2910 arg_user = settings->user;
2911 settings->user = NULL;
2912 }
2913
2914 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2915 uint64_t plus;
f757855e 2916
0e265674
LP
2917 plus = settings->capability;
2918 if (settings_private_network(settings))
2919 plus |= (1ULL << CAP_NET_ADMIN);
2920
2921 if (!arg_settings_trusted && plus != 0) {
2922 if (settings->capability != 0)
2923 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2924 } else
2925 arg_retain |= plus;
f757855e
LP
2926
2927 arg_retain &= ~settings->drop_capability;
2928 }
2929
2930 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2931 settings->kill_signal > 0)
2932 arg_kill_signal = settings->kill_signal;
2933
2934 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2935 settings->personality != PERSONALITY_INVALID)
2936 arg_personality = settings->personality;
2937
2938 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2939 !sd_id128_is_null(settings->machine_id)) {
2940
2941 if (!arg_settings_trusted)
2942 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2943 else
2944 arg_uuid = settings->machine_id;
2945 }
2946
2947 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2948 settings->read_only >= 0)
2949 arg_read_only = settings->read_only;
2950
2951 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2952 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2953 arg_volatile_mode = settings->volatile_mode;
2954
2955 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2956 settings->n_custom_mounts > 0) {
2957
2958 if (!arg_settings_trusted)
2959 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2960 else {
2961 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2962 arg_custom_mounts = settings->custom_mounts;
2963 arg_n_custom_mounts = settings->n_custom_mounts;
2964
2965 settings->custom_mounts = NULL;
2966 settings->n_custom_mounts = 0;
2967 }
2968 }
2969
2970 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2971 (settings->private_network >= 0 ||
2972 settings->network_veth >= 0 ||
2973 settings->network_bridge ||
2974 settings->network_interfaces ||
2975 settings->network_macvlan ||
2976 settings->network_ipvlan)) {
2977
2978 if (!arg_settings_trusted)
2979 log_warning("Ignoring network settings, file %s is not trusted.", p);
2980 else {
0e265674
LP
2981 arg_network_veth = settings_private_network(settings);
2982 arg_private_network = settings_private_network(settings);
2983
f757855e
LP
2984 strv_free(arg_network_interfaces);
2985 arg_network_interfaces = settings->network_interfaces;
2986 settings->network_interfaces = NULL;
2987
2988 strv_free(arg_network_macvlan);
2989 arg_network_macvlan = settings->network_macvlan;
2990 settings->network_macvlan = NULL;
2991
2992 strv_free(arg_network_ipvlan);
2993 arg_network_ipvlan = settings->network_ipvlan;
2994 settings->network_ipvlan = NULL;
2995
2996 free(arg_network_bridge);
2997 arg_network_bridge = settings->network_bridge;
2998 settings->network_bridge = NULL;
f757855e
LP
2999 }
3000 }
3001
3002 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3003 settings->expose_ports) {
3004
3005 if (!arg_settings_trusted)
3006 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3007 else {
3008 expose_port_free_all(arg_expose_ports);
3009 arg_expose_ports = settings->expose_ports;
3010 settings->expose_ports = NULL;
3011 }
3012 }
3013
3014 return 0;
3015}
3016
03cfe0d5
LP
3017int main(int argc, char *argv[]) {
3018
3019 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3020 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3021 _cleanup_close_ int master = -1, image_fd = -1;
3022 _cleanup_fdset_free_ FDSet *fds = NULL;
3023 int r, n_fd_passed, loop_nr = -1;
3024 char veth_name[IFNAMSIZ];
3025 bool secondary = false, remove_subvol = false;
72c0a2c2 3026 sigset_t mask_chld;
03cfe0d5
LP
3027 pid_t pid = 0;
3028 int ret = EXIT_SUCCESS;
3029 union in_addr_union exposed = {};
3030 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3031 bool interactive;
3032
3033 log_parse_environment();
3034 log_open();
3035
3036 r = parse_argv(argc, argv);
3037 if (r <= 0)
3038 goto finish;
3039
03cfe0d5
LP
3040 if (geteuid() != 0) {
3041 log_error("Need to be root.");
3042 r = -EPERM;
3043 goto finish;
3044 }
f757855e
LP
3045 r = determine_names();
3046 if (r < 0)
3047 goto finish;
3048
3049 r = load_settings();
3050 if (r < 0)
3051 goto finish;
3052
3053 r = verify_arguments();
3054 if (r < 0)
3055 goto finish;
03cfe0d5
LP
3056
3057 n_fd_passed = sd_listen_fds(false);
3058 if (n_fd_passed > 0) {
3059 r = fdset_new_listen_fds(&fds, false);
3060 if (r < 0) {
3061 log_error_errno(r, "Failed to collect file descriptors: %m");
3062 goto finish;
3063 }
3064 }
3065
3066 if (arg_directory) {
3067 assert(!arg_image);
3068
3069 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3070 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3071 r = -EINVAL;
3072 goto finish;
3073 }
3074
3075 if (arg_ephemeral) {
3076 _cleanup_free_ char *np = NULL;
3077
3078 /* If the specified path is a mount point we
3079 * generate the new snapshot immediately
3080 * inside it under a random name. However if
3081 * the specified is not a mount point we
3082 * create the new snapshot in the parent
3083 * directory, just next to it. */
e26d6ce5 3084 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3085 if (r < 0) {
3086 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3087 goto finish;
3088 }
3089 if (r > 0)
770b5ce4 3090 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3091 else
770b5ce4 3092 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3093 if (r < 0) {
3094 log_error_errno(r, "Failed to generate name for snapshot: %m");
3095 goto finish;
3096 }
3097
3098 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3099 if (r < 0) {
3100 log_error_errno(r, "Failed to lock %s: %m", np);
3101 goto finish;
3102 }
3103
5bcd08db 3104 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3105 if (r < 0) {
3106 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3107 goto finish;
ec16945e
LP
3108 }
3109
3110 free(arg_directory);
3111 arg_directory = np;
8a16a7b4 3112 np = NULL;
ec16945e
LP
3113
3114 remove_subvol = true;
30535c16
LP
3115
3116 } else {
3117 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3118 if (r == -EBUSY) {
3119 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3120 goto finish;
3121 }
3122 if (r < 0) {
3123 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3124 return r;
3125 }
3126
3127 if (arg_template) {
5bcd08db 3128 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3129 if (r == -EEXIST) {
3130 if (!arg_quiet)
3131 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3132 } else if (r < 0) {
83521414 3133 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3134 goto finish;
3135 } else {
3136 if (!arg_quiet)
3137 log_info("Populated %s from template %s.", arg_directory, arg_template);
3138 }
3139 }
ec16945e
LP
3140 }
3141
1b9e5b12
LP
3142 if (arg_boot) {
3143 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3144 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3145 r = -EINVAL;
1b9e5b12
LP
3146 goto finish;
3147 }
3148 } else {
3149 const char *p;
3150
63c372cb 3151 p = strjoina(arg_directory,
1b9e5b12
LP
3152 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3153 if (access(p, F_OK) < 0) {
3154 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3155 r = -EINVAL;
1b9e5b12 3156 goto finish;
1b9e5b12
LP
3157 }
3158 }
ec16945e 3159
6b9132a9 3160 } else {
1b9e5b12 3161 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3162
ec16945e
LP
3163 assert(arg_image);
3164 assert(!arg_template);
3165
30535c16
LP
3166 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3167 if (r == -EBUSY) {
3168 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3169 goto finish;
3170 }
3171 if (r < 0) {
3172 r = log_error_errno(r, "Failed to create image lock: %m");
3173 goto finish;
3174 }
3175
1b9e5b12 3176 if (!mkdtemp(template)) {
56f64d95 3177 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3178 r = -errno;
6b9132a9 3179 goto finish;
1b9e5b12 3180 }
6b9132a9 3181
1b9e5b12
LP
3182 arg_directory = strdup(template);
3183 if (!arg_directory) {
3184 r = log_oom();
3185 goto finish;
6b9132a9 3186 }
88213476 3187
1b9e5b12
LP
3188 image_fd = setup_image(&device_path, &loop_nr);
3189 if (image_fd < 0) {
3190 r = image_fd;
842f3b0f
LP
3191 goto finish;
3192 }
1b9e5b12 3193
4d9f07b4
LP
3194 r = dissect_image(image_fd,
3195 &root_device, &root_device_rw,
3196 &home_device, &home_device_rw,
3197 &srv_device, &srv_device_rw,
3198 &secondary);
1b9e5b12
LP
3199 if (r < 0)
3200 goto finish;
842f3b0f 3201 }
842f3b0f 3202
5a8af538
LP
3203 r = custom_mounts_prepare();
3204 if (r < 0)
3205 goto finish;
3206
03cfe0d5
LP
3207 interactive =
3208 isatty(STDIN_FILENO) > 0 &&
3209 isatty(STDOUT_FILENO) > 0;
9c857b9d 3210
db7feb7e
LP
3211 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3212 if (master < 0) {
ec16945e 3213 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3214 goto finish;
3215 }
3216
611b312b
LP
3217 r = ptsname_malloc(master, &console);
3218 if (r < 0) {
3219 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3220 goto finish;
3221 }
3222
a258bf26 3223 if (unlockpt(master) < 0) {
ec16945e 3224 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3225 goto finish;
3226 }
3227
9c857b9d
LP
3228 if (!arg_quiet)
3229 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3230 arg_machine, arg_image ?: arg_directory);
3231
72c0a2c2 3232 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3233
023fb90b
LP
3234 assert_se(sigemptyset(&mask_chld) == 0);
3235 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3236
03cfe0d5
LP
3237 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3238 r = log_error_errno(errno, "Failed to become subreaper: %m");
3239 goto finish;
3240 }
3241
d87be9b0 3242 for (;;) {
825d5287
RM
3243 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3244 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3245 ContainerStatus container_status;
7566e267 3246 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3247 static const struct sigaction sa = {
189d5bac 3248 .sa_handler = nop_signal_handler,
e866af3a
DH
3249 .sa_flags = SA_NOCLDSTOP,
3250 };
03cfe0d5
LP
3251 int ifi = 0;
3252 ssize_t l;
dbb60d69
LP
3253 _cleanup_event_unref_ sd_event *event = NULL;
3254 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3255 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3256 char last_char = 0;
e866af3a 3257
7566e267 3258 r = barrier_create(&barrier);
a2da110b 3259 if (r < 0) {
da927ba9 3260 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3261 goto finish;
3262 }
3263
4610de50 3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3265 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3266 goto finish;
3267 }
3268
4610de50 3269 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3270 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3271 goto finish;
3272 }
3273
4610de50 3274 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3275 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3276 goto finish;
3277 }
3278
825d5287 3279 if (arg_userns)
4610de50 3280 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3281 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3282 goto finish;
3283 }
3284
e866af3a
DH
3285 /* Child can be killed before execv(), so handle SIGCHLD
3286 * in order to interrupt parent's blocking calls and
3287 * give it a chance to call wait() and terminate. */
3288 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3289 if (r < 0) {
ec16945e 3290 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3291 goto finish;
3292 }
3293
e866af3a
DH
3294 r = sigaction(SIGCHLD, &sa, NULL);
3295 if (r < 0) {
ec16945e 3296 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3297 goto finish;
3298 }
3299
03cfe0d5 3300 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3301 if (pid < 0) {
3302 if (errno == EINVAL)
ec16945e 3303 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3304 else
ec16945e 3305 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3306
d87be9b0
LP
3307 goto finish;
3308 }
a258bf26 3309
d87be9b0 3310 if (pid == 0) {
03cfe0d5 3311 /* The outer child only has a file system namespace. */
a2da110b
DH
3312 barrier_set_role(&barrier, BARRIER_CHILD);
3313
03e334a1 3314 master = safe_close(master);
a258bf26 3315
03e334a1 3316 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3317 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3318 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3319 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3320
ce30c8dc
LP
3321 (void) reset_all_signal_handlers();
3322 (void) reset_signal_mask();
f5c1b9ee 3323
03cfe0d5
LP
3324 r = outer_child(&barrier,
3325 arg_directory,
3326 console,
3327 root_device, root_device_rw,
3328 home_device, home_device_rw,
3329 srv_device, srv_device_rw,
3330 interactive,
3331 secondary,
3332 pid_socket_pair[1],
3333 kmsg_socket_pair[1],
3334 rtnl_socket_pair[1],
825d5287 3335 uid_shift_socket_pair[1],
f757855e 3336 fds);
0cb9fbcd 3337 if (r < 0)
a2da110b 3338 _exit(EXIT_FAILURE);
d87be9b0 3339
03cfe0d5 3340 _exit(EXIT_SUCCESS);
da5b3bad 3341 }
88213476 3342
a2da110b 3343 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3344
2feceb5e 3345 fds = fdset_free(fds);
842f3b0f 3346
6d0b55c2
LP
3347 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3348 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3349 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3350 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3351
03cfe0d5
LP
3352 /* Wait for the outer child. */
3353 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3354 if (r < 0)
3355 goto finish;
3356 if (r != 0) {
3357 r = -EIO;
3358 goto finish;
3359 }
3360 pid = 0;
6dac160c 3361
03cfe0d5
LP
3362 /* And now retrieve the PID of the inner child. */
3363 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3364 if (l < 0) {
3365 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3366 goto finish;
3367 }
3368 if (l != sizeof(pid)) {
76d44882 3369 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3370 r = EIO;
3371 goto finish;
3372 }
354bfd2b 3373
03cfe0d5 3374 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3375
03cfe0d5
LP
3376 if (arg_userns) {
3377 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3378 log_error("Child died too early.");
3379 r = -ESRCH;
840295fc 3380 goto finish;
03cfe0d5 3381 }
ab046dde 3382
825d5287
RM
3383 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3384 if (l < 0) {
3385 r = log_error_errno(errno, "Failed to read UID shift: %m");
3386 goto finish;
3387 }
3388 if (l != sizeof(arg_uid_shift)) {
76d44882 3389 log_error("Short read while reading UID shift.");
825d5287
RM
3390 r = EIO;
3391 goto finish;
3392 }
3393
03cfe0d5 3394 r = setup_uid_map(pid);
840295fc
LP
3395 if (r < 0)
3396 goto finish;
ab046dde 3397
03cfe0d5
LP
3398 (void) barrier_place(&barrier); /* #2 */
3399 }
c74e630d 3400
9a2a5625 3401 if (arg_private_network) {
4bbfe7ad 3402
9a2a5625
LP
3403 r = move_network_interfaces(pid, arg_network_interfaces);
3404 if (r < 0)
3405 goto finish;
5aa4bb6b 3406
9a2a5625
LP
3407 if (arg_network_veth) {
3408 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3409 if (r < 0)
3410 goto finish;
3411 else if (r > 0)
3412 ifi = r;
6dac160c 3413
9a2a5625
LP
3414 if (arg_network_bridge) {
3415 r = setup_bridge(veth_name, arg_network_bridge);
3416 if (r < 0)
3417 goto finish;
3418 if (r > 0)
3419 ifi = r;
3420 }
3421 }
6dac160c 3422
9a2a5625
LP
3423 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3424 if (r < 0)
3425 goto finish;
3426
3427 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3428 if (r < 0)
3429 goto finish;
3430 }
6dac160c 3431
b7103bc5
LP
3432 if (arg_register) {
3433 r = register_machine(
3434 arg_machine,
3435 pid,
3436 arg_directory,
3437 arg_uuid,
3438 ifi,
3439 arg_slice,
3440 arg_custom_mounts, arg_n_custom_mounts,
3441 arg_kill_signal,
3442 arg_property,
3443 arg_keep_unit);
3444 if (r < 0)
3445 goto finish;
3446 }
6dac160c 3447
34829a32 3448 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3449 if (r < 0)
3450 goto finish;
3451
34829a32
LP
3452 if (arg_keep_unit) {
3453 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3454 if (r < 0)
3455 goto finish;
3456 }
efdb0237 3457
34829a32 3458 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3459 if (r < 0)
3460 goto finish;
6dac160c 3461
03cfe0d5
LP
3462 /* Notify the child that the parent is ready with all
3463 * its setup (including cgroup-ification), and that
3464 * the child can now hand over control to the code to
3465 * run inside the container. */
3466 (void) barrier_place(&barrier); /* #3 */
6dac160c 3467
03cfe0d5
LP
3468 /* Block SIGCHLD here, before notifying child.
3469 * process_pty() will handle it with the other signals. */
3470 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3471
03cfe0d5
LP
3472 /* Reset signal to default */
3473 r = default_signals(SIGCHLD, -1);
3474 if (r < 0) {
3475 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3476 goto finish;
3477 }
e866af3a 3478
03cfe0d5 3479 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3480 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3481 log_error("Child died too early.");
03cfe0d5
LP
3482 r = -ESRCH;
3483 goto finish;
3484 }
b12afc8c 3485
03cfe0d5
LP
3486 sd_notifyf(false,
3487 "READY=1\n"
3488 "STATUS=Container running.\n"
3489 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3490
03cfe0d5
LP
3491 r = sd_event_new(&event);
3492 if (r < 0) {
3493 log_error_errno(r, "Failed to get default event source: %m");
3494 goto finish;
3495 }
88213476 3496
03cfe0d5
LP
3497 if (arg_kill_signal > 0) {
3498 /* Try to kill the init system on SIGINT or SIGTERM */
3499 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3500 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3501 } else {
3502 /* Immediately exit */
3503 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3504 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3505 }
023fb90b 3506
03cfe0d5
LP
3507 /* simply exit on sigchld */
3508 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3509
03cfe0d5 3510 if (arg_expose_ports) {
7a8f6325 3511 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3512 if (r < 0)
3513 goto finish;
023fb90b 3514
7a8f6325 3515 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3516 }
023fb90b 3517
03cfe0d5 3518 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3519
ae3dde80 3520 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3521 if (r < 0) {
3522 log_error_errno(r, "Failed to create PTY forwarder: %m");
3523 goto finish;
3524 }
023fb90b 3525
03cfe0d5
LP
3526 r = sd_event_loop(event);
3527 if (r < 0) {
3528 log_error_errno(r, "Failed to run event loop: %m");
3529 goto finish;
3530 }
6d0b55c2 3531
03cfe0d5 3532 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3533
03cfe0d5 3534 forward = pty_forward_free(forward);
6d0b55c2 3535
03cfe0d5
LP
3536 if (!arg_quiet && last_char != '\n')
3537 putc('\n', stdout);
04d39279 3538
03cfe0d5 3539 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3540 if (arg_register && !arg_keep_unit)
3541 terminate_machine(pid);
1f0cd86b 3542
840295fc 3543 /* Normally redundant, but better safe than sorry */
04d39279 3544 kill(pid, SIGKILL);
a258bf26 3545
113cea80 3546 r = wait_for_container(pid, &container_status);
04d39279
LP
3547 pid = 0;
3548
ec16945e 3549 if (r < 0)
ce9f1527
LP
3550 /* We failed to wait for the container, or the
3551 * container exited abnormally */
ec16945e
LP
3552 goto finish;
3553 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3554 /* The container exited with a non-zero
3555 * status, or with zero status and no reboot
3556 * was requested. */
ec16945e 3557 ret = r;
d87be9b0 3558 break;
ec16945e 3559 }
88213476 3560
113cea80 3561 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3562
3563 if (arg_keep_unit) {
3564 /* Special handling if we are running as a
3565 * service: instead of simply restarting the
3566 * machine we want to restart the entire
3567 * service, so let's inform systemd about this
3568 * with the special exit code 133. The service
3569 * file uses RestartForceExitStatus=133 so
3570 * that this results in a full nspawn
3571 * restart. This is necessary since we might
3572 * have cgroup parameters set we want to have
3573 * flushed out. */
ec16945e
LP
3574 ret = 133;
3575 r = 0;
ce38dbc8
LP
3576 break;
3577 }
6d0b55c2 3578
7a8f6325 3579 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3580 }
88213476
LP
3581
3582finish:
af4ec430
LP
3583 sd_notify(false,
3584 "STOPPING=1\n"
3585 "STATUS=Terminating...");
3586
9444b1f2
LP
3587 if (pid > 0)
3588 kill(pid, SIGKILL);
88213476 3589
503546da
LP
3590 /* Try to flush whatever is still queued in the pty */
3591 if (master >= 0)
59f448cf 3592 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3593
03cfe0d5
LP
3594 loop_remove(loop_nr, &image_fd);
3595
ec16945e
LP
3596 if (remove_subvol && arg_directory) {
3597 int k;
3598
5bcd08db 3599 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3600 if (k < 0)
3601 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3602 }
3603
785890ac
LP
3604 if (arg_machine) {
3605 const char *p;
3606
63c372cb 3607 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3608 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3609 }
3610
7a8f6325 3611 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3612
04d391da 3613 free(arg_directory);
ec16945e
LP
3614 free(arg_template);
3615 free(arg_image);
7027ff61 3616 free(arg_machine);
c74e630d
LP
3617 free(arg_user);
3618 strv_free(arg_setenv);
f757855e 3619 free(arg_network_bridge);
c74e630d
LP
3620 strv_free(arg_network_interfaces);
3621 strv_free(arg_network_macvlan);
4bbfe7ad 3622 strv_free(arg_network_ipvlan);
f757855e
LP
3623 strv_free(arg_parameters);
3624 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3625 expose_port_free_all(arg_expose_ports);
6d0b55c2 3626
ec16945e 3627 return r < 0 ? EXIT_FAILURE : ret;
88213476 3628}