]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
doc: correct orthography, word forms and missing/extraneous words
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e 48
b5efdb8a 49#include "alloc-util.h"
8fe0087e
LP
50#include "barrier.h"
51#include "base-filesystem.h"
52#include "blkid-util.h"
53#include "btrfs-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
8fe0087e
LP
59#include "env-util.h"
60#include "event-util.h"
3ffd4af2 61#include "fd-util.h"
842f3b0f 62#include "fdset.h"
a5c32cff 63#include "fileio.h"
8fe0087e 64#include "formats-util.h"
f4f15635 65#include "fs-util.h"
1b9e5b12 66#include "gpt.h"
8fe0087e
LP
67#include "hostname-util.h"
68#include "log.h"
69#include "loopback-setup.h"
1b9cebf6 70#include "machine-image.h"
8fe0087e
LP
71#include "macro.h"
72#include "missing.h"
73#include "mkdir.h"
4349cd7c 74#include "mount-util.h"
8fe0087e 75#include "netlink-util.h"
07630cea
LP
76#include "nspawn-cgroup.h"
77#include "nspawn-expose-ports.h"
78#include "nspawn-mount.h"
79#include "nspawn-network.h"
80#include "nspawn-register.h"
81#include "nspawn-settings.h"
82#include "nspawn-setuid.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
88#include "rm-rf.h"
e9642be2
LP
89#ifdef HAVE_SECCOMP
90#include "seccomp-util.h"
91#endif
8fe0087e 92#include "signal-util.h"
2583fbea 93#include "socket-util.h"
8fcde012 94#include "stat-util.h"
15a5e950 95#include "stdio-util.h"
07630cea 96#include "string-util.h"
8fe0087e
LP
97#include "strv.h"
98#include "terminal-util.h"
99#include "udev-util.h"
affb60b1 100#include "umask-util.h"
b1d4f8e1 101#include "user-util.h"
8fe0087e 102#include "util.h"
e9642be2 103
113cea80
DH
/* How a container run ended. The enumerator names suggest "terminated for
 * good" versus "asked to be rebooted"; the code consuming these values is
 * outside this view. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
108
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal=MODE (or -j). The
 * command line values "no", "auto", "host" and "guest" map onto these
 * enumerators; the "try-" variants reuse LINK_GUEST/LINK_HOST together with
 * a separate "try" flag. */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476
LP
115
/* Global option state. Everything below is filled in by parse_argv() (and,
 * judging by arg_settings_mask, possibly by .nspawn settings files handled
 * outside this view). */

/* -D/--directory=, --template=, -u/--user=, --uuid=, -M/--machine= */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;

/* -Z, -L and -S: these point straight into argv (optarg), they are not copies */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;

/* --private-network (also implied by the other --network-* switches),
 * --read-only, -b/--boot, -x/--ephemeral */
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_ephemeral = false;

/* --link-journal=MODE; the "try-" modes and -j additionally set the try flag */
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;

/* Default set of retained capabilities. parse_argv() ORs in --capability=
 * (plus CAP_NET_ADMIN when private networking is on) and then masks out
 * --drop-capability=. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);

/* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= accumulate here */
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;

/* --setenv=, -q/--quiet, --share-system, --register=, --keep-unit */
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_share_system = false;
static bool arg_register = true;
static bool arg_keep_unit = false;

/* --network-interface=, --network-macvlan=, --network-ipvlan=,
 * -n/--network-veth, --network-bridge= */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char *arg_network_bridge = NULL;

/* --personality=, -i/--image=, --volatile[=MODE], -p/--port=, --property= */
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;

/* --private-users[=UIDBASE[:NUIDS]]: UIDBASE goes to arg_uid_shift, NUIDS to
 * arg_uid_range; arg_userns is set whenever the switch is given at all */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;

/* --kill-signal=; verify_arguments() substitutes SIGRTMIN+3 when booting and
 * nothing positive was configured */
static int arg_kill_signal = 0;

/* Decided by detect_unified_cgroup_hierarchy() */
static bool arg_unified_cgroup_hierarchy = false;

/* One bit per setting that was given on the command line. arg_settings_trusted
 * is a tristate (-1, false, true) chosen by --settings=. arg_parameters holds a
 * copy of the non-option arguments. */
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
88213476 181
601185b4 182static void help(void) {
88213476
LP
183 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
184 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
185 " -h --help Show this help\n"
186 " --version Print version string\n"
69c79d3c 187 " -q --quiet Do not show status information\n"
1b9e5b12 188 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
189 " --template=PATH Initialize root directory from template directory,\n"
190 " if missing\n"
191 " -x --ephemeral Run container with snapshot of root directory, and\n"
192 " remove it after exit\n"
193 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
194 " -b --boot Boot up full system (i.e. invoke init)\n"
195 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 196 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 197 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 198 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 199 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
200 " --private-users[=UIDBASE[:NUIDS]]\n"
201 " Run within user namespace\n"
69c79d3c
LP
202 " --private-network Disable network in container\n"
203 " --network-interface=INTERFACE\n"
204 " Assign an existing network interface to the\n"
205 " container\n"
c74e630d
LP
206 " --network-macvlan=INTERFACE\n"
207 " Create a macvlan network interface based on an\n"
208 " existing network interface to the container\n"
4bbfe7ad
TG
209 " --network-ipvlan=INTERFACE\n"
210 " Create a ipvlan network interface based on an\n"
211 " existing network interface to the container\n"
a8eaaee7 212 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 213 " and container\n"
ab046dde 214 " --network-bridge=INTERFACE\n"
a8eaaee7 215 " Add a virtual Ethernet connection between host\n"
ab046dde
TG
216 " and container and add it to an existing bridge on\n"
217 " the host\n"
6d0b55c2 218 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 219 " Expose a container IP port on the host\n"
82adf6af
LP
220 " -Z --selinux-context=SECLABEL\n"
221 " Set the SELinux security context to be used by\n"
222 " processes in the container\n"
223 " -L --selinux-apifs-context=SECLABEL\n"
224 " Set the SELinux security context to be used by\n"
225 " API/tmpfs file systems in the container\n"
a8828ed9
DW
226 " --capability=CAP In addition to the default, retain specified\n"
227 " capability\n"
228 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 229 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
230 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
231 " try-guest, try-host\n"
232 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 233 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
234 " --bind=PATH[:PATH[:OPTIONS]]\n"
235 " Bind mount a file or directory from the host into\n"
a8828ed9 236 " the container\n"
5e5bfa6e
EY
237 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
238 " Similar, but creates a read-only bind mount\n"
06c17c39 239 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
240 " --overlay=PATH[:PATH...]:PATH\n"
241 " Create an overlay mount from the host to \n"
242 " the container\n"
243 " --overlay-ro=PATH[:PATH...]:PATH\n"
244 " Similar, but creates a read-only overlay mount\n"
284c0b91 245 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 246 " --share-system Share system namespaces with host\n"
eb91eb18 247 " --register=BOOLEAN Register container as machine\n"
89f7c846 248 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 249 " the service unit nspawn is running in\n"
6d0b55c2 250 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 251 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 252 , program_invocation_short_name);
88213476
LP
253}
254
5a8af538
LP
255
256static int custom_mounts_prepare(void) {
257 unsigned i;
258 int r;
259
260 /* Ensure the mounts are applied prefix first. */
261 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
262
263 /* Allocate working directories for the overlay file systems that need it */
264 for (i = 0; i < arg_n_custom_mounts; i++) {
265 CustomMount *m = &arg_custom_mounts[i];
266
825d5287
RM
267 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
268 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
269 return -EINVAL;
270 }
271
5a8af538
LP
272 if (m->type != CUSTOM_MOUNT_OVERLAY)
273 continue;
274
275 if (m->work_dir)
276 continue;
277
278 if (m->read_only)
279 continue;
280
14bcf25c 281 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
282 if (r < 0)
283 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
284 }
285
286 return 0;
287}
288
efdb0237
LP
289static int detect_unified_cgroup_hierarchy(void) {
290 const char *e;
291 int r;
292
293 /* Allow the user to control whether the unified hierarchy is used */
294 e = getenv("UNIFIED_CGROUP_HIERARCHY");
295 if (e) {
296 r = parse_boolean(e);
297 if (r < 0)
298 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
299
300 arg_unified_cgroup_hierarchy = r;
301 return 0;
302 }
303
304 /* Otherwise inherit the default from the host system */
305 r = cg_unified();
306 if (r < 0)
307 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
308
309 arg_unified_cgroup_hierarchy = r;
310 return 0;
311}
312
88213476
LP
313static int parse_argv(int argc, char *argv[]) {
314
a41fe3a2 315 enum {
acbeb427
ZJS
316 ARG_VERSION = 0x100,
317 ARG_PRIVATE_NETWORK,
bc2f673e 318 ARG_UUID,
5076f0cc 319 ARG_READ_ONLY,
57fb9fb5 320 ARG_CAPABILITY,
420c7379 321 ARG_DROP_CAPABILITY,
17fe0523
LP
322 ARG_LINK_JOURNAL,
323 ARG_BIND,
f4889f65 324 ARG_BIND_RO,
06c17c39 325 ARG_TMPFS,
5a8af538
LP
326 ARG_OVERLAY,
327 ARG_OVERLAY_RO,
f4889f65 328 ARG_SETENV,
eb91eb18 329 ARG_SHARE_SYSTEM,
89f7c846 330 ARG_REGISTER,
aa28aefe 331 ARG_KEEP_UNIT,
69c79d3c 332 ARG_NETWORK_INTERFACE,
c74e630d 333 ARG_NETWORK_MACVLAN,
4bbfe7ad 334 ARG_NETWORK_IPVLAN,
ab046dde 335 ARG_NETWORK_BRIDGE,
6afc95b7 336 ARG_PERSONALITY,
4d9f07b4 337 ARG_VOLATILE,
ec16945e 338 ARG_TEMPLATE,
f36933fe 339 ARG_PROPERTY,
6dac160c 340 ARG_PRIVATE_USERS,
c6c8f6e2 341 ARG_KILL_SIGNAL,
f757855e 342 ARG_SETTINGS,
a41fe3a2
LP
343 };
344
88213476 345 static const struct option options[] = {
aa28aefe
LP
346 { "help", no_argument, NULL, 'h' },
347 { "version", no_argument, NULL, ARG_VERSION },
348 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
349 { "template", required_argument, NULL, ARG_TEMPLATE },
350 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
351 { "user", required_argument, NULL, 'u' },
352 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
353 { "boot", no_argument, NULL, 'b' },
354 { "uuid", required_argument, NULL, ARG_UUID },
355 { "read-only", no_argument, NULL, ARG_READ_ONLY },
356 { "capability", required_argument, NULL, ARG_CAPABILITY },
357 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
358 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
359 { "bind", required_argument, NULL, ARG_BIND },
360 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 361 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
362 { "overlay", required_argument, NULL, ARG_OVERLAY },
363 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
364 { "machine", required_argument, NULL, 'M' },
365 { "slice", required_argument, NULL, 'S' },
366 { "setenv", required_argument, NULL, ARG_SETENV },
367 { "selinux-context", required_argument, NULL, 'Z' },
368 { "selinux-apifs-context", required_argument, NULL, 'L' },
369 { "quiet", no_argument, NULL, 'q' },
370 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
371 { "register", required_argument, NULL, ARG_REGISTER },
372 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
373 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 374 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 375 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 376 { "network-veth", no_argument, NULL, 'n' },
ab046dde 377 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 378 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 379 { "image", required_argument, NULL, 'i' },
4d9f07b4 380 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 381 { "port", required_argument, NULL, 'p' },
f36933fe 382 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 383 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 384 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 385 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 386 {}
88213476
LP
387 };
388
9444b1f2 389 int c, r;
6cbe4ed1 390 const char *p;
a42c8b54 391 uint64_t plus = 0, minus = 0;
f757855e 392 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
393
394 assert(argc >= 0);
395 assert(argv);
396
0dfaa006 397 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
398
399 switch (c) {
400
401 case 'h':
601185b4
ZJS
402 help();
403 return 0;
88213476 404
acbeb427 405 case ARG_VERSION:
3f6fd1ba 406 return version();
acbeb427 407
88213476 408 case 'D':
0f03c2a4 409 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 410 if (r < 0)
0f03c2a4 411 return r;
ec16945e
LP
412 break;
413
414 case ARG_TEMPLATE:
0f03c2a4 415 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 416 if (r < 0)
0f03c2a4 417 return r;
88213476
LP
418 break;
419
1b9e5b12 420 case 'i':
0f03c2a4 421 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 422 if (r < 0)
0f03c2a4 423 return r;
ec16945e
LP
424 break;
425
426 case 'x':
427 arg_ephemeral = true;
1b9e5b12
LP
428 break;
429
687d0825 430 case 'u':
2fc09a9c
DM
431 r = free_and_strdup(&arg_user, optarg);
432 if (r < 0)
7027ff61 433 return log_oom();
687d0825 434
f757855e 435 arg_settings_mask |= SETTING_USER;
687d0825
MV
436 break;
437
ab046dde 438 case ARG_NETWORK_BRIDGE:
f757855e
LP
439 r = free_and_strdup(&arg_network_bridge, optarg);
440 if (r < 0)
441 return log_oom();
ab046dde
TG
442
443 /* fall through */
444
0dfaa006 445 case 'n':
69c79d3c
LP
446 arg_network_veth = true;
447 arg_private_network = true;
f757855e 448 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
449 break;
450
aa28aefe 451 case ARG_NETWORK_INTERFACE:
c74e630d
LP
452 if (strv_extend(&arg_network_interfaces, optarg) < 0)
453 return log_oom();
454
455 arg_private_network = true;
f757855e 456 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
457 break;
458
459 case ARG_NETWORK_MACVLAN:
460 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
461 return log_oom();
462
4bbfe7ad 463 arg_private_network = true;
f757855e 464 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
465 break;
466
467 case ARG_NETWORK_IPVLAN:
468 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
469 return log_oom();
470
aa28aefe
LP
471 /* fall through */
472
ff01d048
LP
473 case ARG_PRIVATE_NETWORK:
474 arg_private_network = true;
f757855e 475 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
476 break;
477
0f0dbc46
LP
478 case 'b':
479 arg_boot = true;
f757855e 480 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
481 break;
482
144f0fc0 483 case ARG_UUID:
9444b1f2
LP
484 r = sd_id128_from_string(optarg, &arg_uuid);
485 if (r < 0) {
aa96c6cb 486 log_error("Invalid UUID: %s", optarg);
9444b1f2 487 return r;
aa96c6cb 488 }
f757855e
LP
489
490 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 491 break;
aa96c6cb 492
9444b1f2 493 case 'S':
c74e630d 494 arg_slice = optarg;
144f0fc0
LP
495 break;
496
7027ff61 497 case 'M':
c1521918 498 if (isempty(optarg))
97b11eed 499 arg_machine = mfree(arg_machine);
c1521918 500 else {
0c3c4284 501 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
502 log_error("Invalid machine name: %s", optarg);
503 return -EINVAL;
504 }
7027ff61 505
0c3c4284
LP
506 r = free_and_strdup(&arg_machine, optarg);
507 if (r < 0)
eb91eb18
LP
508 return log_oom();
509
510 break;
511 }
7027ff61 512
82adf6af
LP
513 case 'Z':
514 arg_selinux_context = optarg;
a8828ed9
DW
515 break;
516
82adf6af
LP
517 case 'L':
518 arg_selinux_apifs_context = optarg;
a8828ed9
DW
519 break;
520
bc2f673e
LP
521 case ARG_READ_ONLY:
522 arg_read_only = true;
f757855e 523 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
524 break;
525
420c7379
LP
526 case ARG_CAPABILITY:
527 case ARG_DROP_CAPABILITY: {
6cbe4ed1
SS
528 p = optarg;
529 for(;;) {
530 _cleanup_free_ char *t = NULL;
5076f0cc 531
6cbe4ed1
SS
532 r = extract_first_word(&p, &t, ",", 0);
533 if (r < 0)
534 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 535
6cbe4ed1
SS
536 if (r == 0)
537 break;
5076f0cc 538
39ed67d1
LP
539 if (streq(t, "all")) {
540 if (c == ARG_CAPABILITY)
a42c8b54 541 plus = (uint64_t) -1;
39ed67d1 542 else
a42c8b54 543 minus = (uint64_t) -1;
39ed67d1 544 } else {
2822da4f
LP
545 int cap;
546
547 cap = capability_from_name(t);
548 if (cap < 0) {
39ed67d1
LP
549 log_error("Failed to parse capability %s.", t);
550 return -EINVAL;
551 }
552
553 if (c == ARG_CAPABILITY)
a42c8b54 554 plus |= 1ULL << (uint64_t) cap;
39ed67d1 555 else
a42c8b54 556 minus |= 1ULL << (uint64_t) cap;
5076f0cc 557 }
5076f0cc
LP
558 }
559
f757855e 560 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
561 break;
562 }
563
57fb9fb5
LP
564 case 'j':
565 arg_link_journal = LINK_GUEST;
574edc90 566 arg_link_journal_try = true;
57fb9fb5
LP
567 break;
568
569 case ARG_LINK_JOURNAL:
53e438e3 570 if (streq(optarg, "auto")) {
57fb9fb5 571 arg_link_journal = LINK_AUTO;
53e438e3
LP
572 arg_link_journal_try = false;
573 } else if (streq(optarg, "no")) {
57fb9fb5 574 arg_link_journal = LINK_NO;
53e438e3
LP
575 arg_link_journal_try = false;
576 } else if (streq(optarg, "guest")) {
57fb9fb5 577 arg_link_journal = LINK_GUEST;
53e438e3
LP
578 arg_link_journal_try = false;
579 } else if (streq(optarg, "host")) {
57fb9fb5 580 arg_link_journal = LINK_HOST;
53e438e3
LP
581 arg_link_journal_try = false;
582 } else if (streq(optarg, "try-guest")) {
574edc90
MP
583 arg_link_journal = LINK_GUEST;
584 arg_link_journal_try = true;
585 } else if (streq(optarg, "try-host")) {
586 arg_link_journal = LINK_HOST;
587 arg_link_journal_try = true;
588 } else {
57fb9fb5
LP
589 log_error("Failed to parse link journal mode %s", optarg);
590 return -EINVAL;
591 }
592
593 break;
594
17fe0523 595 case ARG_BIND:
f757855e
LP
596 case ARG_BIND_RO:
597 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
598 if (r < 0)
599 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 600
f757855e 601 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 602 break;
06c17c39 603
f757855e
LP
604 case ARG_TMPFS:
605 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
606 if (r < 0)
607 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 608
f757855e 609 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 610 break;
5a8af538
LP
611
612 case ARG_OVERLAY:
613 case ARG_OVERLAY_RO: {
614 _cleanup_free_ char *upper = NULL, *destination = NULL;
615 _cleanup_strv_free_ char **lower = NULL;
616 CustomMount *m;
617 unsigned n = 0;
618 char **i;
619
62f9f39a
RM
620 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
621 if (r == -ENOMEM)
06c17c39 622 return log_oom();
62f9f39a
RM
623 else if (r < 0) {
624 log_error("Invalid overlay specification: %s", optarg);
625 return r;
626 }
06c17c39 627
5a8af538
LP
628 STRV_FOREACH(i, lower) {
629 if (!path_is_absolute(*i)) {
630 log_error("Overlay path %s is not absolute.", *i);
631 return -EINVAL;
632 }
633
634 n++;
635 }
636
637 if (n < 2) {
638 log_error("--overlay= needs at least two colon-separated directories specified.");
639 return -EINVAL;
640 }
641
642 if (n == 2) {
643 /* If two parameters are specified,
644 * the first one is the lower, the
645 * second one the upper directory. And
af86c440
ZJS
646 * we'll also define the destination
647 * mount point the same as the upper. */
5a8af538
LP
648 upper = lower[1];
649 lower[1] = NULL;
650
651 destination = strdup(upper);
652 if (!destination)
653 return log_oom();
654
655 } else {
656 upper = lower[n - 2];
657 destination = lower[n - 1];
658 lower[n - 2] = NULL;
659 }
660
f757855e 661 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
662 if (!m)
663 return log_oom();
664
665 m->destination = destination;
666 m->source = upper;
667 m->lower = lower;
668 m->read_only = c == ARG_OVERLAY_RO;
669
670 upper = destination = NULL;
671 lower = NULL;
06c17c39 672
f757855e 673 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
674 break;
675 }
676
f4889f65
LP
677 case ARG_SETENV: {
678 char **n;
679
680 if (!env_assignment_is_valid(optarg)) {
681 log_error("Environment variable assignment '%s' is not valid.", optarg);
682 return -EINVAL;
683 }
684
685 n = strv_env_set(arg_setenv, optarg);
686 if (!n)
687 return log_oom();
688
689 strv_free(arg_setenv);
690 arg_setenv = n;
f757855e
LP
691
692 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
693 break;
694 }
695
284c0b91
LP
696 case 'q':
697 arg_quiet = true;
698 break;
699
8a96d94e
LP
700 case ARG_SHARE_SYSTEM:
701 arg_share_system = true;
702 break;
703
eb91eb18
LP
704 case ARG_REGISTER:
705 r = parse_boolean(optarg);
706 if (r < 0) {
707 log_error("Failed to parse --register= argument: %s", optarg);
708 return r;
709 }
710
711 arg_register = r;
712 break;
713
89f7c846
LP
714 case ARG_KEEP_UNIT:
715 arg_keep_unit = true;
716 break;
717
6afc95b7
LP
718 case ARG_PERSONALITY:
719
ac45f971 720 arg_personality = personality_from_string(optarg);
050f7277 721 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
722 log_error("Unknown or unsupported personality '%s'.", optarg);
723 return -EINVAL;
724 }
725
f757855e 726 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
727 break;
728
4d9f07b4
LP
729 case ARG_VOLATILE:
730
731 if (!optarg)
f757855e 732 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 733 else {
f757855e 734 VolatileMode m;
4d9f07b4 735
f757855e
LP
736 m = volatile_mode_from_string(optarg);
737 if (m < 0) {
738 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 739 return -EINVAL;
f757855e
LP
740 } else
741 arg_volatile_mode = m;
6d0b55c2
LP
742 }
743
f757855e
LP
744 arg_settings_mask |= SETTING_VOLATILE_MODE;
745 break;
6d0b55c2 746
f757855e
LP
747 case 'p':
748 r = expose_port_parse(&arg_expose_ports, optarg);
749 if (r == -EEXIST)
750 return log_error_errno(r, "Duplicate port specification: %s", optarg);
751 if (r < 0)
752 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 753
f757855e 754 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 755 break;
6d0b55c2 756
f36933fe
LP
757 case ARG_PROPERTY:
758 if (strv_extend(&arg_property, optarg) < 0)
759 return log_oom();
760
761 break;
762
6dac160c
LP
763 case ARG_PRIVATE_USERS:
764 if (optarg) {
765 _cleanup_free_ char *buffer = NULL;
766 const char *range, *shift;
767
768 range = strchr(optarg, ':');
769 if (range) {
770 buffer = strndup(optarg, range - optarg);
771 if (!buffer)
772 return log_oom();
773 shift = buffer;
774
775 range++;
776 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
777 log_error("Failed to parse UID range: %s", range);
778 return -EINVAL;
779 }
780 } else
781 shift = optarg;
782
783 if (parse_uid(shift, &arg_uid_shift) < 0) {
784 log_error("Failed to parse UID: %s", optarg);
785 return -EINVAL;
786 }
787 }
788
789 arg_userns = true;
790 break;
791
c6c8f6e2
LP
792 case ARG_KILL_SIGNAL:
793 arg_kill_signal = signal_from_string_try_harder(optarg);
794 if (arg_kill_signal < 0) {
795 log_error("Cannot parse signal: %s", optarg);
796 return -EINVAL;
797 }
798
f757855e
LP
799 arg_settings_mask |= SETTING_KILL_SIGNAL;
800 break;
801
802 case ARG_SETTINGS:
803
804 /* no → do not read files
805 * yes → read files, do not override cmdline, trust only subset
806 * override → read files, override cmdline, trust only subset
807 * trusted → read files, do not override cmdline, trust all
808 */
809
810 r = parse_boolean(optarg);
811 if (r < 0) {
812 if (streq(optarg, "trusted")) {
813 mask_all_settings = false;
814 mask_no_settings = false;
815 arg_settings_trusted = true;
816
817 } else if (streq(optarg, "override")) {
818 mask_all_settings = false;
819 mask_no_settings = true;
820 arg_settings_trusted = -1;
821 } else
822 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
823 } else if (r > 0) {
824 /* yes */
825 mask_all_settings = false;
826 mask_no_settings = false;
827 arg_settings_trusted = -1;
828 } else {
829 /* no */
830 mask_all_settings = true;
831 mask_no_settings = false;
832 arg_settings_trusted = false;
833 }
834
c6c8f6e2
LP
835 break;
836
88213476
LP
837 case '?':
838 return -EINVAL;
839
840 default:
eb9da376 841 assert_not_reached("Unhandled option");
88213476 842 }
88213476 843
eb91eb18
LP
844 if (arg_share_system)
845 arg_register = false;
846
847 if (arg_boot && arg_share_system) {
848 log_error("--boot and --share-system may not be combined.");
849 return -EINVAL;
850 }
851
89f7c846
LP
852 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
853 log_error("--keep-unit may not be used when invoked from a user session.");
854 return -EINVAL;
855 }
856
1b9e5b12
LP
857 if (arg_directory && arg_image) {
858 log_error("--directory= and --image= may not be combined.");
859 return -EINVAL;
860 }
861
ec16945e
LP
862 if (arg_template && arg_image) {
863 log_error("--template= and --image= may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_template && !(arg_directory || arg_machine)) {
868 log_error("--template= needs --directory= or --machine=.");
869 return -EINVAL;
870 }
871
872 if (arg_ephemeral && arg_template) {
873 log_error("--ephemeral and --template= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_ephemeral && arg_image) {
878 log_error("--ephemeral and --image= may not be combined.");
879 return -EINVAL;
880 }
881
df9a75e4
LP
882 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
883 log_error("--ephemeral and --link-journal= may not be combined.");
884 return -EINVAL;
885 }
886
f757855e
LP
887 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
888 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
889
890 if (argc > optind) {
891 arg_parameters = strv_copy(argv + optind);
892 if (!arg_parameters)
893 return log_oom();
894
895 arg_settings_mask |= SETTING_BOOT;
896 }
897
898 /* Load all settings from .nspawn files */
899 if (mask_no_settings)
900 arg_settings_mask = 0;
901
902 /* Don't load any settings from .nspawn files */
903 if (mask_all_settings)
904 arg_settings_mask = _SETTINGS_MASK_ALL;
905
906 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
907
908 r = detect_unified_cgroup_hierarchy();
909 if (r < 0)
910 return r;
911
912 return 1;
913}
914
915static int verify_arguments(void) {
916
917 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
918 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
919 return -EINVAL;
920 }
921
6d0b55c2
LP
922 if (arg_expose_ports && !arg_private_network) {
923 log_error("Cannot use --port= without private networking.");
924 return -EINVAL;
925 }
926
c6c8f6e2
LP
927 if (arg_boot && arg_kill_signal <= 0)
928 arg_kill_signal = SIGRTMIN+3;
929
f757855e 930 return 0;
88213476
LP
931}
932
03cfe0d5
LP
933static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
934 assert(p);
935
936 if (!arg_userns)
937 return 0;
938
939 if (uid == UID_INVALID && gid == GID_INVALID)
940 return 0;
941
942 if (uid != UID_INVALID) {
943 uid += arg_uid_shift;
944
945 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
946 return -EOVERFLOW;
947 }
948
949 if (gid != GID_INVALID) {
950 gid += (gid_t) arg_uid_shift;
951
952 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
953 return -EOVERFLOW;
954 }
955
956 if (lchown(p, uid, gid) < 0)
957 return -errno;
b12afc8c
LP
958
959 return 0;
960}
961
03cfe0d5
LP
/* Creates the directory path below root with the given mode and hands it to
 * userns_lchown() for ownership adjustment.
 *
 * A directory that already exists is left alone, including its ownership,
 * and counts as success (0). Any other mkdir() failure is returned as
 * -errno. Otherwise the result of userns_lchown() is returned. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
974
e58a1277 975static int setup_timezone(const char *dest) {
03cfe0d5
LP
976 _cleanup_free_ char *p = NULL, *q = NULL;
977 const char *where, *check, *what;
d4036145
LP
978 char *z, *y;
979 int r;
f8440af5 980
e58a1277
LP
981 assert(dest);
982
983 /* Fix the timezone, if possible */
d4036145
LP
984 r = readlink_malloc("/etc/localtime", &p);
985 if (r < 0) {
986 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
987 return 0;
988 }
989
990 z = path_startswith(p, "../usr/share/zoneinfo/");
991 if (!z)
992 z = path_startswith(p, "/usr/share/zoneinfo/");
993 if (!z) {
994 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
995 return 0;
996 }
997
03cfe0d5 998 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
999 r = readlink_malloc(where, &q);
1000 if (r >= 0) {
1001 y = path_startswith(q, "../usr/share/zoneinfo/");
1002 if (!y)
1003 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1004
d4036145
LP
1005 /* Already pointing to the right place? Then do nothing .. */
1006 if (y && streq(y, z))
1007 return 0;
1008 }
1009
03cfe0d5
LP
1010 check = strjoina("/usr/share/zoneinfo/", z);
1011 check = prefix_root(dest, check);
1012 if (laccess(check, F_OK) < 0) {
d4036145
LP
1013 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1014 return 0;
1015 }
68fb0892 1016
79d80fc1
TG
1017 r = unlink(where);
1018 if (r < 0 && errno != ENOENT) {
56f64d95 1019 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1020 return 0;
1021 }
4d9f07b4 1022
03cfe0d5 1023 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1024 if (symlink(what, where) < 0) {
56f64d95 1025 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1026 return 0;
1027 }
e58a1277 1028
03cfe0d5
LP
1029 r = userns_lchown(where, 0, 0);
1030 if (r < 0)
1031 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1032
e58a1277 1033 return 0;
88213476
LP
1034}
1035
2547bb41 1036static int setup_resolv_conf(const char *dest) {
03cfe0d5 1037 const char *where = NULL;
79d80fc1 1038 int r;
2547bb41
LP
1039
1040 assert(dest);
1041
1042 if (arg_private_network)
1043 return 0;
1044
1045 /* Fix resolv.conf, if possible */
03cfe0d5 1046 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1047
f2068bcc 1048 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1049 if (r < 0) {
68a313c5
LP
1050 /* If the file already exists as symlink, let's
1051 * suppress the warning, under the assumption that
1052 * resolved or something similar runs inside and the
1053 * symlink points there.
1054 *
1055 * If the disk image is read-only, there's also no
1056 * point in complaining.
1057 */
1058 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1059 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1060 return 0;
1061 }
2547bb41 1062
03cfe0d5
LP
1063 r = userns_lchown(where, 0, 0);
1064 if (r < 0)
1065 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1066
2547bb41
LP
1067 return 0;
1068}
1069
9f24adc2 1070static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1071 assert(s);
9f24adc2
LP
1072
1073 snprintf(s, 37,
1074 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1075 SD_ID128_FORMAT_VAL(id));
1076
1077 return s;
1078}
1079
04bc4a3f 1080static int setup_boot_id(const char *dest) {
03cfe0d5 1081 const char *from, *to;
39883f62 1082 sd_id128_t rnd = {};
04bc4a3f
LP
1083 char as_uuid[37];
1084 int r;
1085
eb91eb18
LP
1086 if (arg_share_system)
1087 return 0;
1088
04bc4a3f
LP
1089 /* Generate a new randomized boot ID, so that each boot-up of
1090 * the container gets a new one */
1091
03cfe0d5
LP
1092 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1093 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1094
1095 r = sd_id128_randomize(&rnd);
f647962d
MS
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1098
9f24adc2 1099 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1100
4c1fc3e4 1101 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1104
03cfe0d5
LP
1105 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1106 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1107 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1108 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1109
1110 unlink(from);
04bc4a3f
LP
1111 return r;
1112}
1113
e58a1277 1114static int copy_devnodes(const char *dest) {
88213476
LP
1115
1116 static const char devnodes[] =
1117 "null\0"
1118 "zero\0"
1119 "full\0"
1120 "random\0"
1121 "urandom\0"
85614d66
TG
1122 "tty\0"
1123 "net/tun\0";
88213476
LP
1124
1125 const char *d;
e58a1277 1126 int r = 0;
7fd1b19b 1127 _cleanup_umask_ mode_t u;
a258bf26
LP
1128
1129 assert(dest);
124640f1
LP
1130
1131 u = umask(0000);
88213476 1132
03cfe0d5
LP
1133 /* Create /dev/net, so that we can create /dev/net/tun in it */
1134 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1135 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1136
88213476 1137 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1138 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1139 struct stat st;
88213476 1140
7f112f50 1141 from = strappend("/dev/", d);
03cfe0d5 1142 to = prefix_root(dest, from);
88213476
LP
1143
1144 if (stat(from, &st) < 0) {
1145
4a62c710
MS
1146 if (errno != ENOENT)
1147 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1148
a258bf26 1149 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1150
03cfe0d5 1151 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1152 return -EIO;
a258bf26 1153
85614d66 1154 } else {
81f5049b
AC
1155 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1156 if (errno != EPERM)
1157 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1158
1159 /* Some systems abusively restrict mknod but
1160 * allow bind mounts. */
1161 r = touch(to);
1162 if (r < 0)
1163 return log_error_errno(r, "touch (%s) failed: %m", to);
1164 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1165 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1166 }
6278cf60 1167
03cfe0d5
LP
1168 r = userns_lchown(to, 0, 0);
1169 if (r < 0)
1170 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1171 }
88213476
LP
1172 }
1173
e58a1277
LP
1174 return r;
1175}
88213476 1176
03cfe0d5
LP
1177static int setup_pts(const char *dest) {
1178 _cleanup_free_ char *options = NULL;
1179 const char *p;
1180
1181#ifdef HAVE_SELINUX
1182 if (arg_selinux_apifs_context)
1183 (void) asprintf(&options,
3dce8915 1184 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1185 arg_uid_shift + TTY_GID,
1186 arg_selinux_apifs_context);
1187 else
1188#endif
1189 (void) asprintf(&options,
3dce8915 1190 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1191 arg_uid_shift + TTY_GID);
f2d88580 1192
03cfe0d5 1193 if (!options)
f2d88580
LP
1194 return log_oom();
1195
03cfe0d5 1196 /* Mount /dev/pts itself */
cc9fce65 1197 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1198 if (mkdir(p, 0755) < 0)
1199 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1200 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1201 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1202 if (userns_lchown(p, 0, 0) < 0)
1203 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1204
1205 /* Create /dev/ptmx symlink */
1206 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1207 if (symlink("pts/ptmx", p) < 0)
1208 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1209 if (userns_lchown(p, 0, 0) < 0)
1210 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1211
03cfe0d5
LP
1212 /* And fix /dev/pts/ptmx ownership */
1213 p = prefix_roota(dest, "/dev/pts/ptmx");
1214 if (userns_lchown(p, 0, 0) < 0)
1215 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1216
f2d88580
LP
1217 return 0;
1218}
1219
e58a1277 1220static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1221 _cleanup_umask_ mode_t u;
1222 const char *to;
e58a1277 1223 int r;
e58a1277
LP
1224
1225 assert(dest);
1226 assert(console);
1227
1228 u = umask(0000);
1229
03cfe0d5 1230 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1231 if (r < 0)
1232 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1233
a258bf26
LP
1234 /* We need to bind mount the right tty to /dev/console since
1235 * ptys can only exist on pts file systems. To have something
81f5049b 1236 * to bind mount things on we create a empty regular file. */
a258bf26 1237
03cfe0d5 1238 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1239 r = touch(to);
1240 if (r < 0)
1241 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1242
4543768d 1243 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1244 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1245
25ea79fe 1246 return 0;
e58a1277
LP
1247}
1248
1249static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1250 const char *from, *to;
7fd1b19b 1251 _cleanup_umask_ mode_t u;
d9603714 1252 int fd, r;
e58a1277 1253
e58a1277 1254 assert(kmsg_socket >= 0);
a258bf26 1255
e58a1277 1256 u = umask(0000);
a258bf26 1257
03cfe0d5 1258 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1259 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1260 * on the reading side behave very similar to /proc/kmsg,
1261 * their writing side behaves differently from /dev/kmsg in
1262 * that writing blocks when nothing is reading. In order to
1263 * avoid any problems with containers deadlocking due to this
1264 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1265 from = prefix_roota(dest, "/run/kmsg");
1266 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1267
4a62c710 1268 if (mkfifo(from, 0600) < 0)
03cfe0d5 1269 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1270 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1271 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1272
1273 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1274 if (fd < 0)
1275 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1276
e58a1277
LP
1277 /* Store away the fd in the socket, so that it stays open as
1278 * long as we run the child */
3ee897d6 1279 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1280 safe_close(fd);
e58a1277 1281
d9603714
DH
1282 if (r < 0)
1283 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1284
03cfe0d5
LP
1285 /* And now make the FIFO unavailable as /run/kmsg... */
1286 (void) unlink(from);
1287
25ea79fe 1288 return 0;
88213476
LP
1289}
1290
1c4baffc 1291static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1292 union in_addr_union *exposed = userdata;
1293
1294 assert(rtnl);
1295 assert(m);
1296 assert(exposed);
1297
7a8f6325 1298 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1299 return 0;
1300}
1301
3a74cea5 1302static int setup_hostname(void) {
3a74cea5 1303
eb91eb18
LP
1304 if (arg_share_system)
1305 return 0;
1306
605f81a8 1307 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1308 return -errno;
3a74cea5 1309
7027ff61 1310 return 0;
3a74cea5
LP
1311}
1312
57fb9fb5 1313static int setup_journal(const char *directory) {
4d680aee 1314 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1315 _cleanup_free_ char *b = NULL, *d = NULL;
1316 const char *etc_machine_id, *p, *q;
27407a01 1317 char *id;
57fb9fb5
LP
1318 int r;
1319
df9a75e4
LP
1320 /* Don't link journals in ephemeral mode */
1321 if (arg_ephemeral)
1322 return 0;
1323
03cfe0d5 1324 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1325
03cfe0d5 1326 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1327 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1328 return 0;
f647962d 1329 else if (r < 0)
03cfe0d5 1330 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1331
27407a01
ZJS
1332 id = strstrip(b);
1333 if (isempty(id) && arg_link_journal == LINK_AUTO)
1334 return 0;
57fb9fb5 1335
27407a01
ZJS
1336 /* Verify validity */
1337 r = sd_id128_from_string(id, &machine_id);
f647962d 1338 if (r < 0)
03cfe0d5 1339 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1340
4d680aee 1341 r = sd_id128_get_machine(&this_id);
f647962d
MS
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1344
1345 if (sd_id128_equal(machine_id, this_id)) {
1346 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1347 "Host and machine ids are equal (%s): refusing to link journals", id);
1348 if (arg_link_journal == LINK_AUTO)
1349 return 0;
df9a75e4 1350 return -EEXIST;
4d680aee
ZJS
1351 }
1352
1353 if (arg_link_journal == LINK_NO)
1354 return 0;
1355
03cfe0d5
LP
1356 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to create /var: %m");
1359
1360 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to create /var/log: %m");
1363
1364 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1365 if (r < 0)
1366 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1367
1368 p = strjoina("/var/log/journal/", id);
1369 q = prefix_roota(directory, p);
27407a01 1370
e26d6ce5 1371 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1372 if (arg_link_journal != LINK_AUTO) {
1373 log_error("%s: already a mount point, refusing to use for journal", p);
1374 return -EEXIST;
1375 }
1376
1377 return 0;
57fb9fb5
LP
1378 }
1379
e26d6ce5 1380 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1381 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1382 log_error("%s: already a mount point, refusing to use for journal", q);
1383 return -EEXIST;
57fb9fb5
LP
1384 }
1385
27407a01 1386 return 0;
57fb9fb5
LP
1387 }
1388
1389 r = readlink_and_make_absolute(p, &d);
1390 if (r >= 0) {
1391 if ((arg_link_journal == LINK_GUEST ||
1392 arg_link_journal == LINK_AUTO) &&
1393 path_equal(d, q)) {
1394
03cfe0d5 1395 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1396 if (r < 0)
56f64d95 1397 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1398 return 0;
57fb9fb5
LP
1399 }
1400
4a62c710
MS
1401 if (unlink(p) < 0)
1402 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1403 } else if (r == -EINVAL) {
1404
1405 if (arg_link_journal == LINK_GUEST &&
1406 rmdir(p) < 0) {
1407
27407a01
ZJS
1408 if (errno == ENOTDIR) {
1409 log_error("%s already exists and is neither a symlink nor a directory", p);
1410 return r;
1411 } else {
56f64d95 1412 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1413 return -errno;
57fb9fb5 1414 }
57fb9fb5
LP
1415 }
1416 } else if (r != -ENOENT) {
56f64d95 1417 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1418 return r;
57fb9fb5
LP
1419 }
1420
1421 if (arg_link_journal == LINK_GUEST) {
1422
1423 if (symlink(q, p) < 0) {
574edc90 1424 if (arg_link_journal_try) {
56f64d95 1425 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1426 return 0;
1427 } else {
56f64d95 1428 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1429 return -errno;
1430 }
57fb9fb5
LP
1431 }
1432
03cfe0d5 1433 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1434 if (r < 0)
56f64d95 1435 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1436 return 0;
57fb9fb5
LP
1437 }
1438
1439 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1440 /* don't create parents here -- if the host doesn't have
1441 * permanent journal set up, don't force it here */
1442 r = mkdir(p, 0755);
57fb9fb5 1443 if (r < 0) {
574edc90 1444 if (arg_link_journal_try) {
56f64d95 1445 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1446 return 0;
1447 } else {
56f64d95 1448 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1449 return r;
1450 }
57fb9fb5
LP
1451 }
1452
27407a01
ZJS
1453 } else if (access(p, F_OK) < 0)
1454 return 0;
57fb9fb5 1455
cdb2b9d0
LP
1456 if (dir_is_empty(q) == 0)
1457 log_warning("%s is not empty, proceeding anyway.", q);
1458
03cfe0d5 1459 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 1460 if (r < 0) {
56f64d95 1461 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1462 return r;
57fb9fb5
LP
1463 }
1464
4543768d 1465 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1466 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1467
27407a01 1468 return 0;
57fb9fb5
LP
1469}
1470
88213476 1471static int drop_capabilities(void) {
5076f0cc 1472 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1473}
1474
db999e0f
LP
1475static int reset_audit_loginuid(void) {
1476 _cleanup_free_ char *p = NULL;
1477 int r;
1478
1479 if (arg_share_system)
1480 return 0;
1481
1482 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1483 if (r == -ENOENT)
db999e0f 1484 return 0;
f647962d
MS
1485 if (r < 0)
1486 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1487
1488 /* Already reset? */
1489 if (streq(p, "4294967295"))
1490 return 0;
1491
ad118bda 1492 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1493 if (r < 0) {
10a87006
LP
1494 log_error_errno(r,
1495 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1496 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1497 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1498 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1499 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1500
db999e0f 1501 sleep(5);
77b6e194 1502 }
db999e0f
LP
1503
1504 return 0;
77b6e194
LP
1505}
1506
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default. Each system call in the table
 * below is made to fail with EPERM unless the capability listed next to it
 * is set in arg_retain, and creating NETLINK_AUDIT sockets is made to fail
 * with EAFNOSUPPORT. Compiles to a no-op returning 0 without HAVE_SECCOMP.
 * Returns 0 on success (also when seccomp_load() reports -EINVAL) or a
 * negative errno-style value after logging. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(gated); k++) {
                /* A retained capability keeps its system calls usable */
                if (arg_retain & (1ULL << gated[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /* Audit does not work in containers, and much of the audit userspace
         * fails when run inside one. Simply refuse to create audit sockets:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with EAFNOSUPPORT,
         * which audit userspace takes as "audit is disabled in the kernel". */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Treated as "seccomp unavailable", not as a failure */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto out;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto out;
        }

out:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1601
785890ac
LP
1602static int setup_propagate(const char *root) {
1603 const char *p, *q;
1604
1605 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1606 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1607 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1608 (void) mkdir_p(p, 0600);
1609
03cfe0d5
LP
1610 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1611 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1612
1613 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1614 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1615
1616 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1617 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1618
03cfe0d5 1619 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1620 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1621 return log_error_errno(errno, "Failed to install propagation bind mount.");
1622
1623 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1624 return log_error_errno(errno, "Failed to make propagation mount read-only");
1625
1626 return 0;
1627}
1628
1b9e5b12
LP
1629static int setup_image(char **device_path, int *loop_nr) {
1630 struct loop_info64 info = {
1631 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1632 };
1633 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1634 _cleanup_free_ char* loopdev = NULL;
1635 struct stat st;
1636 int r, nr;
1637
1638 assert(device_path);
1639 assert(loop_nr);
ec16945e 1640 assert(arg_image);
1b9e5b12
LP
1641
1642 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1643 if (fd < 0)
1644 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1645
4a62c710
MS
1646 if (fstat(fd, &st) < 0)
1647 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1648
1649 if (S_ISBLK(st.st_mode)) {
1650 char *p;
1651
1652 p = strdup(arg_image);
1653 if (!p)
1654 return log_oom();
1655
1656 *device_path = p;
1657
1658 *loop_nr = -1;
1659
1660 r = fd;
1661 fd = -1;
1662
1663 return r;
1664 }
1665
1666 if (!S_ISREG(st.st_mode)) {
56f64d95 1667 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
1668 return -EINVAL;
1669 }
1670
1671 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1672 if (control < 0)
1673 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1674
1675 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1676 if (nr < 0)
1677 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1678
1679 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1680 return log_oom();
1681
1682 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1683 if (loop < 0)
1684 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1685
4a62c710
MS
1686 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1687 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1688
1689 if (arg_read_only)
1690 info.lo_flags |= LO_FLAGS_READ_ONLY;
1691
4a62c710
MS
1692 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1693 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1694
1695 *device_path = loopdev;
1696 loopdev = NULL;
1697
1698 *loop_nr = nr;
1699
1700 r = loop;
1701 loop = -1;
1702
1703 return r;
1704}
1705
ada4799a
LP
/* Explanatory text appended to the error messages that are logged when no
 * usable partition table is found on the disk image. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"         \
        "to be bootable with systemd-nspawn."
1712
1b9e5b12
LP
1713static int dissect_image(
1714 int fd,
727fd4fd
LP
1715 char **root_device, bool *root_device_rw,
1716 char **home_device, bool *home_device_rw,
1717 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1718 bool *secondary) {
1719
1720#ifdef HAVE_BLKID
01dc33ce
ZJS
1721 int home_nr = -1, srv_nr = -1;
1722#ifdef GPT_ROOT_NATIVE
1723 int root_nr = -1;
1724#endif
1725#ifdef GPT_ROOT_SECONDARY
1726 int secondary_root_nr = -1;
1727#endif
f6c51a81 1728 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1729 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1730 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1731 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1732 _cleanup_udev_unref_ struct udev *udev = NULL;
1733 struct udev_list_entry *first, *item;
f6c51a81 1734 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1735 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1736 const char *pttype = NULL;
1737 blkid_partlist pl;
1738 struct stat st;
c09ef2e4 1739 unsigned i;
1b9e5b12
LP
1740 int r;
1741
1742 assert(fd >= 0);
1743 assert(root_device);
1744 assert(home_device);
1745 assert(srv_device);
1746 assert(secondary);
ec16945e 1747 assert(arg_image);
1b9e5b12
LP
1748
1749 b = blkid_new_probe();
1750 if (!b)
1751 return log_oom();
1752
1753 errno = 0;
1754 r = blkid_probe_set_device(b, fd, 0, 0);
1755 if (r != 0) {
1756 if (errno == 0)
1757 return log_oom();
1758
56f64d95 1759 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1760 return -errno;
1761 }
1762
1763 blkid_probe_enable_partitions(b, 1);
1764 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1765
1766 errno = 0;
1767 r = blkid_do_safeprobe(b);
1768 if (r == -2 || r == 1) {
ada4799a
LP
1769 log_error("Failed to identify any partition table on\n"
1770 " %s\n"
1771 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1772 return -EINVAL;
1773 } else if (r != 0) {
1774 if (errno == 0)
1775 errno = EIO;
56f64d95 1776 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1777 return -errno;
1778 }
1779
48861960 1780 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1781
1782 is_gpt = streq_ptr(pttype, "gpt");
1783 is_mbr = streq_ptr(pttype, "dos");
1784
1785 if (!is_gpt && !is_mbr) {
1786 log_error("No GPT or MBR partition table discovered on\n"
1787 " %s\n"
1788 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1789 return -EINVAL;
1790 }
1791
1792 errno = 0;
1793 pl = blkid_probe_get_partitions(b);
1794 if (!pl) {
1795 if (errno == 0)
1796 return log_oom();
1797
1798 log_error("Failed to list partitions of %s", arg_image);
1799 return -errno;
1800 }
1801
1802 udev = udev_new();
1803 if (!udev)
1804 return log_oom();
1805
4a62c710
MS
1806 if (fstat(fd, &st) < 0)
1807 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1808
c09ef2e4
LP
1809 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1810 if (!d)
1b9e5b12
LP
1811 return log_oom();
1812
c09ef2e4
LP
1813 for (i = 0;; i++) {
1814 int n, m;
1b9e5b12 1815
c09ef2e4
LP
1816 if (i >= 10) {
1817 log_error("Kernel partitions never appeared.");
1818 return -ENXIO;
1819 }
1820
1821 e = udev_enumerate_new(udev);
1822 if (!e)
1823 return log_oom();
1824
1825 r = udev_enumerate_add_match_parent(e, d);
1826 if (r < 0)
1827 return log_oom();
1828
1829 r = udev_enumerate_scan_devices(e);
1830 if (r < 0)
1831 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1832
1833 /* Count the partitions enumerated by the kernel */
1834 n = 0;
1835 first = udev_enumerate_get_list_entry(e);
1836 udev_list_entry_foreach(item, first)
1837 n++;
1838
1839 /* Count the partitions enumerated by blkid */
1840 m = blkid_partlist_numof_partitions(pl);
1841 if (n == m + 1)
1842 break;
1843 if (n > m + 1) {
1844 log_error("blkid and kernel partition list do not match.");
1845 return -EIO;
1846 }
1847 if (n < m + 1) {
1848 unsigned j;
1849
1850 /* The kernel has probed fewer partitions than
1851 * blkid? Maybe the kernel prober is still
1852 * running or it got EBUSY because udev
1853 * already opened the device. Let's reprobe
1854 * the device, which is a synchronous call
1855 * that waits until probing is complete. */
1856
1857 for (j = 0; j < 20; j++) {
1858
1859 r = ioctl(fd, BLKRRPART, 0);
1860 if (r < 0)
1861 r = -errno;
1862 if (r >= 0 || r != -EBUSY)
1863 break;
1864
1865 /* If something else has the device
1866 * open, such as an udev rule, the
1867 * ioctl will return EBUSY. Since
1868 * there's no way to wait until it
1869 * isn't busy anymore, let's just wait
1870 * a bit, and try again.
1871 *
1872 * This is really something they
1873 * should fix in the kernel! */
1874
1875 usleep(50 * USEC_PER_MSEC);
1876 }
1877
1878 if (r < 0)
1879 return log_error_errno(r, "Failed to reread partition table: %m");
1880 }
1881
1882 e = udev_enumerate_unref(e);
1883 }
1b9e5b12
LP
1884
1885 first = udev_enumerate_get_list_entry(e);
1886 udev_list_entry_foreach(item, first) {
1887 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1888 const char *node;
727fd4fd 1889 unsigned long long flags;
1b9e5b12
LP
1890 blkid_partition pp;
1891 dev_t qn;
1892 int nr;
1893
1894 errno = 0;
1895 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1896 if (!q) {
1897 if (!errno)
1898 errno = ENOMEM;
1899
56f64d95 1900 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1901 return -errno;
1902 }
1903
1904 qn = udev_device_get_devnum(q);
1905 if (major(qn) == 0)
1906 continue;
1907
1908 if (st.st_rdev == qn)
1909 continue;
1910
1911 node = udev_device_get_devnode(q);
1912 if (!node)
1913 continue;
1914
1915 pp = blkid_partlist_devno_to_partition(pl, qn);
1916 if (!pp)
1917 continue;
1918
727fd4fd 1919 flags = blkid_partition_get_flags(pp);
727fd4fd 1920
1b9e5b12
LP
1921 nr = blkid_partition_get_partno(pp);
1922 if (nr < 0)
1923 continue;
1924
ada4799a
LP
1925 if (is_gpt) {
1926 sd_id128_t type_id;
1927 const char *stype;
1b9e5b12 1928
f6c51a81
LP
1929 if (flags & GPT_FLAG_NO_AUTO)
1930 continue;
1931
ada4799a
LP
1932 stype = blkid_partition_get_type_string(pp);
1933 if (!stype)
1934 continue;
1b9e5b12 1935
ada4799a 1936 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1937 continue;
1938
ada4799a 1939 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1940
ada4799a
LP
1941 if (home && nr >= home_nr)
1942 continue;
1b9e5b12 1943
ada4799a
LP
1944 home_nr = nr;
1945 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1946
ada4799a
LP
1947 r = free_and_strdup(&home, node);
1948 if (r < 0)
1949 return log_oom();
727fd4fd 1950
ada4799a
LP
1951 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1952
1953 if (srv && nr >= srv_nr)
1954 continue;
1955
1956 srv_nr = nr;
1957 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1958
1959 r = free_and_strdup(&srv, node);
1960 if (r < 0)
1961 return log_oom();
1962 }
1b9e5b12 1963#ifdef GPT_ROOT_NATIVE
ada4799a 1964 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1965
ada4799a
LP
1966 if (root && nr >= root_nr)
1967 continue;
1b9e5b12 1968
ada4799a
LP
1969 root_nr = nr;
1970 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1971
ada4799a
LP
1972 r = free_and_strdup(&root, node);
1973 if (r < 0)
1974 return log_oom();
1975 }
1b9e5b12
LP
1976#endif
1977#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1978 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1979
1980 if (secondary_root && nr >= secondary_root_nr)
1981 continue;
1982
1983 secondary_root_nr = nr;
1984 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1985
1986 r = free_and_strdup(&secondary_root, node);
1987 if (r < 0)
1988 return log_oom();
1989 }
1990#endif
f6c51a81
LP
1991 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
1992
1993 if (generic)
1994 multiple_generic = true;
1995 else {
1996 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
1997
1998 r = free_and_strdup(&generic, node);
1999 if (r < 0)
2000 return log_oom();
2001 }
2002 }
ada4799a
LP
2003
2004 } else if (is_mbr) {
2005 int type;
1b9e5b12 2006
f6c51a81
LP
2007 if (flags != 0x80) /* Bootable flag */
2008 continue;
2009
ada4799a
LP
2010 type = blkid_partition_get_type(pp);
2011 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2012 continue;
2013
f6c51a81
LP
2014 if (generic)
2015 multiple_generic = true;
2016 else {
2017 generic_rw = true;
727fd4fd 2018
f6c51a81
LP
2019 r = free_and_strdup(&root, node);
2020 if (r < 0)
2021 return log_oom();
2022 }
1b9e5b12 2023 }
1b9e5b12
LP
2024 }
2025
1b9e5b12
LP
2026 if (root) {
2027 *root_device = root;
2028 root = NULL;
727fd4fd
LP
2029
2030 *root_device_rw = root_rw;
1b9e5b12
LP
2031 *secondary = false;
2032 } else if (secondary_root) {
2033 *root_device = secondary_root;
2034 secondary_root = NULL;
727fd4fd
LP
2035
2036 *root_device_rw = secondary_root_rw;
1b9e5b12 2037 *secondary = true;
f6c51a81
LP
2038 } else if (generic) {
2039
2040 /* There were no partitions with precise meanings
2041 * around, but we found generic partitions. In this
2042 * case, if there's only one, we can go ahead and boot
2043 * it, otherwise we bail out, because we really cannot
2044 * make any sense of it. */
2045
2046 if (multiple_generic) {
2047 log_error("Identified multiple bootable Linux partitions on\n"
2048 " %s\n"
2049 PARTITION_TABLE_BLURB, arg_image);
2050 return -EINVAL;
2051 }
2052
2053 *root_device = generic;
2054 generic = NULL;
2055
2056 *root_device_rw = generic_rw;
2057 *secondary = false;
2058 } else {
2059 log_error("Failed to identify root partition in disk image\n"
2060 " %s\n"
2061 PARTITION_TABLE_BLURB, arg_image);
2062 return -EINVAL;
1b9e5b12
LP
2063 }
2064
2065 if (home) {
2066 *home_device = home;
2067 home = NULL;
727fd4fd
LP
2068
2069 *home_device_rw = home_rw;
1b9e5b12
LP
2070 }
2071
2072 if (srv) {
2073 *srv_device = srv;
2074 srv = NULL;
727fd4fd
LP
2075
2076 *srv_device_rw = srv_rw;
1b9e5b12
LP
2077 }
2078
2079 return 0;
2080#else
2081 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2082 return -EOPNOTSUPP;
1b9e5b12
LP
2083#endif
2084}
2085
/*
 * Probe the file system type of the block device "what" with libblkid and
 * mount it.
 *
 *   what      - device node to mount
 *   where     - base directory of the mount point
 *   directory - optional suffix appended to "where" (may be NULL)
 *   rw        - mount writable if true; forced to false when arg_read_only is set
 *
 * Returns 0 on success and a negative errno-style value on failure. LUKS
 * containers are rejected with -EOPNOTSUPP. Without blkid support compiled
 * in, always fails with -EOPNOTSUPP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first and
         * treat "still zero" as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Neither is an
                 * I/O error, we simply cannot tell what is on the device. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Genuine probing error (-1) */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2149
727fd4fd
LP
2150static int mount_devices(
2151 const char *where,
2152 const char *root_device, bool root_device_rw,
2153 const char *home_device, bool home_device_rw,
2154 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2155 int r;
2156
2157 assert(where);
2158
2159 if (root_device) {
727fd4fd 2160 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2161 if (r < 0)
2162 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2163 }
2164
2165 if (home_device) {
727fd4fd 2166 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2169 }
2170
2171 if (srv_device) {
727fd4fd 2172 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2175 }
2176
2177 return 0;
2178}
2179
2180static void loop_remove(int nr, int *image_fd) {
2181 _cleanup_close_ int control = -1;
e8c8ddcc 2182 int r;
1b9e5b12
LP
2183
2184 if (nr < 0)
2185 return;
2186
2187 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2188 r = ioctl(*image_fd, LOOP_CLR_FD);
2189 if (r < 0)
5e4074aa 2190 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2191 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2192 }
2193
2194 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2195 if (control < 0) {
56f64d95 2196 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2197 return;
e8c8ddcc 2198 }
1b9e5b12 2199
e8c8ddcc
TG
2200 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2201 if (r < 0)
5e4074aa 2202 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2203}
2204
113cea80 2205/*
6d416b9c
LS
2206 * Return values:
2207 * < 0 : wait_for_terminate() failed to get the state of the
2208 * container, the container was terminated by a signal, or
2209 * failed for an unknown reason. No change is made to the
2210 * container argument.
2211 * > 0 : The program executed in the container terminated with an
2212 * error. The exit code of the program executed in the
919699ec
LP
2213 * container is returned. The container argument has been set
2214 * to CONTAINER_TERMINATED.
6d416b9c
LS
2215 * 0 : The container is being rebooted, has been shut down or exited
2216 * successfully. The container argument has been set to either
2217 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2218 *
6d416b9c
LS
2219 * That is, success is indicated by a return value of zero, and an
2220 * error is indicated by a non-zero value.
113cea80
DH
2221 */
2222static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2223 siginfo_t status;
919699ec 2224 int r;
113cea80
DH
2225
2226 r = wait_for_terminate(pid, &status);
f647962d
MS
2227 if (r < 0)
2228 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2229
2230 switch (status.si_code) {
fddbb89c 2231
113cea80 2232 case CLD_EXITED:
919699ec
LP
2233 if (status.si_status == 0) {
2234 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2235
fddbb89c 2236 } else
919699ec 2237 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2238
919699ec
LP
2239 *container = CONTAINER_TERMINATED;
2240 return status.si_status;
113cea80
DH
2241
2242 case CLD_KILLED:
2243 if (status.si_status == SIGINT) {
113cea80 2244
919699ec 2245 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2246 *container = CONTAINER_TERMINATED;
919699ec
LP
2247 return 0;
2248
113cea80 2249 } else if (status.si_status == SIGHUP) {
113cea80 2250
919699ec 2251 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2252 *container = CONTAINER_REBOOTED;
919699ec 2253 return 0;
113cea80 2254 }
919699ec 2255
113cea80
DH
2256 /* CLD_KILLED fallthrough */
2257
2258 case CLD_DUMPED:
fddbb89c 2259 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2260 return -EIO;
113cea80
DH
2261
2262 default:
fddbb89c 2263 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2264 return -EIO;
113cea80
DH
2265 }
2266
2267 return r;
2268}
2269
023fb90b
LP
2270static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2271 pid_t pid;
2272
2273 pid = PTR_TO_UINT32(userdata);
2274 if (pid > 0) {
c6c8f6e2 2275 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2276 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2277 sd_event_source_set_userdata(s, NULL);
2278 return 0;
2279 }
2280 }
2281
2282 sd_event_exit(sd_event_source_get_event(s), 0);
2283 return 0;
2284}
2285
ec16945e 2286static int determine_names(void) {
1b9cebf6 2287 int r;
ec16945e 2288
c1521918
LP
2289 if (arg_template && !arg_directory && arg_machine) {
2290
2291 /* If --template= was specified then we should not
2292 * search for a machine, but instead create a new one
2293 * in /var/lib/machine. */
2294
2295 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2296 if (!arg_directory)
2297 return log_oom();
2298 }
2299
ec16945e 2300 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2301 if (arg_machine) {
2302 _cleanup_(image_unrefp) Image *i = NULL;
2303
2304 r = image_find(arg_machine, &i);
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2307 else if (r == 0) {
2308 log_error("No image for machine '%s': %m", arg_machine);
2309 return -ENOENT;
2310 }
2311
aceac2f0 2312 if (i->type == IMAGE_RAW)
0f03c2a4 2313 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2314 else
0f03c2a4 2315 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2316 if (r < 0)
2317 return log_error_errno(r, "Invalid image directory: %m");
2318
aee327b8
LP
2319 if (!arg_ephemeral)
2320 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2321 } else
ec16945e
LP
2322 arg_directory = get_current_dir_name();
2323
1b9cebf6
LP
2324 if (!arg_directory && !arg_machine) {
2325 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2326 return -EINVAL;
2327 }
2328 }
2329
2330 if (!arg_machine) {
b9ba4dab
LP
2331 if (arg_directory && path_equal(arg_directory, "/"))
2332 arg_machine = gethostname_malloc();
2333 else
2334 arg_machine = strdup(basename(arg_image ?: arg_directory));
2335
ec16945e
LP
2336 if (!arg_machine)
2337 return log_oom();
2338
ae691c1d 2339 hostname_cleanup(arg_machine);
ec16945e
LP
2340 if (!machine_name_is_valid(arg_machine)) {
2341 log_error("Failed to determine machine name automatically, please use -M.");
2342 return -EINVAL;
2343 }
b9ba4dab
LP
2344
2345 if (arg_ephemeral) {
2346 char *b;
2347
2348 /* Add a random suffix when this is an
2349 * ephemeral machine, so that we can run many
2350 * instances at once without manually having
2351 * to specify -M each time. */
2352
2353 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2354 return log_oom();
2355
2356 free(arg_machine);
2357 arg_machine = b;
2358 }
ec16945e
LP
2359 }
2360
2361 return 0;
2362}
2363
03cfe0d5 2364static int determine_uid_shift(const char *directory) {
6dac160c
LP
2365 int r;
2366
03cfe0d5
LP
2367 if (!arg_userns) {
2368 arg_uid_shift = 0;
6dac160c 2369 return 0;
03cfe0d5 2370 }
6dac160c
LP
2371
2372 if (arg_uid_shift == UID_INVALID) {
2373 struct stat st;
2374
03cfe0d5 2375 r = stat(directory, &st);
6dac160c 2376 if (r < 0)
03cfe0d5 2377 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2378
2379 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2380
2381 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2382 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2383 return -EINVAL;
2384 }
2385
2386 arg_uid_range = UINT32_C(0x10000);
2387 }
2388
2389 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2390 log_error("UID base too high for UID range.");
2391 return -EINVAL;
2392 }
2393
2394 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2395 return 0;
2396}
2397
03cfe0d5
LP
2398static int inner_child(
2399 Barrier *barrier,
2400 const char *directory,
2401 bool secondary,
2402 int kmsg_socket,
2403 int rtnl_socket,
f757855e 2404 FDSet *fds) {
69c79d3c 2405
03cfe0d5
LP
2406 _cleanup_free_ char *home = NULL;
2407 unsigned n_env = 2;
2408 const char *envp[] = {
2409 "PATH=" DEFAULT_PATH_SPLIT_USR,
2410 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2411 NULL, /* TERM */
2412 NULL, /* HOME */
2413 NULL, /* USER */
2414 NULL, /* LOGNAME */
2415 NULL, /* container_uuid */
2416 NULL, /* LISTEN_FDS */
2417 NULL, /* LISTEN_PID */
2418 NULL
2419 };
88213476 2420
2371271c 2421 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2422 int r;
88213476 2423
03cfe0d5
LP
2424 assert(barrier);
2425 assert(directory);
2426 assert(kmsg_socket >= 0);
88213476 2427
efdb0237
LP
2428 cg_unified_flush();
2429
03cfe0d5
LP
2430 if (arg_userns) {
2431 /* Tell the parent, that it now can write the UID map. */
2432 (void) barrier_place(barrier); /* #1 */
7027ff61 2433
03cfe0d5
LP
2434 /* Wait until the parent wrote the UID map */
2435 if (!barrier_place_and_sync(barrier)) { /* #2 */
2436 log_error("Parent died too early");
2437 return -ESRCH;
2438 }
88213476
LP
2439 }
2440
d1678248 2441 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2442 if (r < 0)
2443 return r;
2444
d8fc6a00
LP
2445 r = mount_sysfs(NULL);
2446 if (r < 0)
2447 return r;
2448
03cfe0d5
LP
2449 /* Wait until we are cgroup-ified, so that we
2450 * can mount the right cgroup path writable */
2451 if (!barrier_place_and_sync(barrier)) { /* #3 */
2452 log_error("Parent died too early");
2453 return -ESRCH;
88213476
LP
2454 }
2455
e83bebef 2456 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2457 if (r < 0)
2458 return r;
ec16945e 2459
03cfe0d5
LP
2460 r = reset_uid_gid();
2461 if (r < 0)
2462 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2463
03cfe0d5
LP
2464 r = setup_boot_id(NULL);
2465 if (r < 0)
2466 return r;
ec16945e 2467
03cfe0d5
LP
2468 r = setup_kmsg(NULL, kmsg_socket);
2469 if (r < 0)
2470 return r;
2471 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2472
03cfe0d5 2473 umask(0022);
30535c16 2474
03cfe0d5
LP
2475 if (setsid() < 0)
2476 return log_error_errno(errno, "setsid() failed: %m");
2477
2478 if (arg_private_network)
2479 loopback_setup();
2480
7a8f6325
LP
2481 if (arg_expose_ports) {
2482 r = expose_port_send_rtnl(rtnl_socket);
2483 if (r < 0)
2484 return r;
2485 rtnl_socket = safe_close(rtnl_socket);
2486 }
03cfe0d5
LP
2487
2488 if (drop_capabilities() < 0)
2489 return log_error_errno(errno, "drop_capabilities() failed: %m");
2490
2491 setup_hostname();
2492
050f7277 2493 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2494 if (personality(arg_personality) < 0)
2495 return log_error_errno(errno, "personality() failed: %m");
2496 } else if (secondary) {
2497 if (personality(PER_LINUX32) < 0)
2498 return log_error_errno(errno, "personality() failed: %m");
2499 }
2500
2501#ifdef HAVE_SELINUX
2502 if (arg_selinux_context)
2503 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2504 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2505#endif
2506
ee645080 2507 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2508 if (r < 0)
2509 return r;
2510
2511 envp[n_env] = strv_find_prefix(environ, "TERM=");
2512 if (envp[n_env])
2513 n_env ++;
2514
2515 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2516 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2517 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2518 return log_oom();
2519
2520 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2521 char as_uuid[37];
2522
2523 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2524 return log_oom();
2525 }
2526
2527 if (fdset_size(fds) > 0) {
2528 r = fdset_cloexec(fds, false);
2529 if (r < 0)
2530 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2531
2532 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2533 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2534 return log_oom();
2535 }
2536
2371271c
TG
2537 env_use = strv_env_merge(2, envp, arg_setenv);
2538 if (!env_use)
2539 return log_oom();
03cfe0d5
LP
2540
2541 /* Let the parent know that we are ready and
2542 * wait until the parent is ready with the
2543 * setup, too... */
2544 if (!barrier_place_and_sync(barrier)) { /* #4 */
2545 log_error("Parent died too early");
2546 return -ESRCH;
2547 }
2548
2549 /* Now, explicitly close the log, so that we
2550 * then can close all remaining fds. Closing
2551 * the log explicitly first has the benefit
2552 * that the logging subsystem knows about it,
2553 * and is thus ready to be reopened should we
2554 * need it again. Note that the other fds
2555 * closed here are at least the locking and
2556 * barrier fds. */
2557 log_close();
2558 (void) fdset_close_others(fds);
2559
2560 if (arg_boot) {
2561 char **a;
2562 size_t m;
2563
2564 /* Automatically search for the init system */
2565
f757855e 2566 m = 1 + strv_length(arg_parameters);
03cfe0d5 2567 a = newa(char*, m + 1);
f757855e
LP
2568 if (strv_isempty(arg_parameters))
2569 a[1] = NULL;
2570 else
2571 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2572
2573 a[0] = (char*) "/usr/lib/systemd/systemd";
2574 execve(a[0], a, env_use);
2575
2576 a[0] = (char*) "/lib/systemd/systemd";
2577 execve(a[0], a, env_use);
2578
2579 a[0] = (char*) "/sbin/init";
2580 execve(a[0], a, env_use);
f757855e
LP
2581 } else if (!strv_isempty(arg_parameters))
2582 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2583 else {
f757855e 2584 chdir(home ?: "/root");
03cfe0d5
LP
2585 execle("/bin/bash", "-bash", NULL, env_use);
2586 execle("/bin/sh", "-sh", NULL, env_use);
2587 }
2588
2589 (void) log_open();
2590 return log_error_errno(errno, "execv() failed: %m");
2591}
2592
2593static int outer_child(
2594 Barrier *barrier,
2595 const char *directory,
2596 const char *console,
2597 const char *root_device, bool root_device_rw,
2598 const char *home_device, bool home_device_rw,
2599 const char *srv_device, bool srv_device_rw,
2600 bool interactive,
2601 bool secondary,
2602 int pid_socket,
2603 int kmsg_socket,
2604 int rtnl_socket,
825d5287 2605 int uid_shift_socket,
f757855e 2606 FDSet *fds) {
03cfe0d5
LP
2607
2608 pid_t pid;
2609 ssize_t l;
2610 int r;
2611
2612 assert(barrier);
2613 assert(directory);
2614 assert(console);
2615 assert(pid_socket >= 0);
2616 assert(kmsg_socket >= 0);
2617
efdb0237
LP
2618 cg_unified_flush();
2619
03cfe0d5
LP
2620 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2621 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2622
2623 if (interactive) {
2624 close_nointr(STDIN_FILENO);
2625 close_nointr(STDOUT_FILENO);
2626 close_nointr(STDERR_FILENO);
2627
2628 r = open_terminal(console, O_RDWR);
2629 if (r != STDIN_FILENO) {
2630 if (r >= 0) {
2631 safe_close(r);
2632 r = -EINVAL;
2633 }
2634
2635 return log_error_errno(r, "Failed to open console: %m");
2636 }
2637
2638 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2639 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2640 return log_error_errno(errno, "Failed to duplicate console: %m");
2641 }
2642
2643 r = reset_audit_loginuid();
2644 if (r < 0)
2645 return r;
2646
2647 /* Mark everything as slave, so that we still
2648 * receive mounts from the real root, but don't
2649 * propagate mounts to the real root. */
2650 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2651 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2652
2653 r = mount_devices(directory,
2654 root_device, root_device_rw,
2655 home_device, home_device_rw,
2656 srv_device, srv_device_rw);
2657 if (r < 0)
2658 return r;
2659
391567f4
LP
2660 r = determine_uid_shift(directory);
2661 if (r < 0)
2662 return r;
2663
825d5287
RM
2664 if (arg_userns) {
2665 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2666 if (l < 0)
2667 return log_error_errno(errno, "Failed to send UID shift: %m");
2668 if (l != sizeof(arg_uid_shift)) {
2669 log_error("Short write while sending UID shift.");
2670 return -EIO;
2671 }
2672 }
2673
03cfe0d5
LP
2674 /* Turn directory into bind mount */
2675 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2676 return log_error_errno(errno, "Failed to make bind mount: %m");
2677
e83bebef 2678 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2679 if (r < 0)
2680 return r;
2681
e83bebef 2682 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2683 if (r < 0)
2684 return r;
2685
03cfe0d5
LP
2686 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2687 if (r < 0)
2688 return r;
2689
03cfe0d5
LP
2690 if (arg_read_only) {
2691 r = bind_remount_recursive(directory, true);
2692 if (r < 0)
2693 return log_error_errno(r, "Failed to make tree read-only: %m");
2694 }
2695
d1678248 2696 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2697 if (r < 0)
2698 return r;
2699
07fa00f9
LP
2700 r = copy_devnodes(directory);
2701 if (r < 0)
03cfe0d5
LP
2702 return r;
2703
2704 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2705
07fa00f9
LP
2706 r = setup_pts(directory);
2707 if (r < 0)
03cfe0d5
LP
2708 return r;
2709
2710 r = setup_propagate(directory);
2711 if (r < 0)
2712 return r;
2713
2714 r = setup_dev_console(directory, console);
2715 if (r < 0)
2716 return r;
2717
2718 r = setup_seccomp();
2719 if (r < 0)
2720 return r;
2721
2722 r = setup_timezone(directory);
2723 if (r < 0)
2724 return r;
2725
2726 r = setup_resolv_conf(directory);
2727 if (r < 0)
2728 return r;
2729
2730 r = setup_journal(directory);
2731 if (r < 0)
2732 return r;
2733
e83bebef 2734 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2735 if (r < 0)
2736 return r;
2737
e83bebef 2738 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2739 if (r < 0)
2740 return r;
2741
2742 r = mount_move_root(directory);
2743 if (r < 0)
2744 return log_error_errno(r, "Failed to move root directory: %m");
2745
2746 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2747 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2748 (arg_private_network ? CLONE_NEWNET : 0) |
2749 (arg_userns ? CLONE_NEWUSER : 0),
2750 NULL);
2751 if (pid < 0)
2752 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2753 if (pid == 0) {
2754 pid_socket = safe_close(pid_socket);
825d5287 2755 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2756
2757 /* The inner child has all namespaces that are
2758 * requested, so that we all are owned by the user if
2759 * user namespaces are turned on. */
2760
f757855e 2761 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2762 if (r < 0)
2763 _exit(EXIT_FAILURE);
2764
2765 _exit(EXIT_SUCCESS);
2766 }
2767
2768 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2769 if (l < 0)
2770 return log_error_errno(errno, "Failed to send PID: %m");
2771 if (l != sizeof(pid)) {
2772 log_error("Short write while sending PID.");
2773 return -EIO;
2774 }
2775
2776 pid_socket = safe_close(pid_socket);
327e26d6
KN
2777 kmsg_socket = safe_close(kmsg_socket);
2778 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2779
2780 return 0;
2781}
2782
2783static int setup_uid_map(pid_t pid) {
2784 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2785 int r;
2786
2787 assert(pid > 1);
2788
2789 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2790 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2791 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2792 if (r < 0)
2793 return log_error_errno(r, "Failed to write UID map: %m");
2794
2795 /* We always assign the same UID and GID ranges */
2796 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2797 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2798 if (r < 0)
2799 return log_error_errno(r, "Failed to write GID map: %m");
2800
2801 return 0;
2802}
2803
f757855e
LP
2804static int load_settings(void) {
2805 _cleanup_(settings_freep) Settings *settings = NULL;
2806 _cleanup_fclose_ FILE *f = NULL;
2807 _cleanup_free_ char *p = NULL;
2808 const char *fn, *i;
2809 int r;
2810
2811 /* If all settings are masked, there's no point in looking for
2812 * the settings file */
2813 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2814 return 0;
2815
2816 fn = strjoina(arg_machine, ".nspawn");
2817
2818 /* We first look in the admin's directories in /etc and /run */
2819 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2820 _cleanup_free_ char *j = NULL;
2821
2822 j = strjoin(i, "/", fn, NULL);
2823 if (!j)
2824 return log_oom();
2825
2826 f = fopen(j, "re");
2827 if (f) {
2828 p = j;
2829 j = NULL;
2830
b938cb90 2831 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2832 if (arg_settings_trusted < 0)
2833 arg_settings_trusted = true;
2834
2835 break;
2836 }
2837
2838 if (errno != ENOENT)
2839 return log_error_errno(errno, "Failed to open %s: %m", j);
2840 }
2841
2842 if (!f) {
2843 /* After that, let's look for a file next to the
2844 * actual image we shall boot. */
2845
2846 if (arg_image) {
2847 p = file_in_same_dir(arg_image, fn);
2848 if (!p)
2849 return log_oom();
2850 } else if (arg_directory) {
2851 p = file_in_same_dir(arg_directory, fn);
2852 if (!p)
2853 return log_oom();
2854 }
2855
2856 if (p) {
2857 f = fopen(p, "re");
2858 if (!f && errno != ENOENT)
2859 return log_error_errno(errno, "Failed to open %s: %m", p);
2860
b938cb90 2861 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2862 if (arg_settings_trusted < 0)
2863 arg_settings_trusted = false;
2864 }
2865 }
2866
2867 if (!f)
2868 return 0;
2869
2870 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2871
2872 r = settings_load(f, p, &settings);
2873 if (r < 0)
2874 return r;
2875
2876 /* Copy over bits from the settings, unless they have been
2877 * explicitly masked by command line switches. */
2878
2879 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2880 settings->boot >= 0) {
2881 arg_boot = settings->boot;
2882
2883 strv_free(arg_parameters);
2884 arg_parameters = settings->parameters;
2885 settings->parameters = NULL;
2886 }
2887
2888 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2889 settings->environment) {
2890 strv_free(arg_setenv);
2891 arg_setenv = settings->environment;
2892 settings->environment = NULL;
2893 }
2894
2895 if ((arg_settings_mask & SETTING_USER) == 0 &&
2896 settings->user) {
2897 free(arg_user);
2898 arg_user = settings->user;
2899 settings->user = NULL;
2900 }
2901
2902 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2903 uint64_t plus;
f757855e 2904
0e265674
LP
2905 plus = settings->capability;
2906 if (settings_private_network(settings))
2907 plus |= (1ULL << CAP_NET_ADMIN);
2908
2909 if (!arg_settings_trusted && plus != 0) {
2910 if (settings->capability != 0)
2911 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2912 } else
2913 arg_retain |= plus;
f757855e
LP
2914
2915 arg_retain &= ~settings->drop_capability;
2916 }
2917
2918 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2919 settings->kill_signal > 0)
2920 arg_kill_signal = settings->kill_signal;
2921
2922 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2923 settings->personality != PERSONALITY_INVALID)
2924 arg_personality = settings->personality;
2925
2926 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2927 !sd_id128_is_null(settings->machine_id)) {
2928
2929 if (!arg_settings_trusted)
2930 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2931 else
2932 arg_uuid = settings->machine_id;
2933 }
2934
2935 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2936 settings->read_only >= 0)
2937 arg_read_only = settings->read_only;
2938
2939 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2940 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2941 arg_volatile_mode = settings->volatile_mode;
2942
2943 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2944 settings->n_custom_mounts > 0) {
2945
2946 if (!arg_settings_trusted)
2947 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2948 else {
2949 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2950 arg_custom_mounts = settings->custom_mounts;
2951 arg_n_custom_mounts = settings->n_custom_mounts;
2952
2953 settings->custom_mounts = NULL;
2954 settings->n_custom_mounts = 0;
2955 }
2956 }
2957
2958 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2959 (settings->private_network >= 0 ||
2960 settings->network_veth >= 0 ||
2961 settings->network_bridge ||
2962 settings->network_interfaces ||
2963 settings->network_macvlan ||
2964 settings->network_ipvlan)) {
2965
2966 if (!arg_settings_trusted)
2967 log_warning("Ignoring network settings, file %s is not trusted.", p);
2968 else {
0e265674
LP
2969 arg_network_veth = settings_private_network(settings);
2970 arg_private_network = settings_private_network(settings);
2971
f757855e
LP
2972 strv_free(arg_network_interfaces);
2973 arg_network_interfaces = settings->network_interfaces;
2974 settings->network_interfaces = NULL;
2975
2976 strv_free(arg_network_macvlan);
2977 arg_network_macvlan = settings->network_macvlan;
2978 settings->network_macvlan = NULL;
2979
2980 strv_free(arg_network_ipvlan);
2981 arg_network_ipvlan = settings->network_ipvlan;
2982 settings->network_ipvlan = NULL;
2983
2984 free(arg_network_bridge);
2985 arg_network_bridge = settings->network_bridge;
2986 settings->network_bridge = NULL;
f757855e
LP
2987 }
2988 }
2989
2990 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2991 settings->expose_ports) {
2992
2993 if (!arg_settings_trusted)
2994 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2995 else {
2996 expose_port_free_all(arg_expose_ports);
2997 arg_expose_ports = settings->expose_ports;
2998 settings->expose_ports = NULL;
2999 }
3000 }
3001
3002 return 0;
3003}
3004
03cfe0d5
LP
3005int main(int argc, char *argv[]) {
3006
3007 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3008 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3009 _cleanup_close_ int master = -1, image_fd = -1;
3010 _cleanup_fdset_free_ FDSet *fds = NULL;
3011 int r, n_fd_passed, loop_nr = -1;
3012 char veth_name[IFNAMSIZ];
3013 bool secondary = false, remove_subvol = false;
72c0a2c2 3014 sigset_t mask_chld;
03cfe0d5
LP
3015 pid_t pid = 0;
3016 int ret = EXIT_SUCCESS;
3017 union in_addr_union exposed = {};
3018 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3019 bool interactive;
3020
3021 log_parse_environment();
3022 log_open();
3023
3024 r = parse_argv(argc, argv);
3025 if (r <= 0)
3026 goto finish;
3027
03cfe0d5
LP
3028 if (geteuid() != 0) {
3029 log_error("Need to be root.");
3030 r = -EPERM;
3031 goto finish;
3032 }
f757855e
LP
3033 r = determine_names();
3034 if (r < 0)
3035 goto finish;
3036
3037 r = load_settings();
3038 if (r < 0)
3039 goto finish;
3040
3041 r = verify_arguments();
3042 if (r < 0)
3043 goto finish;
03cfe0d5
LP
3044
3045 n_fd_passed = sd_listen_fds(false);
3046 if (n_fd_passed > 0) {
3047 r = fdset_new_listen_fds(&fds, false);
3048 if (r < 0) {
3049 log_error_errno(r, "Failed to collect file descriptors: %m");
3050 goto finish;
3051 }
3052 }
3053
3054 if (arg_directory) {
3055 assert(!arg_image);
3056
3057 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3058 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3059 r = -EINVAL;
3060 goto finish;
3061 }
3062
3063 if (arg_ephemeral) {
3064 _cleanup_free_ char *np = NULL;
3065
3066 /* If the specified path is a mount point we
3067 * generate the new snapshot immediately
3068 * inside it under a random name. However if
3069 * the specified is not a mount point we
3070 * create the new snapshot in the parent
3071 * directory, just next to it. */
e26d6ce5 3072 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3073 if (r < 0) {
3074 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3075 goto finish;
3076 }
3077 if (r > 0)
770b5ce4 3078 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3079 else
770b5ce4 3080 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3081 if (r < 0) {
3082 log_error_errno(r, "Failed to generate name for snapshot: %m");
3083 goto finish;
3084 }
3085
3086 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3087 if (r < 0) {
3088 log_error_errno(r, "Failed to lock %s: %m", np);
3089 goto finish;
3090 }
3091
5bcd08db 3092 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3093 if (r < 0) {
3094 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3095 goto finish;
ec16945e
LP
3096 }
3097
3098 free(arg_directory);
3099 arg_directory = np;
8a16a7b4 3100 np = NULL;
ec16945e
LP
3101
3102 remove_subvol = true;
30535c16
LP
3103
3104 } else {
3105 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3106 if (r == -EBUSY) {
3107 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3108 goto finish;
3109 }
3110 if (r < 0) {
3111 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3112 return r;
3113 }
3114
3115 if (arg_template) {
5bcd08db 3116 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3117 if (r == -EEXIST) {
3118 if (!arg_quiet)
3119 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3120 } else if (r < 0) {
83521414 3121 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3122 goto finish;
3123 } else {
3124 if (!arg_quiet)
3125 log_info("Populated %s from template %s.", arg_directory, arg_template);
3126 }
3127 }
ec16945e
LP
3128 }
3129
1b9e5b12
LP
3130 if (arg_boot) {
3131 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3132 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3133 r = -EINVAL;
1b9e5b12
LP
3134 goto finish;
3135 }
3136 } else {
3137 const char *p;
3138
16fb773e
LP
3139 p = strjoina(arg_directory, "/usr/");
3140 if (laccess(p, F_OK) < 0) {
3141 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3142 r = -EINVAL;
1b9e5b12 3143 goto finish;
1b9e5b12
LP
3144 }
3145 }
ec16945e 3146
6b9132a9 3147 } else {
1b9e5b12 3148 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3149
ec16945e
LP
3150 assert(arg_image);
3151 assert(!arg_template);
3152
30535c16
LP
3153 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3154 if (r == -EBUSY) {
3155 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3156 goto finish;
3157 }
3158 if (r < 0) {
3159 r = log_error_errno(r, "Failed to create image lock: %m");
3160 goto finish;
3161 }
3162
1b9e5b12 3163 if (!mkdtemp(template)) {
56f64d95 3164 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3165 r = -errno;
6b9132a9 3166 goto finish;
1b9e5b12 3167 }
6b9132a9 3168
1b9e5b12
LP
3169 arg_directory = strdup(template);
3170 if (!arg_directory) {
3171 r = log_oom();
3172 goto finish;
6b9132a9 3173 }
88213476 3174
1b9e5b12
LP
3175 image_fd = setup_image(&device_path, &loop_nr);
3176 if (image_fd < 0) {
3177 r = image_fd;
842f3b0f
LP
3178 goto finish;
3179 }
1b9e5b12 3180
4d9f07b4
LP
3181 r = dissect_image(image_fd,
3182 &root_device, &root_device_rw,
3183 &home_device, &home_device_rw,
3184 &srv_device, &srv_device_rw,
3185 &secondary);
1b9e5b12
LP
3186 if (r < 0)
3187 goto finish;
842f3b0f 3188 }
842f3b0f 3189
5a8af538
LP
3190 r = custom_mounts_prepare();
3191 if (r < 0)
3192 goto finish;
3193
03cfe0d5
LP
3194 interactive =
3195 isatty(STDIN_FILENO) > 0 &&
3196 isatty(STDOUT_FILENO) > 0;
9c857b9d 3197
db7feb7e
LP
3198 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3199 if (master < 0) {
ec16945e 3200 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3201 goto finish;
3202 }
3203
611b312b
LP
3204 r = ptsname_malloc(master, &console);
3205 if (r < 0) {
3206 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3207 goto finish;
3208 }
3209
a258bf26 3210 if (unlockpt(master) < 0) {
ec16945e 3211 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3212 goto finish;
3213 }
3214
9c857b9d
LP
3215 if (!arg_quiet)
3216 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3217 arg_machine, arg_image ?: arg_directory);
3218
72c0a2c2 3219 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3220
023fb90b
LP
3221 assert_se(sigemptyset(&mask_chld) == 0);
3222 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3223
03cfe0d5
LP
3224 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3225 r = log_error_errno(errno, "Failed to become subreaper: %m");
3226 goto finish;
3227 }
3228
d87be9b0 3229 for (;;) {
97044145 3230 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3231 ContainerStatus container_status;
7566e267 3232 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3233 static const struct sigaction sa = {
189d5bac 3234 .sa_handler = nop_signal_handler,
e866af3a
DH
3235 .sa_flags = SA_NOCLDSTOP,
3236 };
03cfe0d5
LP
3237 int ifi = 0;
3238 ssize_t l;
dbb60d69
LP
3239 _cleanup_event_unref_ sd_event *event = NULL;
3240 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3241 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3242 char last_char = 0;
e866af3a 3243
7566e267 3244 r = barrier_create(&barrier);
a2da110b 3245 if (r < 0) {
da927ba9 3246 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3247 goto finish;
3248 }
3249
4610de50 3250 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3251 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3252 goto finish;
3253 }
3254
4610de50 3255 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3256 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3257 goto finish;
3258 }
3259
4610de50 3260 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3261 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3262 goto finish;
3263 }
3264
825d5287 3265 if (arg_userns)
4610de50 3266 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3267 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3268 goto finish;
3269 }
3270
e866af3a
DH
3271 /* Child can be killed before execv(), so handle SIGCHLD
3272 * in order to interrupt parent's blocking calls and
3273 * give it a chance to call wait() and terminate. */
3274 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3275 if (r < 0) {
ec16945e 3276 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3277 goto finish;
3278 }
3279
e866af3a
DH
3280 r = sigaction(SIGCHLD, &sa, NULL);
3281 if (r < 0) {
ec16945e 3282 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3283 goto finish;
3284 }
3285
03cfe0d5 3286 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3287 if (pid < 0) {
3288 if (errno == EINVAL)
ec16945e 3289 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3290 else
ec16945e 3291 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3292
d87be9b0
LP
3293 goto finish;
3294 }
a258bf26 3295
d87be9b0 3296 if (pid == 0) {
03cfe0d5 3297 /* The outer child only has a file system namespace. */
a2da110b
DH
3298 barrier_set_role(&barrier, BARRIER_CHILD);
3299
03e334a1 3300 master = safe_close(master);
a258bf26 3301
03e334a1 3302 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3303 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3304 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3305 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3306
ce30c8dc
LP
3307 (void) reset_all_signal_handlers();
3308 (void) reset_signal_mask();
f5c1b9ee 3309
03cfe0d5
LP
3310 r = outer_child(&barrier,
3311 arg_directory,
3312 console,
3313 root_device, root_device_rw,
3314 home_device, home_device_rw,
3315 srv_device, srv_device_rw,
3316 interactive,
3317 secondary,
3318 pid_socket_pair[1],
3319 kmsg_socket_pair[1],
3320 rtnl_socket_pair[1],
825d5287 3321 uid_shift_socket_pair[1],
f757855e 3322 fds);
0cb9fbcd 3323 if (r < 0)
a2da110b 3324 _exit(EXIT_FAILURE);
d87be9b0 3325
03cfe0d5 3326 _exit(EXIT_SUCCESS);
da5b3bad 3327 }
88213476 3328
a2da110b 3329 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3330
2feceb5e 3331 fds = fdset_free(fds);
842f3b0f 3332
6d0b55c2
LP
3333 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3334 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3335 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3336 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3337
03cfe0d5
LP
3338 /* Wait for the outer child. */
3339 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3340 if (r < 0)
3341 goto finish;
3342 if (r != 0) {
3343 r = -EIO;
3344 goto finish;
3345 }
3346 pid = 0;
6dac160c 3347
03cfe0d5
LP
3348 /* And now retrieve the PID of the inner child. */
3349 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3350 if (l < 0) {
3351 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3352 goto finish;
3353 }
3354 if (l != sizeof(pid)) {
76d44882 3355 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3356 r = EIO;
3357 goto finish;
3358 }
354bfd2b 3359
03cfe0d5 3360 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3361
03cfe0d5
LP
3362 if (arg_userns) {
3363 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3364 log_error("Child died too early.");
3365 r = -ESRCH;
840295fc 3366 goto finish;
03cfe0d5 3367 }
ab046dde 3368
825d5287
RM
3369 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3370 if (l < 0) {
3371 r = log_error_errno(errno, "Failed to read UID shift: %m");
3372 goto finish;
3373 }
3374 if (l != sizeof(arg_uid_shift)) {
76d44882 3375 log_error("Short read while reading UID shift.");
825d5287
RM
3376 r = EIO;
3377 goto finish;
3378 }
3379
03cfe0d5 3380 r = setup_uid_map(pid);
840295fc
LP
3381 if (r < 0)
3382 goto finish;
ab046dde 3383
03cfe0d5
LP
3384 (void) barrier_place(&barrier); /* #2 */
3385 }
c74e630d 3386
9a2a5625 3387 if (arg_private_network) {
4bbfe7ad 3388
9a2a5625
LP
3389 r = move_network_interfaces(pid, arg_network_interfaces);
3390 if (r < 0)
3391 goto finish;
5aa4bb6b 3392
9a2a5625
LP
3393 if (arg_network_veth) {
3394 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3395 if (r < 0)
3396 goto finish;
3397 else if (r > 0)
3398 ifi = r;
6dac160c 3399
9a2a5625
LP
3400 if (arg_network_bridge) {
3401 r = setup_bridge(veth_name, arg_network_bridge);
3402 if (r < 0)
3403 goto finish;
3404 if (r > 0)
3405 ifi = r;
3406 }
3407 }
6dac160c 3408
9a2a5625
LP
3409 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3410 if (r < 0)
3411 goto finish;
3412
3413 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3414 if (r < 0)
3415 goto finish;
3416 }
6dac160c 3417
b7103bc5
LP
3418 if (arg_register) {
3419 r = register_machine(
3420 arg_machine,
3421 pid,
3422 arg_directory,
3423 arg_uuid,
3424 ifi,
3425 arg_slice,
3426 arg_custom_mounts, arg_n_custom_mounts,
3427 arg_kill_signal,
3428 arg_property,
3429 arg_keep_unit);
3430 if (r < 0)
3431 goto finish;
3432 }
6dac160c 3433
34829a32 3434 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3435 if (r < 0)
3436 goto finish;
3437
34829a32
LP
3438 if (arg_keep_unit) {
3439 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3440 if (r < 0)
3441 goto finish;
3442 }
efdb0237 3443
34829a32 3444 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3445 if (r < 0)
3446 goto finish;
6dac160c 3447
03cfe0d5
LP
3448 /* Notify the child that the parent is ready with all
3449 * its setup (including cgroup-ification), and that
3450 * the child can now hand over control to the code to
3451 * run inside the container. */
3452 (void) barrier_place(&barrier); /* #3 */
6dac160c 3453
03cfe0d5
LP
3454 /* Block SIGCHLD here, before notifying child.
3455 * process_pty() will handle it with the other signals. */
3456 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3457
03cfe0d5
LP
3458 /* Reset signal to default */
3459 r = default_signals(SIGCHLD, -1);
3460 if (r < 0) {
3461 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3462 goto finish;
3463 }
e866af3a 3464
03cfe0d5 3465 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3466 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3467 log_error("Child died too early.");
03cfe0d5
LP
3468 r = -ESRCH;
3469 goto finish;
3470 }
b12afc8c 3471
03cfe0d5
LP
3472 sd_notifyf(false,
3473 "READY=1\n"
3474 "STATUS=Container running.\n"
3475 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3476
03cfe0d5
LP
3477 r = sd_event_new(&event);
3478 if (r < 0) {
3479 log_error_errno(r, "Failed to get default event source: %m");
3480 goto finish;
3481 }
88213476 3482
03cfe0d5
LP
3483 if (arg_kill_signal > 0) {
3484 /* Try to kill the init system on SIGINT or SIGTERM */
3485 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3486 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3487 } else {
3488 /* Immediately exit */
3489 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3490 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3491 }
023fb90b 3492
03cfe0d5
LP
3493 /* simply exit on sigchld */
3494 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3495
03cfe0d5 3496 if (arg_expose_ports) {
7a8f6325 3497 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3498 if (r < 0)
3499 goto finish;
023fb90b 3500
7a8f6325 3501 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3502 }
023fb90b 3503
03cfe0d5 3504 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3505
ae3dde80 3506 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3507 if (r < 0) {
3508 log_error_errno(r, "Failed to create PTY forwarder: %m");
3509 goto finish;
3510 }
023fb90b 3511
03cfe0d5
LP
3512 r = sd_event_loop(event);
3513 if (r < 0) {
3514 log_error_errno(r, "Failed to run event loop: %m");
3515 goto finish;
3516 }
6d0b55c2 3517
03cfe0d5 3518 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3519
03cfe0d5 3520 forward = pty_forward_free(forward);
6d0b55c2 3521
03cfe0d5
LP
3522 if (!arg_quiet && last_char != '\n')
3523 putc('\n', stdout);
04d39279 3524
03cfe0d5 3525 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3526 if (arg_register && !arg_keep_unit)
3527 terminate_machine(pid);
1f0cd86b 3528
840295fc 3529 /* Normally redundant, but better safe than sorry */
04d39279 3530 kill(pid, SIGKILL);
a258bf26 3531
113cea80 3532 r = wait_for_container(pid, &container_status);
04d39279
LP
3533 pid = 0;
3534
ec16945e 3535 if (r < 0)
ce9f1527
LP
3536 /* We failed to wait for the container, or the
3537 * container exited abnormally */
ec16945e
LP
3538 goto finish;
3539 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3540 /* The container exited with a non-zero
3541 * status, or with zero status and no reboot
3542 * was requested. */
ec16945e 3543 ret = r;
d87be9b0 3544 break;
ec16945e 3545 }
88213476 3546
113cea80 3547 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3548
3549 if (arg_keep_unit) {
3550 /* Special handling if we are running as a
3551 * service: instead of simply restarting the
3552 * machine we want to restart the entire
3553 * service, so let's inform systemd about this
3554 * with the special exit code 133. The service
3555 * file uses RestartForceExitStatus=133 so
3556 * that this results in a full nspawn
3557 * restart. This is necessary since we might
3558 * have cgroup parameters set we want to have
3559 * flushed out. */
ec16945e
LP
3560 ret = 133;
3561 r = 0;
ce38dbc8
LP
3562 break;
3563 }
6d0b55c2 3564
7a8f6325 3565 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3566 }
88213476
LP
3567
3568finish:
af4ec430
LP
3569 sd_notify(false,
3570 "STOPPING=1\n"
3571 "STATUS=Terminating...");
3572
9444b1f2
LP
3573 if (pid > 0)
3574 kill(pid, SIGKILL);
88213476 3575
503546da
LP
3576 /* Try to flush whatever is still queued in the pty */
3577 if (master >= 0)
59f448cf 3578 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3579
03cfe0d5
LP
3580 loop_remove(loop_nr, &image_fd);
3581
ec16945e
LP
3582 if (remove_subvol && arg_directory) {
3583 int k;
3584
5bcd08db 3585 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3586 if (k < 0)
3587 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3588 }
3589
785890ac
LP
3590 if (arg_machine) {
3591 const char *p;
3592
63c372cb 3593 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3594 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3595 }
3596
7a8f6325 3597 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3598
04d391da 3599 free(arg_directory);
ec16945e
LP
3600 free(arg_template);
3601 free(arg_image);
7027ff61 3602 free(arg_machine);
c74e630d
LP
3603 free(arg_user);
3604 strv_free(arg_setenv);
f757855e 3605 free(arg_network_bridge);
c74e630d
LP
3606 strv_free(arg_network_interfaces);
3607 strv_free(arg_network_macvlan);
4bbfe7ad 3608 strv_free(arg_network_ipvlan);
f757855e
LP
3609 strv_free(arg_parameters);
3610 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3611 expose_port_free_all(arg_expose_ports);
6d0b55c2 3612
ec16945e 3613 return r < 0 ? EXIT_FAILURE : ret;
88213476 3614}