]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
tree-wide: minor formatting inconsistency cleanups
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
19
8fe0087e
LP
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
88213476 23#include <errno.h>
88213476 24#include <getopt.h>
1b9e5b12 25#include <linux/loop.h>
8fe0087e 26#include <sched.h>
24fb1112
LP
27#ifdef HAVE_SECCOMP
28#include <seccomp.h>
29#endif
8fe0087e
LP
30#ifdef HAVE_SELINUX
31#include <selinux/selinux.h>
1b9e5b12 32#endif
8fe0087e
LP
33#include <signal.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <sys/file.h>
38#include <sys/mount.h>
39#include <sys/personality.h>
40#include <sys/prctl.h>
41#include <sys/types.h>
42#include <unistd.h>
1b9e5b12 43
1f0cd86b 44#include "sd-daemon.h"
1f0cd86b 45#include "sd-id128.h"
8fe0087e 46
b5efdb8a 47#include "alloc-util.h"
8fe0087e
LP
48#include "barrier.h"
49#include "base-filesystem.h"
50#include "blkid-util.h"
51#include "btrfs-util.h"
8fe0087e 52#include "cap-list.h"
430f0182 53#include "capability-util.h"
04d391da 54#include "cgroup-util.h"
8fe0087e 55#include "copy.h"
4fc9982c 56#include "dev-setup.h"
8fe0087e 57#include "env-util.h"
3ffd4af2 58#include "fd-util.h"
842f3b0f 59#include "fdset.h"
a5c32cff 60#include "fileio.h"
8fe0087e 61#include "formats-util.h"
f4f15635 62#include "fs-util.h"
1b9e5b12 63#include "gpt.h"
8fe0087e
LP
64#include "hostname-util.h"
65#include "log.h"
66#include "loopback-setup.h"
1b9cebf6 67#include "machine-image.h"
8fe0087e
LP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
4349cd7c 71#include "mount-util.h"
8fe0087e 72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
77#include "nspawn-register.h"
78#include "nspawn-settings.h"
79#include "nspawn-setuid.h"
7732f92b 80#include "nspawn-stub-pid1.h"
6bedfcbb 81#include "parse-util.h"
8fe0087e 82#include "path-util.h"
0b452006 83#include "process-util.h"
8fe0087e
LP
84#include "ptyfwd.h"
85#include "random-util.h"
86#include "rm-rf.h"
e9642be2
LP
87#ifdef HAVE_SECCOMP
88#include "seccomp-util.h"
89#endif
8fe0087e 90#include "signal-util.h"
2583fbea 91#include "socket-util.h"
8fcde012 92#include "stat-util.h"
15a5e950 93#include "stdio-util.h"
07630cea 94#include "string-util.h"
8fe0087e
LP
95#include "strv.h"
96#include "terminal-util.h"
97#include "udev-util.h"
affb60b1 98#include "umask-util.h"
b1d4f8e1 99#include "user-util.h"
8fe0087e 100#include "util.h"
e9642be2 101
113cea80
DH
/* Two-valued status for a container; judging by the enumerator names it
 * distinguishes a plain termination from a reboot request. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* first enumerator, value 0 */
        CONTAINER_REBOOTED      /* second enumerator, value 1 */
} ContainerStatus;
106
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-"
 * variants of the command line option map onto LINK_GUEST/LINK_HOST plus the
 * separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
88213476
LP
113
114static char *arg_directory = NULL;
ec16945e 115static char *arg_template = NULL;
5f932eb9 116static char *arg_chdir = NULL;
687d0825 117static char *arg_user = NULL;
9444b1f2 118static sd_id128_t arg_uuid = {};
7027ff61 119static char *arg_machine = NULL;
c74e630d
LP
120static const char *arg_selinux_context = NULL;
121static const char *arg_selinux_apifs_context = NULL;
9444b1f2 122static const char *arg_slice = NULL;
ff01d048 123static bool arg_private_network = false;
bc2f673e 124static bool arg_read_only = false;
7732f92b 125static StartMode arg_start_mode = START_PID1;
ec16945e 126static bool arg_ephemeral = false;
57fb9fb5 127static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 128static bool arg_link_journal_try = false;
5076f0cc
LP
129static uint64_t arg_retain =
130 (1ULL << CAP_CHOWN) |
131 (1ULL << CAP_DAC_OVERRIDE) |
132 (1ULL << CAP_DAC_READ_SEARCH) |
133 (1ULL << CAP_FOWNER) |
134 (1ULL << CAP_FSETID) |
135 (1ULL << CAP_IPC_OWNER) |
136 (1ULL << CAP_KILL) |
137 (1ULL << CAP_LEASE) |
138 (1ULL << CAP_LINUX_IMMUTABLE) |
139 (1ULL << CAP_NET_BIND_SERVICE) |
140 (1ULL << CAP_NET_BROADCAST) |
141 (1ULL << CAP_NET_RAW) |
142 (1ULL << CAP_SETGID) |
143 (1ULL << CAP_SETFCAP) |
144 (1ULL << CAP_SETPCAP) |
145 (1ULL << CAP_SETUID) |
146 (1ULL << CAP_SYS_ADMIN) |
147 (1ULL << CAP_SYS_CHROOT) |
148 (1ULL << CAP_SYS_NICE) |
149 (1ULL << CAP_SYS_PTRACE) |
150 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 151 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
152 (1ULL << CAP_SYS_BOOT) |
153 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
154 (1ULL << CAP_AUDIT_CONTROL) |
155 (1ULL << CAP_MKNOD);
5a8af538
LP
156static CustomMount *arg_custom_mounts = NULL;
157static unsigned arg_n_custom_mounts = 0;
f4889f65 158static char **arg_setenv = NULL;
284c0b91 159static bool arg_quiet = false;
8a96d94e 160static bool arg_share_system = false;
eb91eb18 161static bool arg_register = true;
89f7c846 162static bool arg_keep_unit = false;
aa28aefe 163static char **arg_network_interfaces = NULL;
c74e630d 164static char **arg_network_macvlan = NULL;
4bbfe7ad 165static char **arg_network_ipvlan = NULL;
69c79d3c 166static bool arg_network_veth = false;
f6d6bad1 167static char **arg_network_veth_extra = NULL;
f757855e 168static char *arg_network_bridge = NULL;
050f7277 169static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 170static char *arg_image = NULL;
f757855e 171static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 172static ExposePort *arg_expose_ports = NULL;
f36933fe 173static char **arg_property = NULL;
6dac160c
LP
174static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
175static bool arg_userns = false;
c6c8f6e2 176static int arg_kill_signal = 0;
efdb0237 177static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
178static SettingsMask arg_settings_mask = 0;
179static int arg_settings_trusted = -1;
180static char **arg_parameters = NULL;
6aadfa4c 181static const char *arg_container_service_name = "systemd-nspawn";
88213476 182
601185b4 183static void help(void) {
88213476
LP
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
186 " -h --help Show this help\n"
187 " --version Print version string\n"
69c79d3c 188 " -q --quiet Do not show status information\n"
1b9e5b12 189 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
7732f92b 195 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
a8828ed9 196 " -b --boot Boot up full system (i.e. invoke init)\n"
5f932eb9 197 " --chdir=PATH Set working directory in the container\n"
a8828ed9 198 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 199 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 200 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 201 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 202 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
203 " --private-users[=UIDBASE[:NUIDS]]\n"
204 " Run within user namespace\n"
69c79d3c
LP
205 " --private-network Disable network in container\n"
206 " --network-interface=INTERFACE\n"
207 " Assign an existing network interface to the\n"
208 " container\n"
c74e630d
LP
209 " --network-macvlan=INTERFACE\n"
210 " Create a macvlan network interface based on an\n"
211 " existing network interface to the container\n"
4bbfe7ad
TG
212 " --network-ipvlan=INTERFACE\n"
213 " Create a ipvlan network interface based on an\n"
214 " existing network interface to the container\n"
a8eaaee7 215 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 216 " and container\n"
f6d6bad1
LP
217 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
218 " Add an additional virtual Ethernet link between\n"
219 " host and container\n"
ab046dde 220 " --network-bridge=INTERFACE\n"
a8eaaee7 221 " Add a virtual Ethernet connection between host\n"
ab046dde
TG
222 " and container and add it to an existing bridge on\n"
223 " the host\n"
6d0b55c2 224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 225 " Expose a container IP port on the host\n"
82adf6af
LP
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
a8828ed9
DW
232 " --capability=CAP In addition to the default, retain specified\n"
233 " capability\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 235 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
2b26a728
LP
236 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
237 " host, try-guest, try-host\n"
574edc90 238 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 239 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
240 " --bind=PATH[:PATH[:OPTIONS]]\n"
241 " Bind mount a file or directory from the host into\n"
a8828ed9 242 " the container\n"
5e5bfa6e
EY
243 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
244 " Similar, but creates a read-only bind mount\n"
06c17c39 245 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
246 " --overlay=PATH[:PATH...]:PATH\n"
247 " Create an overlay mount from the host to \n"
248 " the container\n"
249 " --overlay-ro=PATH[:PATH...]:PATH\n"
250 " Similar, but creates a read-only overlay mount\n"
284c0b91 251 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 252 " --share-system Share system namespaces with host\n"
eb91eb18 253 " --register=BOOLEAN Register container as machine\n"
89f7c846 254 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 255 " the service unit nspawn is running in\n"
6d0b55c2 256 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 257 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 258 , program_invocation_short_name);
88213476
LP
259}
260
5a8af538
LP
261
262static int custom_mounts_prepare(void) {
263 unsigned i;
264 int r;
265
266 /* Ensure the mounts are applied prefix first. */
267 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
268
269 /* Allocate working directories for the overlay file systems that need it */
270 for (i = 0; i < arg_n_custom_mounts; i++) {
271 CustomMount *m = &arg_custom_mounts[i];
272
825d5287
RM
273 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
274 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
275 return -EINVAL;
276 }
277
5a8af538
LP
278 if (m->type != CUSTOM_MOUNT_OVERLAY)
279 continue;
280
281 if (m->work_dir)
282 continue;
283
284 if (m->read_only)
285 continue;
286
14bcf25c 287 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
288 if (r < 0)
289 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
290 }
291
292 return 0;
293}
294
efdb0237
LP
295static int detect_unified_cgroup_hierarchy(void) {
296 const char *e;
297 int r;
298
299 /* Allow the user to control whether the unified hierarchy is used */
300 e = getenv("UNIFIED_CGROUP_HIERARCHY");
301 if (e) {
302 r = parse_boolean(e);
303 if (r < 0)
304 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
305
306 arg_unified_cgroup_hierarchy = r;
307 return 0;
308 }
309
310 /* Otherwise inherit the default from the host system */
311 r = cg_unified();
312 if (r < 0)
313 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
314
315 arg_unified_cgroup_hierarchy = r;
316 return 0;
317}
318
88213476
LP
319static int parse_argv(int argc, char *argv[]) {
320
a41fe3a2 321 enum {
acbeb427
ZJS
322 ARG_VERSION = 0x100,
323 ARG_PRIVATE_NETWORK,
bc2f673e 324 ARG_UUID,
5076f0cc 325 ARG_READ_ONLY,
57fb9fb5 326 ARG_CAPABILITY,
420c7379 327 ARG_DROP_CAPABILITY,
17fe0523
LP
328 ARG_LINK_JOURNAL,
329 ARG_BIND,
f4889f65 330 ARG_BIND_RO,
06c17c39 331 ARG_TMPFS,
5a8af538
LP
332 ARG_OVERLAY,
333 ARG_OVERLAY_RO,
f4889f65 334 ARG_SETENV,
eb91eb18 335 ARG_SHARE_SYSTEM,
89f7c846 336 ARG_REGISTER,
aa28aefe 337 ARG_KEEP_UNIT,
69c79d3c 338 ARG_NETWORK_INTERFACE,
c74e630d 339 ARG_NETWORK_MACVLAN,
4bbfe7ad 340 ARG_NETWORK_IPVLAN,
ab046dde 341 ARG_NETWORK_BRIDGE,
f6d6bad1 342 ARG_NETWORK_VETH_EXTRA,
6afc95b7 343 ARG_PERSONALITY,
4d9f07b4 344 ARG_VOLATILE,
ec16945e 345 ARG_TEMPLATE,
f36933fe 346 ARG_PROPERTY,
6dac160c 347 ARG_PRIVATE_USERS,
c6c8f6e2 348 ARG_KILL_SIGNAL,
f757855e 349 ARG_SETTINGS,
5f932eb9 350 ARG_CHDIR,
a41fe3a2
LP
351 };
352
88213476 353 static const struct option options[] = {
aa28aefe
LP
354 { "help", no_argument, NULL, 'h' },
355 { "version", no_argument, NULL, ARG_VERSION },
356 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
357 { "template", required_argument, NULL, ARG_TEMPLATE },
358 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
359 { "user", required_argument, NULL, 'u' },
360 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
7732f92b 361 { "as-pid2", no_argument, NULL, 'a' },
aa28aefe
LP
362 { "boot", no_argument, NULL, 'b' },
363 { "uuid", required_argument, NULL, ARG_UUID },
364 { "read-only", no_argument, NULL, ARG_READ_ONLY },
365 { "capability", required_argument, NULL, ARG_CAPABILITY },
366 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
367 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
368 { "bind", required_argument, NULL, ARG_BIND },
369 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 370 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
371 { "overlay", required_argument, NULL, ARG_OVERLAY },
372 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
373 { "machine", required_argument, NULL, 'M' },
374 { "slice", required_argument, NULL, 'S' },
375 { "setenv", required_argument, NULL, ARG_SETENV },
376 { "selinux-context", required_argument, NULL, 'Z' },
377 { "selinux-apifs-context", required_argument, NULL, 'L' },
378 { "quiet", no_argument, NULL, 'q' },
379 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
380 { "register", required_argument, NULL, ARG_REGISTER },
381 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
382 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 383 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 384 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 385 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 386 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 387 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 388 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 389 { "image", required_argument, NULL, 'i' },
4d9f07b4 390 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 391 { "port", required_argument, NULL, 'p' },
f36933fe 392 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 393 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 394 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 395 { "settings", required_argument, NULL, ARG_SETTINGS },
5f932eb9 396 { "chdir", required_argument, NULL, ARG_CHDIR },
eb9da376 397 {}
88213476
LP
398 };
399
9444b1f2 400 int c, r;
6aadfa4c 401 const char *p, *e;
a42c8b54 402 uint64_t plus = 0, minus = 0;
f757855e 403 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
404
405 assert(argc >= 0);
406 assert(argv);
407
7732f92b 408 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
409
410 switch (c) {
411
412 case 'h':
601185b4
ZJS
413 help();
414 return 0;
88213476 415
acbeb427 416 case ARG_VERSION:
3f6fd1ba 417 return version();
acbeb427 418
88213476 419 case 'D':
0f03c2a4 420 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 421 if (r < 0)
0f03c2a4 422 return r;
ec16945e
LP
423 break;
424
425 case ARG_TEMPLATE:
0f03c2a4 426 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 427 if (r < 0)
0f03c2a4 428 return r;
88213476
LP
429 break;
430
1b9e5b12 431 case 'i':
0f03c2a4 432 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 433 if (r < 0)
0f03c2a4 434 return r;
ec16945e
LP
435 break;
436
437 case 'x':
438 arg_ephemeral = true;
1b9e5b12
LP
439 break;
440
687d0825 441 case 'u':
2fc09a9c
DM
442 r = free_and_strdup(&arg_user, optarg);
443 if (r < 0)
7027ff61 444 return log_oom();
687d0825 445
f757855e 446 arg_settings_mask |= SETTING_USER;
687d0825
MV
447 break;
448
ab046dde 449 case ARG_NETWORK_BRIDGE:
f757855e
LP
450 r = free_and_strdup(&arg_network_bridge, optarg);
451 if (r < 0)
452 return log_oom();
ab046dde
TG
453
454 /* fall through */
455
0dfaa006 456 case 'n':
69c79d3c
LP
457 arg_network_veth = true;
458 arg_private_network = true;
f757855e 459 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
460 break;
461
f6d6bad1
LP
462 case ARG_NETWORK_VETH_EXTRA:
463 r = veth_extra_parse(&arg_network_veth_extra, optarg);
464 if (r < 0)
465 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
466
467 arg_private_network = true;
468 arg_settings_mask |= SETTING_NETWORK;
469 break;
470
aa28aefe 471 case ARG_NETWORK_INTERFACE:
c74e630d
LP
472 if (strv_extend(&arg_network_interfaces, optarg) < 0)
473 return log_oom();
474
475 arg_private_network = true;
f757855e 476 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
477 break;
478
479 case ARG_NETWORK_MACVLAN:
480 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
481 return log_oom();
482
4bbfe7ad 483 arg_private_network = true;
f757855e 484 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
485 break;
486
487 case ARG_NETWORK_IPVLAN:
488 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
489 return log_oom();
490
aa28aefe
LP
491 /* fall through */
492
ff01d048
LP
493 case ARG_PRIVATE_NETWORK:
494 arg_private_network = true;
f757855e 495 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
496 break;
497
0f0dbc46 498 case 'b':
7732f92b
LP
499 if (arg_start_mode == START_PID2) {
500 log_error("--boot and --as-pid2 may not be combined.");
501 return -EINVAL;
502 }
503
504 arg_start_mode = START_BOOT;
505 arg_settings_mask |= SETTING_START_MODE;
506 break;
507
508 case 'a':
509 if (arg_start_mode == START_BOOT) {
510 log_error("--boot and --as-pid2 may not be combined.");
511 return -EINVAL;
512 }
513
514 arg_start_mode = START_PID2;
515 arg_settings_mask |= SETTING_START_MODE;
0f0dbc46
LP
516 break;
517
144f0fc0 518 case ARG_UUID:
9444b1f2
LP
519 r = sd_id128_from_string(optarg, &arg_uuid);
520 if (r < 0) {
aa96c6cb 521 log_error("Invalid UUID: %s", optarg);
9444b1f2 522 return r;
aa96c6cb 523 }
f757855e
LP
524
525 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 526 break;
aa96c6cb 527
9444b1f2 528 case 'S':
c74e630d 529 arg_slice = optarg;
144f0fc0
LP
530 break;
531
7027ff61 532 case 'M':
c1521918 533 if (isempty(optarg))
97b11eed 534 arg_machine = mfree(arg_machine);
c1521918 535 else {
0c3c4284 536 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
537 log_error("Invalid machine name: %s", optarg);
538 return -EINVAL;
539 }
7027ff61 540
0c3c4284
LP
541 r = free_and_strdup(&arg_machine, optarg);
542 if (r < 0)
eb91eb18
LP
543 return log_oom();
544
545 break;
546 }
7027ff61 547
82adf6af
LP
548 case 'Z':
549 arg_selinux_context = optarg;
a8828ed9
DW
550 break;
551
82adf6af
LP
552 case 'L':
553 arg_selinux_apifs_context = optarg;
a8828ed9
DW
554 break;
555
bc2f673e
LP
556 case ARG_READ_ONLY:
557 arg_read_only = true;
f757855e 558 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
559 break;
560
420c7379
LP
561 case ARG_CAPABILITY:
562 case ARG_DROP_CAPABILITY: {
6cbe4ed1 563 p = optarg;
9ed794a3 564 for (;;) {
6cbe4ed1 565 _cleanup_free_ char *t = NULL;
5076f0cc 566
6cbe4ed1
SS
567 r = extract_first_word(&p, &t, ",", 0);
568 if (r < 0)
569 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 570
6cbe4ed1
SS
571 if (r == 0)
572 break;
5076f0cc 573
39ed67d1
LP
574 if (streq(t, "all")) {
575 if (c == ARG_CAPABILITY)
a42c8b54 576 plus = (uint64_t) -1;
39ed67d1 577 else
a42c8b54 578 minus = (uint64_t) -1;
39ed67d1 579 } else {
2822da4f
LP
580 int cap;
581
582 cap = capability_from_name(t);
583 if (cap < 0) {
39ed67d1
LP
584 log_error("Failed to parse capability %s.", t);
585 return -EINVAL;
586 }
587
588 if (c == ARG_CAPABILITY)
a42c8b54 589 plus |= 1ULL << (uint64_t) cap;
39ed67d1 590 else
a42c8b54 591 minus |= 1ULL << (uint64_t) cap;
5076f0cc 592 }
5076f0cc
LP
593 }
594
f757855e 595 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
596 break;
597 }
598
57fb9fb5
LP
599 case 'j':
600 arg_link_journal = LINK_GUEST;
574edc90 601 arg_link_journal_try = true;
57fb9fb5
LP
602 break;
603
604 case ARG_LINK_JOURNAL:
53e438e3 605 if (streq(optarg, "auto")) {
57fb9fb5 606 arg_link_journal = LINK_AUTO;
53e438e3
LP
607 arg_link_journal_try = false;
608 } else if (streq(optarg, "no")) {
57fb9fb5 609 arg_link_journal = LINK_NO;
53e438e3
LP
610 arg_link_journal_try = false;
611 } else if (streq(optarg, "guest")) {
57fb9fb5 612 arg_link_journal = LINK_GUEST;
53e438e3
LP
613 arg_link_journal_try = false;
614 } else if (streq(optarg, "host")) {
57fb9fb5 615 arg_link_journal = LINK_HOST;
53e438e3
LP
616 arg_link_journal_try = false;
617 } else if (streq(optarg, "try-guest")) {
574edc90
MP
618 arg_link_journal = LINK_GUEST;
619 arg_link_journal_try = true;
620 } else if (streq(optarg, "try-host")) {
621 arg_link_journal = LINK_HOST;
622 arg_link_journal_try = true;
623 } else {
57fb9fb5
LP
624 log_error("Failed to parse link journal mode %s", optarg);
625 return -EINVAL;
626 }
627
628 break;
629
17fe0523 630 case ARG_BIND:
f757855e
LP
631 case ARG_BIND_RO:
632 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
633 if (r < 0)
634 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 635
f757855e 636 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 637 break;
06c17c39 638
f757855e
LP
639 case ARG_TMPFS:
640 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
641 if (r < 0)
642 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 643
f757855e 644 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 645 break;
5a8af538
LP
646
647 case ARG_OVERLAY:
648 case ARG_OVERLAY_RO: {
649 _cleanup_free_ char *upper = NULL, *destination = NULL;
650 _cleanup_strv_free_ char **lower = NULL;
651 CustomMount *m;
652 unsigned n = 0;
653 char **i;
654
62f9f39a
RM
655 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
656 if (r == -ENOMEM)
06c17c39 657 return log_oom();
62f9f39a
RM
658 else if (r < 0) {
659 log_error("Invalid overlay specification: %s", optarg);
660 return r;
661 }
06c17c39 662
5a8af538
LP
663 STRV_FOREACH(i, lower) {
664 if (!path_is_absolute(*i)) {
665 log_error("Overlay path %s is not absolute.", *i);
666 return -EINVAL;
667 }
668
669 n++;
670 }
671
672 if (n < 2) {
673 log_error("--overlay= needs at least two colon-separated directories specified.");
674 return -EINVAL;
675 }
676
677 if (n == 2) {
678 /* If two parameters are specified,
679 * the first one is the lower, the
680 * second one the upper directory. And
af86c440
ZJS
681 * we'll also define the destination
682 * mount point the same as the upper. */
5a8af538
LP
683 upper = lower[1];
684 lower[1] = NULL;
685
686 destination = strdup(upper);
687 if (!destination)
688 return log_oom();
689
690 } else {
691 upper = lower[n - 2];
692 destination = lower[n - 1];
693 lower[n - 2] = NULL;
694 }
695
f757855e 696 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
697 if (!m)
698 return log_oom();
699
700 m->destination = destination;
701 m->source = upper;
702 m->lower = lower;
703 m->read_only = c == ARG_OVERLAY_RO;
704
705 upper = destination = NULL;
706 lower = NULL;
06c17c39 707
f757855e 708 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
709 break;
710 }
711
f4889f65
LP
712 case ARG_SETENV: {
713 char **n;
714
715 if (!env_assignment_is_valid(optarg)) {
716 log_error("Environment variable assignment '%s' is not valid.", optarg);
717 return -EINVAL;
718 }
719
720 n = strv_env_set(arg_setenv, optarg);
721 if (!n)
722 return log_oom();
723
724 strv_free(arg_setenv);
725 arg_setenv = n;
f757855e
LP
726
727 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
728 break;
729 }
730
284c0b91
LP
731 case 'q':
732 arg_quiet = true;
733 break;
734
8a96d94e
LP
735 case ARG_SHARE_SYSTEM:
736 arg_share_system = true;
737 break;
738
eb91eb18
LP
739 case ARG_REGISTER:
740 r = parse_boolean(optarg);
741 if (r < 0) {
742 log_error("Failed to parse --register= argument: %s", optarg);
743 return r;
744 }
745
746 arg_register = r;
747 break;
748
89f7c846
LP
749 case ARG_KEEP_UNIT:
750 arg_keep_unit = true;
751 break;
752
6afc95b7
LP
753 case ARG_PERSONALITY:
754
ac45f971 755 arg_personality = personality_from_string(optarg);
050f7277 756 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
757 log_error("Unknown or unsupported personality '%s'.", optarg);
758 return -EINVAL;
759 }
760
f757855e 761 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
762 break;
763
4d9f07b4
LP
764 case ARG_VOLATILE:
765
766 if (!optarg)
f757855e 767 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 768 else {
f757855e 769 VolatileMode m;
4d9f07b4 770
f757855e
LP
771 m = volatile_mode_from_string(optarg);
772 if (m < 0) {
773 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 774 return -EINVAL;
f757855e
LP
775 } else
776 arg_volatile_mode = m;
6d0b55c2
LP
777 }
778
f757855e
LP
779 arg_settings_mask |= SETTING_VOLATILE_MODE;
780 break;
6d0b55c2 781
f757855e
LP
782 case 'p':
783 r = expose_port_parse(&arg_expose_ports, optarg);
784 if (r == -EEXIST)
785 return log_error_errno(r, "Duplicate port specification: %s", optarg);
786 if (r < 0)
787 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 788
f757855e 789 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 790 break;
6d0b55c2 791
f36933fe
LP
792 case ARG_PROPERTY:
793 if (strv_extend(&arg_property, optarg) < 0)
794 return log_oom();
795
796 break;
797
6dac160c
LP
798 case ARG_PRIVATE_USERS:
799 if (optarg) {
800 _cleanup_free_ char *buffer = NULL;
801 const char *range, *shift;
802
803 range = strchr(optarg, ':');
804 if (range) {
805 buffer = strndup(optarg, range - optarg);
806 if (!buffer)
807 return log_oom();
808 shift = buffer;
809
810 range++;
811 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
812 log_error("Failed to parse UID range: %s", range);
813 return -EINVAL;
814 }
815 } else
816 shift = optarg;
817
818 if (parse_uid(shift, &arg_uid_shift) < 0) {
819 log_error("Failed to parse UID: %s", optarg);
820 return -EINVAL;
821 }
822 }
823
824 arg_userns = true;
825 break;
826
c6c8f6e2
LP
827 case ARG_KILL_SIGNAL:
828 arg_kill_signal = signal_from_string_try_harder(optarg);
829 if (arg_kill_signal < 0) {
830 log_error("Cannot parse signal: %s", optarg);
831 return -EINVAL;
832 }
833
f757855e
LP
834 arg_settings_mask |= SETTING_KILL_SIGNAL;
835 break;
836
837 case ARG_SETTINGS:
838
839 /* no → do not read files
840 * yes → read files, do not override cmdline, trust only subset
841 * override → read files, override cmdline, trust only subset
842 * trusted → read files, do not override cmdline, trust all
843 */
844
845 r = parse_boolean(optarg);
846 if (r < 0) {
847 if (streq(optarg, "trusted")) {
848 mask_all_settings = false;
849 mask_no_settings = false;
850 arg_settings_trusted = true;
851
852 } else if (streq(optarg, "override")) {
853 mask_all_settings = false;
854 mask_no_settings = true;
855 arg_settings_trusted = -1;
856 } else
857 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
858 } else if (r > 0) {
859 /* yes */
860 mask_all_settings = false;
861 mask_no_settings = false;
862 arg_settings_trusted = -1;
863 } else {
864 /* no */
865 mask_all_settings = true;
866 mask_no_settings = false;
867 arg_settings_trusted = false;
868 }
869
c6c8f6e2
LP
870 break;
871
5f932eb9
LP
872 case ARG_CHDIR:
873 if (!path_is_absolute(optarg)) {
874 log_error("Working directory %s is not an absolute path.", optarg);
875 return -EINVAL;
876 }
877
878 r = free_and_strdup(&arg_chdir, optarg);
879 if (r < 0)
880 return log_oom();
881
882 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
883 break;
884
88213476
LP
885 case '?':
886 return -EINVAL;
887
888 default:
eb9da376 889 assert_not_reached("Unhandled option");
88213476 890 }
88213476 891
eb91eb18
LP
892 if (arg_share_system)
893 arg_register = false;
894
7732f92b 895 if (arg_start_mode != START_PID1 && arg_share_system) {
eb91eb18
LP
896 log_error("--boot and --share-system may not be combined.");
897 return -EINVAL;
898 }
899
89f7c846
LP
900 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
901 log_error("--keep-unit may not be used when invoked from a user session.");
902 return -EINVAL;
903 }
904
1b9e5b12
LP
905 if (arg_directory && arg_image) {
906 log_error("--directory= and --image= may not be combined.");
907 return -EINVAL;
908 }
909
ec16945e
LP
910 if (arg_template && arg_image) {
911 log_error("--template= and --image= may not be combined.");
912 return -EINVAL;
913 }
914
915 if (arg_template && !(arg_directory || arg_machine)) {
916 log_error("--template= needs --directory= or --machine=.");
917 return -EINVAL;
918 }
919
920 if (arg_ephemeral && arg_template) {
921 log_error("--ephemeral and --template= may not be combined.");
922 return -EINVAL;
923 }
924
925 if (arg_ephemeral && arg_image) {
926 log_error("--ephemeral and --image= may not be combined.");
927 return -EINVAL;
928 }
929
df9a75e4
LP
930 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
931 log_error("--ephemeral and --link-journal= may not be combined.");
932 return -EINVAL;
933 }
934
f757855e
LP
935 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
936 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
937
938 if (argc > optind) {
939 arg_parameters = strv_copy(argv + optind);
940 if (!arg_parameters)
941 return log_oom();
942
7732f92b 943 arg_settings_mask |= SETTING_START_MODE;
f757855e
LP
944 }
945
946 /* Load all settings from .nspawn files */
947 if (mask_no_settings)
948 arg_settings_mask = 0;
949
950 /* Don't load any settings from .nspawn files */
951 if (mask_all_settings)
952 arg_settings_mask = _SETTINGS_MASK_ALL;
953
954 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
955
956 r = detect_unified_cgroup_hierarchy();
957 if (r < 0)
958 return r;
959
6aadfa4c
ILG
960 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
961 if (e)
962 arg_container_service_name = e;
963
f757855e
LP
964 return 1;
965}
966
967static int verify_arguments(void) {
968
969 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
970 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
971 return -EINVAL;
972 }
973
6d0b55c2
LP
974 if (arg_expose_ports && !arg_private_network) {
975 log_error("Cannot use --port= without private networking.");
976 return -EINVAL;
977 }
978
7732f92b 979 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
c6c8f6e2
LP
980 arg_kill_signal = SIGRTMIN+3;
981
f757855e 982 return 0;
88213476
LP
983}
984
03cfe0d5
LP
985static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
986 assert(p);
987
988 if (!arg_userns)
989 return 0;
990
991 if (uid == UID_INVALID && gid == GID_INVALID)
992 return 0;
993
994 if (uid != UID_INVALID) {
995 uid += arg_uid_shift;
996
997 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
998 return -EOVERFLOW;
999 }
1000
1001 if (gid != GID_INVALID) {
1002 gid += (gid_t) arg_uid_shift;
1003
1004 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1005 return -EOVERFLOW;
1006 }
1007
1008 if (lchown(p, uid, gid) < 0)
1009 return -errno;
b12afc8c
LP
1010
1011 return 0;
1012}
1013
03cfe0d5
LP
/* Creates directory "path" below "root" with the given mode and, if it was newly created, hands it
 * to uid/gid via userns_lchown().
 *
 * An already existing directory is left untouched (no chown) and counts as success. Returns 0 on
 * success, -errno if mkdir() fails for another reason, or the error from userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1026
e58a1277 1027static int setup_timezone(const char *dest) {
03cfe0d5
LP
1028 _cleanup_free_ char *p = NULL, *q = NULL;
1029 const char *where, *check, *what;
d4036145
LP
1030 char *z, *y;
1031 int r;
f8440af5 1032
e58a1277
LP
1033 assert(dest);
1034
1035 /* Fix the timezone, if possible */
d4036145
LP
1036 r = readlink_malloc("/etc/localtime", &p);
1037 if (r < 0) {
1038 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1039 return 0;
1040 }
1041
1042 z = path_startswith(p, "../usr/share/zoneinfo/");
1043 if (!z)
1044 z = path_startswith(p, "/usr/share/zoneinfo/");
1045 if (!z) {
1046 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1047 return 0;
1048 }
1049
03cfe0d5 1050 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1051 r = readlink_malloc(where, &q);
1052 if (r >= 0) {
1053 y = path_startswith(q, "../usr/share/zoneinfo/");
1054 if (!y)
1055 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1056
d4036145
LP
1057 /* Already pointing to the right place? Then do nothing .. */
1058 if (y && streq(y, z))
1059 return 0;
1060 }
1061
03cfe0d5 1062 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1063 check = prefix_roota(dest, check);
03cfe0d5 1064 if (laccess(check, F_OK) < 0) {
d4036145
LP
1065 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1066 return 0;
1067 }
68fb0892 1068
79d80fc1
TG
1069 r = unlink(where);
1070 if (r < 0 && errno != ENOENT) {
56f64d95 1071 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1072 return 0;
1073 }
4d9f07b4 1074
03cfe0d5 1075 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1076 if (symlink(what, where) < 0) {
56f64d95 1077 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1078 return 0;
1079 }
e58a1277 1080
03cfe0d5
LP
1081 r = userns_lchown(where, 0, 0);
1082 if (r < 0)
1083 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1084
e58a1277 1085 return 0;
88213476
LP
1086}
1087
2547bb41 1088static int setup_resolv_conf(const char *dest) {
03cfe0d5 1089 const char *where = NULL;
79d80fc1 1090 int r;
2547bb41
LP
1091
1092 assert(dest);
1093
1094 if (arg_private_network)
1095 return 0;
1096
1097 /* Fix resolv.conf, if possible */
03cfe0d5 1098 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1099
f2068bcc 1100 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1101 if (r < 0) {
68a313c5
LP
1102 /* If the file already exists as symlink, let's
1103 * suppress the warning, under the assumption that
1104 * resolved or something similar runs inside and the
1105 * symlink points there.
1106 *
1107 * If the disk image is read-only, there's also no
1108 * point in complaining.
1109 */
1110 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1111 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1112 return 0;
1113 }
2547bb41 1114
03cfe0d5
LP
1115 r = userns_lchown(where, 0, 0);
1116 if (r < 0)
1117 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1118
2547bb41
LP
1119 return 0;
1120}
1121
9f24adc2 1122static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1123 assert(s);
9f24adc2
LP
1124
1125 snprintf(s, 37,
1126 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1127 SD_ID128_FORMAT_VAL(id));
1128
1129 return s;
1130}
1131
04bc4a3f 1132static int setup_boot_id(const char *dest) {
03cfe0d5 1133 const char *from, *to;
39883f62 1134 sd_id128_t rnd = {};
04bc4a3f
LP
1135 char as_uuid[37];
1136 int r;
1137
eb91eb18
LP
1138 if (arg_share_system)
1139 return 0;
1140
04bc4a3f
LP
1141 /* Generate a new randomized boot ID, so that each boot-up of
1142 * the container gets a new one */
1143
03cfe0d5
LP
1144 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1145 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1146
1147 r = sd_id128_randomize(&rnd);
f647962d
MS
1148 if (r < 0)
1149 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1150
9f24adc2 1151 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1152
4c1fc3e4 1153 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1154 if (r < 0)
1155 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1156
03cfe0d5
LP
1157 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1158 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1159 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1160 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1161
1162 unlink(from);
04bc4a3f
LP
1163 return r;
1164}
1165
e58a1277 1166static int copy_devnodes(const char *dest) {
88213476
LP
1167
1168 static const char devnodes[] =
1169 "null\0"
1170 "zero\0"
1171 "full\0"
1172 "random\0"
1173 "urandom\0"
85614d66
TG
1174 "tty\0"
1175 "net/tun\0";
88213476
LP
1176
1177 const char *d;
e58a1277 1178 int r = 0;
7fd1b19b 1179 _cleanup_umask_ mode_t u;
a258bf26
LP
1180
1181 assert(dest);
124640f1
LP
1182
1183 u = umask(0000);
88213476 1184
03cfe0d5
LP
1185 /* Create /dev/net, so that we can create /dev/net/tun in it */
1186 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1187 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1188
88213476 1189 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1190 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1191 struct stat st;
88213476 1192
7f112f50 1193 from = strappend("/dev/", d);
03cfe0d5 1194 to = prefix_root(dest, from);
88213476
LP
1195
1196 if (stat(from, &st) < 0) {
1197
4a62c710
MS
1198 if (errno != ENOENT)
1199 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1200
a258bf26 1201 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1202
03cfe0d5 1203 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1204 return -EIO;
a258bf26 1205
85614d66 1206 } else {
81f5049b
AC
1207 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1208 if (errno != EPERM)
1209 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1210
1211 /* Some systems abusively restrict mknod but
1212 * allow bind mounts. */
1213 r = touch(to);
1214 if (r < 0)
1215 return log_error_errno(r, "touch (%s) failed: %m", to);
1216 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1217 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1218 }
6278cf60 1219
03cfe0d5
LP
1220 r = userns_lchown(to, 0, 0);
1221 if (r < 0)
1222 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1223 }
88213476
LP
1224 }
1225
e58a1277
LP
1226 return r;
1227}
88213476 1228
03cfe0d5
LP
1229static int setup_pts(const char *dest) {
1230 _cleanup_free_ char *options = NULL;
1231 const char *p;
709f6e46 1232 int r;
03cfe0d5
LP
1233
1234#ifdef HAVE_SELINUX
1235 if (arg_selinux_apifs_context)
1236 (void) asprintf(&options,
3dce8915 1237 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1238 arg_uid_shift + TTY_GID,
1239 arg_selinux_apifs_context);
1240 else
1241#endif
1242 (void) asprintf(&options,
3dce8915 1243 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1244 arg_uid_shift + TTY_GID);
f2d88580 1245
03cfe0d5 1246 if (!options)
f2d88580
LP
1247 return log_oom();
1248
03cfe0d5 1249 /* Mount /dev/pts itself */
cc9fce65 1250 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1251 if (mkdir(p, 0755) < 0)
1252 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1253 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1254 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1255 r = userns_lchown(p, 0, 0);
1256 if (r < 0)
1257 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1258
1259 /* Create /dev/ptmx symlink */
1260 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1261 if (symlink("pts/ptmx", p) < 0)
1262 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1263 r = userns_lchown(p, 0, 0);
1264 if (r < 0)
1265 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1266
03cfe0d5
LP
1267 /* And fix /dev/pts/ptmx ownership */
1268 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1269 r = userns_lchown(p, 0, 0);
1270 if (r < 0)
1271 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1272
f2d88580
LP
1273 return 0;
1274}
1275
e58a1277 1276static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1277 _cleanup_umask_ mode_t u;
1278 const char *to;
e58a1277 1279 int r;
e58a1277
LP
1280
1281 assert(dest);
1282 assert(console);
1283
1284 u = umask(0000);
1285
03cfe0d5 1286 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1289
a258bf26
LP
1290 /* We need to bind mount the right tty to /dev/console since
1291 * ptys can only exist on pts file systems. To have something
81f5049b 1292 * to bind mount things on we create a empty regular file. */
a258bf26 1293
03cfe0d5 1294 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1295 r = touch(to);
1296 if (r < 0)
1297 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1298
4543768d 1299 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1300 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1301
25ea79fe 1302 return 0;
e58a1277
LP
1303}
1304
1305static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1306 const char *from, *to;
7fd1b19b 1307 _cleanup_umask_ mode_t u;
d9603714 1308 int fd, r;
e58a1277 1309
e58a1277 1310 assert(kmsg_socket >= 0);
a258bf26 1311
e58a1277 1312 u = umask(0000);
a258bf26 1313
03cfe0d5 1314 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1315 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1316 * on the reading side behave very similar to /proc/kmsg,
1317 * their writing side behaves differently from /dev/kmsg in
1318 * that writing blocks when nothing is reading. In order to
1319 * avoid any problems with containers deadlocking due to this
1320 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1321 from = prefix_roota(dest, "/run/kmsg");
1322 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1323
4a62c710 1324 if (mkfifo(from, 0600) < 0)
03cfe0d5 1325 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1326 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1327 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1328
1329 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1330 if (fd < 0)
1331 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1332
e58a1277
LP
1333 /* Store away the fd in the socket, so that it stays open as
1334 * long as we run the child */
3ee897d6 1335 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1336 safe_close(fd);
e58a1277 1337
d9603714
DH
1338 if (r < 0)
1339 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1340
03cfe0d5
LP
1341 /* And now make the FIFO unavailable as /run/kmsg... */
1342 (void) unlink(from);
1343
25ea79fe 1344 return 0;
88213476
LP
1345}
1346
1c4baffc 1347static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1348 union in_addr_union *exposed = userdata;
1349
1350 assert(rtnl);
1351 assert(m);
1352 assert(exposed);
1353
7a8f6325 1354 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1355 return 0;
1356}
1357
3a74cea5 1358static int setup_hostname(void) {
3a74cea5 1359
eb91eb18
LP
1360 if (arg_share_system)
1361 return 0;
1362
605f81a8 1363 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1364 return -errno;
3a74cea5 1365
7027ff61 1366 return 0;
3a74cea5
LP
1367}
1368
57fb9fb5 1369static int setup_journal(const char *directory) {
4d680aee 1370 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1371 _cleanup_free_ char *b = NULL, *d = NULL;
1372 const char *etc_machine_id, *p, *q;
8054d749 1373 bool try;
27407a01 1374 char *id;
57fb9fb5
LP
1375 int r;
1376
df9a75e4
LP
1377 /* Don't link journals in ephemeral mode */
1378 if (arg_ephemeral)
1379 return 0;
1380
8054d749
LP
1381 if (arg_link_journal == LINK_NO)
1382 return 0;
1383
1384 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1385
03cfe0d5 1386 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1387
03cfe0d5 1388 r = read_one_line_file(etc_machine_id, &b);
8054d749 1389 if (r == -ENOENT && try)
27407a01 1390 return 0;
f647962d 1391 else if (r < 0)
03cfe0d5 1392 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1393
27407a01 1394 id = strstrip(b);
8054d749 1395 if (isempty(id) && try)
27407a01 1396 return 0;
57fb9fb5 1397
27407a01
ZJS
1398 /* Verify validity */
1399 r = sd_id128_from_string(id, &machine_id);
f647962d 1400 if (r < 0)
03cfe0d5 1401 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1402
4d680aee 1403 r = sd_id128_get_machine(&this_id);
f647962d
MS
1404 if (r < 0)
1405 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1406
1407 if (sd_id128_equal(machine_id, this_id)) {
8054d749 1408 log_full(try ? LOG_WARNING : LOG_ERR,
4d680aee 1409 "Host and machine ids are equal (%s): refusing to link journals", id);
8054d749 1410 if (try)
4d680aee 1411 return 0;
df9a75e4 1412 return -EEXIST;
4d680aee
ZJS
1413 }
1414
03cfe0d5
LP
1415 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to create /var: %m");
1418
1419 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1420 if (r < 0)
1421 return log_error_errno(r, "Failed to create /var/log: %m");
1422
1423 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1424 if (r < 0)
1425 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1426
1427 p = strjoina("/var/log/journal/", id);
1428 q = prefix_roota(directory, p);
27407a01 1429
e26d6ce5 1430 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1431 if (try)
1432 return 0;
27407a01 1433
8054d749
LP
1434 log_error("%s: already a mount point, refusing to use for journal", p);
1435 return -EEXIST;
57fb9fb5
LP
1436 }
1437
e26d6ce5 1438 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1439 if (try)
1440 return 0;
57fb9fb5 1441
8054d749
LP
1442 log_error("%s: already a mount point, refusing to use for journal", q);
1443 return -EEXIST;
57fb9fb5
LP
1444 }
1445
1446 r = readlink_and_make_absolute(p, &d);
1447 if (r >= 0) {
1448 if ((arg_link_journal == LINK_GUEST ||
1449 arg_link_journal == LINK_AUTO) &&
1450 path_equal(d, q)) {
1451
03cfe0d5 1452 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1453 if (r < 0)
709f6e46 1454 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1455 return 0;
57fb9fb5
LP
1456 }
1457
4a62c710
MS
1458 if (unlink(p) < 0)
1459 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1460 } else if (r == -EINVAL) {
1461
1462 if (arg_link_journal == LINK_GUEST &&
1463 rmdir(p) < 0) {
1464
27407a01
ZJS
1465 if (errno == ENOTDIR) {
1466 log_error("%s already exists and is neither a symlink nor a directory", p);
1467 return r;
4314d33f
MS
1468 } else
1469 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1470 }
4314d33f
MS
1471 } else if (r != -ENOENT)
1472 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1473
1474 if (arg_link_journal == LINK_GUEST) {
1475
1476 if (symlink(q, p) < 0) {
8054d749 1477 if (try) {
56f64d95 1478 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1479 return 0;
4314d33f
MS
1480 } else
1481 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1482 }
1483
03cfe0d5 1484 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1485 if (r < 0)
709f6e46 1486 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1487 return 0;
57fb9fb5
LP
1488 }
1489
1490 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1491 /* don't create parents here -- if the host doesn't have
1492 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1493
1494 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1495 if (try) {
56f64d95 1496 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1497 return 0;
4314d33f
MS
1498 } else
1499 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1500 }
1501
27407a01
ZJS
1502 } else if (access(p, F_OK) < 0)
1503 return 0;
57fb9fb5 1504
cdb2b9d0
LP
1505 if (dir_is_empty(q) == 0)
1506 log_warning("%s is not empty, proceeding anyway.", q);
1507
03cfe0d5 1508 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1509 if (r < 0)
1510 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1511
4543768d 1512 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1513 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1514
27407a01 1515 return 0;
57fb9fb5
LP
1516}
1517
88213476 1518static int drop_capabilities(void) {
a103496c 1519 return capability_bounding_set_drop(arg_retain, false);
88213476
LP
1520}
1521
db999e0f
LP
1522static int reset_audit_loginuid(void) {
1523 _cleanup_free_ char *p = NULL;
1524 int r;
1525
1526 if (arg_share_system)
1527 return 0;
1528
1529 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1530 if (r == -ENOENT)
db999e0f 1531 return 0;
f647962d
MS
1532 if (r < 0)
1533 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1534
1535 /* Already reset? */
1536 if (streq(p, "4294967295"))
1537 return 0;
1538
ad118bda 1539 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1540 if (r < 0) {
10a87006
LP
1541 log_error_errno(r,
1542 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1543 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1544 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1545 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1546 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1547
db999e0f 1548 sleep(5);
77b6e194 1549 }
db999e0f
LP
1550
1551 return 0;
77b6e194
LP
1552}
1553
/* Installs a seccomp filter for the container payload. Without HAVE_SECCOMP this is a no-op that
 * returns 0.
 *
 * The filter allows everything by default and makes two kinds of exceptions:
 *   - each syscall in the table below fails with EPERM, unless the capability listed next to it
 *     is part of arg_retain;
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * Returns 0 on success, and also if seccomp_load() reports -EINVAL. Other failures are logged and
 * returned as negative error. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated_syscalls); k++) {

                /* Capability retained: leave the syscall alone */
                if ((arg_retain & (1ULL << gated_syscalls[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1648
785890ac
LP
1649static int setup_propagate(const char *root) {
1650 const char *p, *q;
709f6e46 1651 int r;
785890ac
LP
1652
1653 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1654 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1655 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1656 (void) mkdir_p(p, 0600);
1657
709f6e46
MS
1658 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1659 if (r < 0)
1660 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1661
709f6e46
MS
1662 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1663 if (r < 0)
1664 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1665
709f6e46
MS
1666 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1667 if (r < 0)
1668 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1669
03cfe0d5 1670 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1671 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1672 return log_error_errno(errno, "Failed to install propagation bind mount.");
1673
1674 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1675 return log_error_errno(errno, "Failed to make propagation mount read-only");
1676
1677 return 0;
1678}
1679
1b9e5b12
LP
1680static int setup_image(char **device_path, int *loop_nr) {
1681 struct loop_info64 info = {
1682 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1683 };
1684 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1685 _cleanup_free_ char* loopdev = NULL;
1686 struct stat st;
1687 int r, nr;
1688
1689 assert(device_path);
1690 assert(loop_nr);
ec16945e 1691 assert(arg_image);
1b9e5b12
LP
1692
1693 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1694 if (fd < 0)
1695 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1696
4a62c710
MS
1697 if (fstat(fd, &st) < 0)
1698 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1699
1700 if (S_ISBLK(st.st_mode)) {
1701 char *p;
1702
1703 p = strdup(arg_image);
1704 if (!p)
1705 return log_oom();
1706
1707 *device_path = p;
1708
1709 *loop_nr = -1;
1710
1711 r = fd;
1712 fd = -1;
1713
1714 return r;
1715 }
1716
1717 if (!S_ISREG(st.st_mode)) {
070edd97 1718 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1719 return -EINVAL;
1720 }
1721
1722 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1723 if (control < 0)
1724 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1725
1726 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1727 if (nr < 0)
1728 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1729
1730 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1731 return log_oom();
1732
1733 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1734 if (loop < 0)
1735 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1736
4a62c710
MS
1737 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1738 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1739
1740 if (arg_read_only)
1741 info.lo_flags |= LO_FLAGS_READ_ONLY;
1742
4a62c710
MS
1743 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1744 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1745
1746 *device_path = loopdev;
1747 loopdev = NULL;
1748
1749 *loop_nr = nr;
1750
1751 r = loop;
1752 loop = -1;
1753
1754 return r;
1755}
1756
ada4799a
LP
/* Explanatory text appended to error messages about unusable partition tables. One string literal
 * per line of output. */
#define PARTITION_TABLE_BLURB                                                                              \
        "Note that the disk image needs to either contain only a single MBR partition of\n"               \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                   \
        "to be bootable with systemd-nspawn."
1763
1b9e5b12
LP
1764static int dissect_image(
1765 int fd,
727fd4fd
LP
1766 char **root_device, bool *root_device_rw,
1767 char **home_device, bool *home_device_rw,
1768 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1769 bool *secondary) {
1770
1771#ifdef HAVE_BLKID
01dc33ce
ZJS
1772 int home_nr = -1, srv_nr = -1;
1773#ifdef GPT_ROOT_NATIVE
1774 int root_nr = -1;
1775#endif
1776#ifdef GPT_ROOT_SECONDARY
1777 int secondary_root_nr = -1;
1778#endif
f6c51a81 1779 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1780 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1781 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1782 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1783 _cleanup_udev_unref_ struct udev *udev = NULL;
1784 struct udev_list_entry *first, *item;
f6c51a81 1785 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1786 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1787 const char *pttype = NULL;
1788 blkid_partlist pl;
1789 struct stat st;
c09ef2e4 1790 unsigned i;
1b9e5b12
LP
1791 int r;
1792
1793 assert(fd >= 0);
1794 assert(root_device);
1795 assert(home_device);
1796 assert(srv_device);
1797 assert(secondary);
ec16945e 1798 assert(arg_image);
1b9e5b12
LP
1799
1800 b = blkid_new_probe();
1801 if (!b)
1802 return log_oom();
1803
1804 errno = 0;
1805 r = blkid_probe_set_device(b, fd, 0, 0);
1806 if (r != 0) {
1807 if (errno == 0)
1808 return log_oom();
1809
e1427b13 1810 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1811 }
1812
1813 blkid_probe_enable_partitions(b, 1);
1814 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1815
1816 errno = 0;
1817 r = blkid_do_safeprobe(b);
1818 if (r == -2 || r == 1) {
ada4799a
LP
1819 log_error("Failed to identify any partition table on\n"
1820 " %s\n"
1821 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1822 return -EINVAL;
1823 } else if (r != 0) {
1824 if (errno == 0)
1825 errno = EIO;
e1427b13 1826 return log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1827 }
1828
48861960 1829 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1830
1831 is_gpt = streq_ptr(pttype, "gpt");
1832 is_mbr = streq_ptr(pttype, "dos");
1833
1834 if (!is_gpt && !is_mbr) {
1835 log_error("No GPT or MBR partition table discovered on\n"
1836 " %s\n"
1837 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1838 return -EINVAL;
1839 }
1840
1841 errno = 0;
1842 pl = blkid_probe_get_partitions(b);
1843 if (!pl) {
1844 if (errno == 0)
1845 return log_oom();
1846
1847 log_error("Failed to list partitions of %s", arg_image);
1848 return -errno;
1849 }
1850
1851 udev = udev_new();
1852 if (!udev)
1853 return log_oom();
1854
4a62c710
MS
1855 if (fstat(fd, &st) < 0)
1856 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1857
c09ef2e4
LP
1858 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1859 if (!d)
1b9e5b12
LP
1860 return log_oom();
1861
c09ef2e4
LP
1862 for (i = 0;; i++) {
1863 int n, m;
1b9e5b12 1864
c09ef2e4
LP
1865 if (i >= 10) {
1866 log_error("Kernel partitions never appeared.");
1867 return -ENXIO;
1868 }
1869
1870 e = udev_enumerate_new(udev);
1871 if (!e)
1872 return log_oom();
1873
1874 r = udev_enumerate_add_match_parent(e, d);
1875 if (r < 0)
1876 return log_oom();
1877
1878 r = udev_enumerate_scan_devices(e);
1879 if (r < 0)
1880 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1881
1882 /* Count the partitions enumerated by the kernel */
1883 n = 0;
1884 first = udev_enumerate_get_list_entry(e);
1885 udev_list_entry_foreach(item, first)
1886 n++;
1887
1888 /* Count the partitions enumerated by blkid */
1889 m = blkid_partlist_numof_partitions(pl);
1890 if (n == m + 1)
1891 break;
1892 if (n > m + 1) {
1893 log_error("blkid and kernel partition list do not match.");
1894 return -EIO;
1895 }
1896 if (n < m + 1) {
1897 unsigned j;
1898
1899 /* The kernel has probed fewer partitions than
1900 * blkid? Maybe the kernel prober is still
1901 * running or it got EBUSY because udev
1902 * already opened the device. Let's reprobe
1903 * the device, which is a synchronous call
1904 * that waits until probing is complete. */
1905
1906 for (j = 0; j < 20; j++) {
1907
1908 r = ioctl(fd, BLKRRPART, 0);
1909 if (r < 0)
1910 r = -errno;
1911 if (r >= 0 || r != -EBUSY)
1912 break;
1913
1914 /* If something else has the device
1915 * open, such as an udev rule, the
1916 * ioctl will return EBUSY. Since
1917 * there's no way to wait until it
1918 * isn't busy anymore, let's just wait
1919 * a bit, and try again.
1920 *
1921 * This is really something they
1922 * should fix in the kernel! */
1923
1924 usleep(50 * USEC_PER_MSEC);
1925 }
1926
1927 if (r < 0)
1928 return log_error_errno(r, "Failed to reread partition table: %m");
1929 }
1930
1931 e = udev_enumerate_unref(e);
1932 }
1b9e5b12
LP
1933
1934 first = udev_enumerate_get_list_entry(e);
1935 udev_list_entry_foreach(item, first) {
1936 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1937 const char *node;
727fd4fd 1938 unsigned long long flags;
1b9e5b12
LP
1939 blkid_partition pp;
1940 dev_t qn;
1941 int nr;
1942
1943 errno = 0;
1944 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1945 if (!q) {
1946 if (!errno)
1947 errno = ENOMEM;
1948
e1427b13 1949 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1950 }
1951
1952 qn = udev_device_get_devnum(q);
1953 if (major(qn) == 0)
1954 continue;
1955
1956 if (st.st_rdev == qn)
1957 continue;
1958
1959 node = udev_device_get_devnode(q);
1960 if (!node)
1961 continue;
1962
1963 pp = blkid_partlist_devno_to_partition(pl, qn);
1964 if (!pp)
1965 continue;
1966
727fd4fd 1967 flags = blkid_partition_get_flags(pp);
727fd4fd 1968
1b9e5b12
LP
1969 nr = blkid_partition_get_partno(pp);
1970 if (nr < 0)
1971 continue;
1972
ada4799a
LP
1973 if (is_gpt) {
1974 sd_id128_t type_id;
1975 const char *stype;
1b9e5b12 1976
f6c51a81
LP
1977 if (flags & GPT_FLAG_NO_AUTO)
1978 continue;
1979
ada4799a
LP
1980 stype = blkid_partition_get_type_string(pp);
1981 if (!stype)
1982 continue;
1b9e5b12 1983
ada4799a 1984 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1985 continue;
1986
ada4799a 1987 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1988
ada4799a
LP
1989 if (home && nr >= home_nr)
1990 continue;
1b9e5b12 1991
ada4799a
LP
1992 home_nr = nr;
1993 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1994
ada4799a
LP
1995 r = free_and_strdup(&home, node);
1996 if (r < 0)
1997 return log_oom();
727fd4fd 1998
ada4799a
LP
1999 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2000
2001 if (srv && nr >= srv_nr)
2002 continue;
2003
2004 srv_nr = nr;
2005 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2006
2007 r = free_and_strdup(&srv, node);
2008 if (r < 0)
2009 return log_oom();
2010 }
1b9e5b12 2011#ifdef GPT_ROOT_NATIVE
ada4799a 2012 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 2013
ada4799a
LP
2014 if (root && nr >= root_nr)
2015 continue;
1b9e5b12 2016
ada4799a
LP
2017 root_nr = nr;
2018 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 2019
ada4799a
LP
2020 r = free_and_strdup(&root, node);
2021 if (r < 0)
2022 return log_oom();
2023 }
1b9e5b12
LP
2024#endif
2025#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
2026 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2027
2028 if (secondary_root && nr >= secondary_root_nr)
2029 continue;
2030
2031 secondary_root_nr = nr;
2032 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2033
2034 r = free_and_strdup(&secondary_root, node);
2035 if (r < 0)
2036 return log_oom();
2037 }
2038#endif
f6c51a81
LP
2039 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2040
2041 if (generic)
2042 multiple_generic = true;
2043 else {
2044 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2045
2046 r = free_and_strdup(&generic, node);
2047 if (r < 0)
2048 return log_oom();
2049 }
2050 }
ada4799a
LP
2051
2052 } else if (is_mbr) {
2053 int type;
1b9e5b12 2054
f6c51a81
LP
2055 if (flags != 0x80) /* Bootable flag */
2056 continue;
2057
ada4799a
LP
2058 type = blkid_partition_get_type(pp);
2059 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2060 continue;
2061
f6c51a81
LP
2062 if (generic)
2063 multiple_generic = true;
2064 else {
2065 generic_rw = true;
727fd4fd 2066
f6c51a81
LP
2067 r = free_and_strdup(&root, node);
2068 if (r < 0)
2069 return log_oom();
2070 }
1b9e5b12 2071 }
1b9e5b12
LP
2072 }
2073
1b9e5b12
LP
2074 if (root) {
2075 *root_device = root;
2076 root = NULL;
727fd4fd
LP
2077
2078 *root_device_rw = root_rw;
1b9e5b12
LP
2079 *secondary = false;
2080 } else if (secondary_root) {
2081 *root_device = secondary_root;
2082 secondary_root = NULL;
727fd4fd
LP
2083
2084 *root_device_rw = secondary_root_rw;
1b9e5b12 2085 *secondary = true;
f6c51a81
LP
2086 } else if (generic) {
2087
2088 /* There were no partitions with precise meanings
2089 * around, but we found generic partitions. In this
2090 * case, if there's only one, we can go ahead and boot
2091 * it, otherwise we bail out, because we really cannot
2092 * make any sense of it. */
2093
2094 if (multiple_generic) {
2095 log_error("Identified multiple bootable Linux partitions on\n"
2096 " %s\n"
2097 PARTITION_TABLE_BLURB, arg_image);
2098 return -EINVAL;
2099 }
2100
2101 *root_device = generic;
2102 generic = NULL;
2103
2104 *root_device_rw = generic_rw;
2105 *secondary = false;
2106 } else {
2107 log_error("Failed to identify root partition in disk image\n"
2108 " %s\n"
2109 PARTITION_TABLE_BLURB, arg_image);
2110 return -EINVAL;
1b9e5b12
LP
2111 }
2112
2113 if (home) {
2114 *home_device = home;
2115 home = NULL;
727fd4fd
LP
2116
2117 *home_device_rw = home_rw;
1b9e5b12
LP
2118 }
2119
2120 if (srv) {
2121 *srv_device = srv;
2122 srv = NULL;
727fd4fd
LP
2123
2124 *srv_device_rw = srv_rw;
1b9e5b12
LP
2125 }
2126
2127 return 0;
2128#else
2129 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2130 return -EOPNOTSUPP;
1b9e5b12
LP
2131#endif
2132}
2133
/*
 * Probe the block device 'what' with libblkid to find out which file system
 * it carries, then mount it.
 *
 *   what      - device node to mount
 *   where     - base directory of the mount point
 *   directory - optional path appended to 'where' (NULL: mount on 'where')
 *   rw        - mount writable; forced to false if arg_read_only is set
 *
 * Returns 0 on success and a negative errno-style value on failure:
 * -EINVAL if no (unambiguous) file system type could be determined,
 * -EOPNOTSUPP for LUKS volumes or when built without blkid support.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container never gets writable partitions */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first so
         * that we can tell "no errno" apart from a real error code. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected
                 * and -2 if the result is ambivalent. In both cases the
                 * probe itself worked, we just cannot tell the type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected) is a genuine probing error */
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2195
727fd4fd
LP
2196static int mount_devices(
2197 const char *where,
2198 const char *root_device, bool root_device_rw,
2199 const char *home_device, bool home_device_rw,
2200 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2201 int r;
2202
2203 assert(where);
2204
2205 if (root_device) {
727fd4fd 2206 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2207 if (r < 0)
2208 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2209 }
2210
2211 if (home_device) {
727fd4fd 2212 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2213 if (r < 0)
2214 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2215 }
2216
2217 if (srv_device) {
727fd4fd 2218 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2219 if (r < 0)
2220 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2221 }
2222
2223 return 0;
2224}
2225
2226static void loop_remove(int nr, int *image_fd) {
2227 _cleanup_close_ int control = -1;
e8c8ddcc 2228 int r;
1b9e5b12
LP
2229
2230 if (nr < 0)
2231 return;
2232
2233 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2234 r = ioctl(*image_fd, LOOP_CLR_FD);
2235 if (r < 0)
5e4074aa 2236 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2237 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2238 }
2239
2240 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2241 if (control < 0) {
56f64d95 2242 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2243 return;
e8c8ddcc 2244 }
1b9e5b12 2245
e8c8ddcc
TG
2246 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2247 if (r < 0)
5e4074aa 2248 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2249}
2250
113cea80 2251/*
6d416b9c
LS
2252 * Return values:
2253 * < 0 : wait_for_terminate() failed to get the state of the
2254 * container, the container was terminated by a signal, or
2255 * failed for an unknown reason. No change is made to the
2256 * container argument.
2257 * > 0 : The program executed in the container terminated with an
2258 * error. The exit code of the program executed in the
919699ec
LP
2259 * container is returned. The container argument has been set
2260 * to CONTAINER_TERMINATED.
6d416b9c
LS
2261 * 0 : The container is being rebooted, has been shut down or exited
2262 * successfully. The container argument has been set to either
2263 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2264 *
6d416b9c
LS
2265 * That is, success is indicated by a return value of zero, and an
2266 * error is indicated by a non-zero value.
113cea80
DH
2267 */
2268static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2269 siginfo_t status;
919699ec 2270 int r;
113cea80
DH
2271
2272 r = wait_for_terminate(pid, &status);
f647962d
MS
2273 if (r < 0)
2274 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2275
2276 switch (status.si_code) {
fddbb89c 2277
113cea80 2278 case CLD_EXITED:
919699ec
LP
2279 if (status.si_status == 0) {
2280 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2281
fddbb89c 2282 } else
919699ec 2283 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2284
919699ec
LP
2285 *container = CONTAINER_TERMINATED;
2286 return status.si_status;
113cea80
DH
2287
2288 case CLD_KILLED:
2289 if (status.si_status == SIGINT) {
113cea80 2290
919699ec 2291 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2292 *container = CONTAINER_TERMINATED;
919699ec
LP
2293 return 0;
2294
113cea80 2295 } else if (status.si_status == SIGHUP) {
113cea80 2296
919699ec 2297 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2298 *container = CONTAINER_REBOOTED;
919699ec 2299 return 0;
113cea80 2300 }
919699ec 2301
113cea80
DH
2302 /* CLD_KILLED fallthrough */
2303
2304 case CLD_DUMPED:
fddbb89c 2305 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2306 return -EIO;
113cea80
DH
2307
2308 default:
fddbb89c 2309 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2310 return -EIO;
113cea80
DH
2311 }
2312
2313 return r;
2314}
2315
023fb90b
LP
2316static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2317 pid_t pid;
2318
4a0b58c4 2319 pid = PTR_TO_PID(userdata);
023fb90b 2320 if (pid > 0) {
c6c8f6e2 2321 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2322 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2323 sd_event_source_set_userdata(s, NULL);
2324 return 0;
2325 }
2326 }
2327
2328 sd_event_exit(sd_event_source_get_event(s), 0);
2329 return 0;
2330}
2331
ec16945e 2332static int determine_names(void) {
1b9cebf6 2333 int r;
ec16945e 2334
c1521918
LP
2335 if (arg_template && !arg_directory && arg_machine) {
2336
2337 /* If --template= was specified then we should not
2338 * search for a machine, but instead create a new one
2339 * in /var/lib/machine. */
2340
2341 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2342 if (!arg_directory)
2343 return log_oom();
2344 }
2345
ec16945e 2346 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2347 if (arg_machine) {
2348 _cleanup_(image_unrefp) Image *i = NULL;
2349
2350 r = image_find(arg_machine, &i);
2351 if (r < 0)
2352 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2353 else if (r == 0) {
2354 log_error("No image for machine '%s': %m", arg_machine);
2355 return -ENOENT;
2356 }
2357
aceac2f0 2358 if (i->type == IMAGE_RAW)
0f03c2a4 2359 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2360 else
0f03c2a4 2361 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2362 if (r < 0)
2363 return log_error_errno(r, "Invalid image directory: %m");
2364
aee327b8
LP
2365 if (!arg_ephemeral)
2366 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2367 } else
ec16945e
LP
2368 arg_directory = get_current_dir_name();
2369
1b9cebf6
LP
2370 if (!arg_directory && !arg_machine) {
2371 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2372 return -EINVAL;
2373 }
2374 }
2375
2376 if (!arg_machine) {
b9ba4dab
LP
2377 if (arg_directory && path_equal(arg_directory, "/"))
2378 arg_machine = gethostname_malloc();
2379 else
2380 arg_machine = strdup(basename(arg_image ?: arg_directory));
2381
ec16945e
LP
2382 if (!arg_machine)
2383 return log_oom();
2384
ae691c1d 2385 hostname_cleanup(arg_machine);
ec16945e
LP
2386 if (!machine_name_is_valid(arg_machine)) {
2387 log_error("Failed to determine machine name automatically, please use -M.");
2388 return -EINVAL;
2389 }
b9ba4dab
LP
2390
2391 if (arg_ephemeral) {
2392 char *b;
2393
2394 /* Add a random suffix when this is an
2395 * ephemeral machine, so that we can run many
2396 * instances at once without manually having
2397 * to specify -M each time. */
2398
2399 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2400 return log_oom();
2401
2402 free(arg_machine);
2403 arg_machine = b;
2404 }
ec16945e
LP
2405 }
2406
2407 return 0;
2408}
2409
03cfe0d5 2410static int determine_uid_shift(const char *directory) {
6dac160c
LP
2411 int r;
2412
03cfe0d5
LP
2413 if (!arg_userns) {
2414 arg_uid_shift = 0;
6dac160c 2415 return 0;
03cfe0d5 2416 }
6dac160c
LP
2417
2418 if (arg_uid_shift == UID_INVALID) {
2419 struct stat st;
2420
03cfe0d5 2421 r = stat(directory, &st);
6dac160c 2422 if (r < 0)
03cfe0d5 2423 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2424
2425 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2426
2427 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2428 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2429 return -EINVAL;
2430 }
2431
2432 arg_uid_range = UINT32_C(0x10000);
2433 }
2434
2435 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2436 log_error("UID base too high for UID range.");
2437 return -EINVAL;
2438 }
2439
2440 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2441 return 0;
2442}
2443
03cfe0d5
LP
2444static int inner_child(
2445 Barrier *barrier,
2446 const char *directory,
2447 bool secondary,
2448 int kmsg_socket,
2449 int rtnl_socket,
f757855e 2450 FDSet *fds) {
69c79d3c 2451
03cfe0d5 2452 _cleanup_free_ char *home = NULL;
6aadfa4c 2453 unsigned n_env = 1;
03cfe0d5
LP
2454 const char *envp[] = {
2455 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2456 NULL, /* container */
03cfe0d5
LP
2457 NULL, /* TERM */
2458 NULL, /* HOME */
2459 NULL, /* USER */
2460 NULL, /* LOGNAME */
2461 NULL, /* container_uuid */
2462 NULL, /* LISTEN_FDS */
2463 NULL, /* LISTEN_PID */
2464 NULL
2465 };
88213476 2466
2371271c 2467 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2468 int r;
88213476 2469
03cfe0d5
LP
2470 assert(barrier);
2471 assert(directory);
2472 assert(kmsg_socket >= 0);
88213476 2473
efdb0237
LP
2474 cg_unified_flush();
2475
03cfe0d5
LP
2476 if (arg_userns) {
2477 /* Tell the parent, that it now can write the UID map. */
2478 (void) barrier_place(barrier); /* #1 */
7027ff61 2479
03cfe0d5
LP
2480 /* Wait until the parent wrote the UID map */
2481 if (!barrier_place_and_sync(barrier)) { /* #2 */
2482 log_error("Parent died too early");
2483 return -ESRCH;
2484 }
88213476
LP
2485 }
2486
d1678248 2487 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2488 if (r < 0)
2489 return r;
2490
d8fc6a00
LP
2491 r = mount_sysfs(NULL);
2492 if (r < 0)
2493 return r;
2494
03cfe0d5
LP
2495 /* Wait until we are cgroup-ified, so that we
2496 * can mount the right cgroup path writable */
2497 if (!barrier_place_and_sync(barrier)) { /* #3 */
2498 log_error("Parent died too early");
2499 return -ESRCH;
88213476
LP
2500 }
2501
e83bebef 2502 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2503 if (r < 0)
2504 return r;
ec16945e 2505
03cfe0d5
LP
2506 r = reset_uid_gid();
2507 if (r < 0)
2508 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2509
03cfe0d5
LP
2510 r = setup_boot_id(NULL);
2511 if (r < 0)
2512 return r;
ec16945e 2513
03cfe0d5
LP
2514 r = setup_kmsg(NULL, kmsg_socket);
2515 if (r < 0)
2516 return r;
2517 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2518
03cfe0d5 2519 umask(0022);
30535c16 2520
03cfe0d5
LP
2521 if (setsid() < 0)
2522 return log_error_errno(errno, "setsid() failed: %m");
2523
2524 if (arg_private_network)
2525 loopback_setup();
2526
7a8f6325
LP
2527 if (arg_expose_ports) {
2528 r = expose_port_send_rtnl(rtnl_socket);
2529 if (r < 0)
2530 return r;
2531 rtnl_socket = safe_close(rtnl_socket);
2532 }
03cfe0d5 2533
709f6e46
MS
2534 r = drop_capabilities();
2535 if (r < 0)
2536 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2537
2538 setup_hostname();
2539
050f7277 2540 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2541 if (personality(arg_personality) < 0)
2542 return log_error_errno(errno, "personality() failed: %m");
2543 } else if (secondary) {
2544 if (personality(PER_LINUX32) < 0)
2545 return log_error_errno(errno, "personality() failed: %m");
2546 }
2547
2548#ifdef HAVE_SELINUX
2549 if (arg_selinux_context)
2550 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2551 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2552#endif
2553
ee645080 2554 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2555 if (r < 0)
2556 return r;
2557
6aadfa4c
ILG
2558 /* LXC sets container=lxc, so follow the scheme here */
2559 envp[n_env++] = strjoina("container=", arg_container_service_name);
2560
03cfe0d5
LP
2561 envp[n_env] = strv_find_prefix(environ, "TERM=");
2562 if (envp[n_env])
313cefa1 2563 n_env++;
03cfe0d5
LP
2564
2565 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2566 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2567 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2568 return log_oom();
2569
2570 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2571 char as_uuid[37];
2572
2573 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2574 return log_oom();
2575 }
2576
2577 if (fdset_size(fds) > 0) {
2578 r = fdset_cloexec(fds, false);
2579 if (r < 0)
2580 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2581
2582 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2583 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2584 return log_oom();
2585 }
2586
2371271c
TG
2587 env_use = strv_env_merge(2, envp, arg_setenv);
2588 if (!env_use)
2589 return log_oom();
03cfe0d5
LP
2590
2591 /* Let the parent know that we are ready and
2592 * wait until the parent is ready with the
2593 * setup, too... */
2594 if (!barrier_place_and_sync(barrier)) { /* #4 */
2595 log_error("Parent died too early");
2596 return -ESRCH;
2597 }
2598
5f932eb9
LP
2599 if (arg_chdir)
2600 if (chdir(arg_chdir) < 0)
2601 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2602
7732f92b
LP
2603 if (arg_start_mode == START_PID2) {
2604 r = stub_pid1();
2605 if (r < 0)
2606 return r;
2607 }
2608
03cfe0d5
LP
2609 /* Now, explicitly close the log, so that we
2610 * then can close all remaining fds. Closing
2611 * the log explicitly first has the benefit
2612 * that the logging subsystem knows about it,
2613 * and is thus ready to be reopened should we
2614 * need it again. Note that the other fds
2615 * closed here are at least the locking and
2616 * barrier fds. */
2617 log_close();
2618 (void) fdset_close_others(fds);
2619
7732f92b 2620 if (arg_start_mode == START_BOOT) {
03cfe0d5
LP
2621 char **a;
2622 size_t m;
2623
2624 /* Automatically search for the init system */
2625
75f32f04
ZJS
2626 m = strv_length(arg_parameters);
2627 a = newa(char*, m + 2);
2628 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2629 a[1 + m] = NULL;
03cfe0d5
LP
2630
2631 a[0] = (char*) "/usr/lib/systemd/systemd";
2632 execve(a[0], a, env_use);
2633
2634 a[0] = (char*) "/lib/systemd/systemd";
2635 execve(a[0], a, env_use);
2636
2637 a[0] = (char*) "/sbin/init";
2638 execve(a[0], a, env_use);
f757855e
LP
2639 } else if (!strv_isempty(arg_parameters))
2640 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2641 else {
5f932eb9
LP
2642 if (!arg_chdir)
2643 chdir(home ?: "/root");
2644
03cfe0d5
LP
2645 execle("/bin/bash", "-bash", NULL, env_use);
2646 execle("/bin/sh", "-sh", NULL, env_use);
2647 }
2648
35607a8d 2649 r = -errno;
03cfe0d5 2650 (void) log_open();
35607a8d 2651 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2652}
2653
2654static int outer_child(
2655 Barrier *barrier,
2656 const char *directory,
2657 const char *console,
2658 const char *root_device, bool root_device_rw,
2659 const char *home_device, bool home_device_rw,
2660 const char *srv_device, bool srv_device_rw,
2661 bool interactive,
2662 bool secondary,
2663 int pid_socket,
2664 int kmsg_socket,
2665 int rtnl_socket,
825d5287 2666 int uid_shift_socket,
f757855e 2667 FDSet *fds) {
03cfe0d5
LP
2668
2669 pid_t pid;
2670 ssize_t l;
2671 int r;
2672
2673 assert(barrier);
2674 assert(directory);
2675 assert(console);
2676 assert(pid_socket >= 0);
2677 assert(kmsg_socket >= 0);
2678
efdb0237
LP
2679 cg_unified_flush();
2680
03cfe0d5
LP
2681 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2682 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2683
2684 if (interactive) {
2685 close_nointr(STDIN_FILENO);
2686 close_nointr(STDOUT_FILENO);
2687 close_nointr(STDERR_FILENO);
2688
2689 r = open_terminal(console, O_RDWR);
2690 if (r != STDIN_FILENO) {
2691 if (r >= 0) {
2692 safe_close(r);
2693 r = -EINVAL;
2694 }
2695
2696 return log_error_errno(r, "Failed to open console: %m");
2697 }
2698
2699 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2700 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2701 return log_error_errno(errno, "Failed to duplicate console: %m");
2702 }
2703
2704 r = reset_audit_loginuid();
2705 if (r < 0)
2706 return r;
2707
2708 /* Mark everything as slave, so that we still
2709 * receive mounts from the real root, but don't
2710 * propagate mounts to the real root. */
2711 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2712 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2713
2714 r = mount_devices(directory,
2715 root_device, root_device_rw,
2716 home_device, home_device_rw,
2717 srv_device, srv_device_rw);
2718 if (r < 0)
2719 return r;
2720
391567f4
LP
2721 r = determine_uid_shift(directory);
2722 if (r < 0)
2723 return r;
2724
825d5287
RM
2725 if (arg_userns) {
2726 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2727 if (l < 0)
2728 return log_error_errno(errno, "Failed to send UID shift: %m");
2729 if (l != sizeof(arg_uid_shift)) {
2730 log_error("Short write while sending UID shift.");
2731 return -EIO;
2732 }
2733 }
2734
03cfe0d5
LP
2735 /* Turn directory into bind mount */
2736 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2737 return log_error_errno(errno, "Failed to make bind mount: %m");
2738
e83bebef 2739 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2740 if (r < 0)
2741 return r;
2742
e83bebef 2743 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2744 if (r < 0)
2745 return r;
2746
03cfe0d5
LP
2747 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2748 if (r < 0)
2749 return r;
2750
03cfe0d5
LP
2751 if (arg_read_only) {
2752 r = bind_remount_recursive(directory, true);
2753 if (r < 0)
2754 return log_error_errno(r, "Failed to make tree read-only: %m");
2755 }
2756
d1678248 2757 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2758 if (r < 0)
2759 return r;
2760
07fa00f9
LP
2761 r = copy_devnodes(directory);
2762 if (r < 0)
03cfe0d5
LP
2763 return r;
2764
2765 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2766
07fa00f9
LP
2767 r = setup_pts(directory);
2768 if (r < 0)
03cfe0d5
LP
2769 return r;
2770
2771 r = setup_propagate(directory);
2772 if (r < 0)
2773 return r;
2774
2775 r = setup_dev_console(directory, console);
2776 if (r < 0)
2777 return r;
2778
2779 r = setup_seccomp();
2780 if (r < 0)
2781 return r;
2782
2783 r = setup_timezone(directory);
2784 if (r < 0)
2785 return r;
2786
2787 r = setup_resolv_conf(directory);
2788 if (r < 0)
2789 return r;
2790
2791 r = setup_journal(directory);
2792 if (r < 0)
2793 return r;
2794
e83bebef 2795 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2796 if (r < 0)
2797 return r;
2798
e83bebef 2799 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2800 if (r < 0)
2801 return r;
2802
2803 r = mount_move_root(directory);
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to move root directory: %m");
2806
2807 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2808 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2809 (arg_private_network ? CLONE_NEWNET : 0) |
2810 (arg_userns ? CLONE_NEWUSER : 0),
2811 NULL);
2812 if (pid < 0)
2813 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2814 if (pid == 0) {
2815 pid_socket = safe_close(pid_socket);
825d5287 2816 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2817
2818 /* The inner child has all namespaces that are
2819 * requested, so that we all are owned by the user if
2820 * user namespaces are turned on. */
2821
f757855e 2822 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2823 if (r < 0)
2824 _exit(EXIT_FAILURE);
2825
2826 _exit(EXIT_SUCCESS);
2827 }
2828
2829 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2830 if (l < 0)
2831 return log_error_errno(errno, "Failed to send PID: %m");
2832 if (l != sizeof(pid)) {
2833 log_error("Short write while sending PID.");
2834 return -EIO;
2835 }
2836
2837 pid_socket = safe_close(pid_socket);
327e26d6
KN
2838 kmsg_socket = safe_close(kmsg_socket);
2839 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2840
2841 return 0;
2842}
2843
2844static int setup_uid_map(pid_t pid) {
2845 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2846 int r;
2847
2848 assert(pid > 1);
2849
2850 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2851 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2852 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2853 if (r < 0)
2854 return log_error_errno(r, "Failed to write UID map: %m");
2855
2856 /* We always assign the same UID and GID ranges */
2857 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2858 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2859 if (r < 0)
2860 return log_error_errno(r, "Failed to write GID map: %m");
2861
2862 return 0;
2863}
2864
f757855e
LP
2865static int load_settings(void) {
2866 _cleanup_(settings_freep) Settings *settings = NULL;
2867 _cleanup_fclose_ FILE *f = NULL;
2868 _cleanup_free_ char *p = NULL;
2869 const char *fn, *i;
2870 int r;
2871
2872 /* If all settings are masked, there's no point in looking for
2873 * the settings file */
2874 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2875 return 0;
2876
2877 fn = strjoina(arg_machine, ".nspawn");
2878
2879 /* We first look in the admin's directories in /etc and /run */
2880 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2881 _cleanup_free_ char *j = NULL;
2882
2883 j = strjoin(i, "/", fn, NULL);
2884 if (!j)
2885 return log_oom();
2886
2887 f = fopen(j, "re");
2888 if (f) {
2889 p = j;
2890 j = NULL;
2891
b938cb90 2892 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2893 if (arg_settings_trusted < 0)
2894 arg_settings_trusted = true;
2895
2896 break;
2897 }
2898
2899 if (errno != ENOENT)
2900 return log_error_errno(errno, "Failed to open %s: %m", j);
2901 }
2902
2903 if (!f) {
2904 /* After that, let's look for a file next to the
2905 * actual image we shall boot. */
2906
2907 if (arg_image) {
2908 p = file_in_same_dir(arg_image, fn);
2909 if (!p)
2910 return log_oom();
2911 } else if (arg_directory) {
2912 p = file_in_same_dir(arg_directory, fn);
2913 if (!p)
2914 return log_oom();
2915 }
2916
2917 if (p) {
2918 f = fopen(p, "re");
2919 if (!f && errno != ENOENT)
2920 return log_error_errno(errno, "Failed to open %s: %m", p);
2921
b938cb90 2922 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2923 if (arg_settings_trusted < 0)
2924 arg_settings_trusted = false;
2925 }
2926 }
2927
2928 if (!f)
2929 return 0;
2930
2931 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2932
2933 r = settings_load(f, p, &settings);
2934 if (r < 0)
2935 return r;
2936
2937 /* Copy over bits from the settings, unless they have been
2938 * explicitly masked by command line switches. */
2939
7732f92b
LP
2940 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2941 settings->start_mode >= 0) {
2942 arg_start_mode = settings->start_mode;
f757855e
LP
2943
2944 strv_free(arg_parameters);
2945 arg_parameters = settings->parameters;
2946 settings->parameters = NULL;
2947 }
2948
5f932eb9
LP
2949 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2950 settings->working_directory) {
2951 free(arg_chdir);
2952 arg_chdir = settings->working_directory;
2953 settings->working_directory = NULL;
2954 }
2955
f757855e
LP
2956 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2957 settings->environment) {
2958 strv_free(arg_setenv);
2959 arg_setenv = settings->environment;
2960 settings->environment = NULL;
2961 }
2962
2963 if ((arg_settings_mask & SETTING_USER) == 0 &&
2964 settings->user) {
2965 free(arg_user);
2966 arg_user = settings->user;
2967 settings->user = NULL;
2968 }
2969
2970 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2971 uint64_t plus;
f757855e 2972
0e265674
LP
2973 plus = settings->capability;
2974 if (settings_private_network(settings))
2975 plus |= (1ULL << CAP_NET_ADMIN);
2976
2977 if (!arg_settings_trusted && plus != 0) {
2978 if (settings->capability != 0)
2979 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2980 } else
2981 arg_retain |= plus;
f757855e
LP
2982
2983 arg_retain &= ~settings->drop_capability;
2984 }
2985
2986 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2987 settings->kill_signal > 0)
2988 arg_kill_signal = settings->kill_signal;
2989
2990 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2991 settings->personality != PERSONALITY_INVALID)
2992 arg_personality = settings->personality;
2993
2994 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2995 !sd_id128_is_null(settings->machine_id)) {
2996
2997 if (!arg_settings_trusted)
2998 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2999 else
3000 arg_uuid = settings->machine_id;
3001 }
3002
3003 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3004 settings->read_only >= 0)
3005 arg_read_only = settings->read_only;
3006
3007 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3008 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3009 arg_volatile_mode = settings->volatile_mode;
3010
3011 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3012 settings->n_custom_mounts > 0) {
3013
3014 if (!arg_settings_trusted)
3015 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3016 else {
3017 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3018 arg_custom_mounts = settings->custom_mounts;
3019 arg_n_custom_mounts = settings->n_custom_mounts;
3020
3021 settings->custom_mounts = NULL;
3022 settings->n_custom_mounts = 0;
3023 }
3024 }
3025
3026 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3027 (settings->private_network >= 0 ||
3028 settings->network_veth >= 0 ||
3029 settings->network_bridge ||
3030 settings->network_interfaces ||
3031 settings->network_macvlan ||
f6d6bad1
LP
3032 settings->network_ipvlan ||
3033 settings->network_veth_extra)) {
f757855e
LP
3034
3035 if (!arg_settings_trusted)
3036 log_warning("Ignoring network settings, file %s is not trusted.", p);
3037 else {
f6d6bad1 3038 arg_network_veth = settings_network_veth(settings);
0e265674
LP
3039 arg_private_network = settings_private_network(settings);
3040
f757855e
LP
3041 strv_free(arg_network_interfaces);
3042 arg_network_interfaces = settings->network_interfaces;
3043 settings->network_interfaces = NULL;
3044
3045 strv_free(arg_network_macvlan);
3046 arg_network_macvlan = settings->network_macvlan;
3047 settings->network_macvlan = NULL;
3048
3049 strv_free(arg_network_ipvlan);
3050 arg_network_ipvlan = settings->network_ipvlan;
3051 settings->network_ipvlan = NULL;
3052
f6d6bad1
LP
3053 strv_free(arg_network_veth_extra);
3054 arg_network_veth_extra = settings->network_veth_extra;
3055 settings->network_veth_extra = NULL;
3056
f757855e
LP
3057 free(arg_network_bridge);
3058 arg_network_bridge = settings->network_bridge;
3059 settings->network_bridge = NULL;
f757855e
LP
3060 }
3061 }
3062
3063 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3064 settings->expose_ports) {
3065
3066 if (!arg_settings_trusted)
3067 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3068 else {
3069 expose_port_free_all(arg_expose_ports);
3070 arg_expose_ports = settings->expose_ports;
3071 settings->expose_ports = NULL;
3072 }
3073 }
3074
3075 return 0;
3076}
3077
03cfe0d5
LP
3078int main(int argc, char *argv[]) {
3079
3080 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3081 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3082 _cleanup_close_ int master = -1, image_fd = -1;
3083 _cleanup_fdset_free_ FDSet *fds = NULL;
3084 int r, n_fd_passed, loop_nr = -1;
3085 char veth_name[IFNAMSIZ];
3086 bool secondary = false, remove_subvol = false;
72c0a2c2 3087 sigset_t mask_chld;
03cfe0d5
LP
3088 pid_t pid = 0;
3089 int ret = EXIT_SUCCESS;
3090 union in_addr_union exposed = {};
3091 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3092 bool interactive;
3093
3094 log_parse_environment();
3095 log_open();
3096
7732f92b
LP
3097 /* Make sure rename_process() in the stub init process can work */
3098 saved_argv = argv;
3099 saved_argc = argc;
3100
03cfe0d5
LP
3101 r = parse_argv(argc, argv);
3102 if (r <= 0)
3103 goto finish;
3104
03cfe0d5
LP
3105 if (geteuid() != 0) {
3106 log_error("Need to be root.");
3107 r = -EPERM;
3108 goto finish;
3109 }
f757855e
LP
3110 r = determine_names();
3111 if (r < 0)
3112 goto finish;
3113
3114 r = load_settings();
3115 if (r < 0)
3116 goto finish;
3117
3118 r = verify_arguments();
3119 if (r < 0)
3120 goto finish;
03cfe0d5
LP
3121
3122 n_fd_passed = sd_listen_fds(false);
3123 if (n_fd_passed > 0) {
3124 r = fdset_new_listen_fds(&fds, false);
3125 if (r < 0) {
3126 log_error_errno(r, "Failed to collect file descriptors: %m");
3127 goto finish;
3128 }
3129 }
3130
3131 if (arg_directory) {
3132 assert(!arg_image);
3133
3134 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3135 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3136 r = -EINVAL;
3137 goto finish;
3138 }
3139
3140 if (arg_ephemeral) {
3141 _cleanup_free_ char *np = NULL;
3142
3143 /* If the specified path is a mount point we
3144 * generate the new snapshot immediately
3145 * inside it under a random name. However if
3146 * the specified is not a mount point we
3147 * create the new snapshot in the parent
3148 * directory, just next to it. */
e26d6ce5 3149 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3150 if (r < 0) {
3151 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3152 goto finish;
3153 }
3154 if (r > 0)
770b5ce4 3155 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3156 else
770b5ce4 3157 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3158 if (r < 0) {
3159 log_error_errno(r, "Failed to generate name for snapshot: %m");
3160 goto finish;
3161 }
3162
3163 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3164 if (r < 0) {
3165 log_error_errno(r, "Failed to lock %s: %m", np);
3166 goto finish;
3167 }
3168
5bcd08db 3169 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3170 if (r < 0) {
3171 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3172 goto finish;
ec16945e
LP
3173 }
3174
3175 free(arg_directory);
3176 arg_directory = np;
8a16a7b4 3177 np = NULL;
ec16945e
LP
3178
3179 remove_subvol = true;
30535c16
LP
3180
3181 } else {
3182 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3183 if (r == -EBUSY) {
3184 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3185 goto finish;
3186 }
3187 if (r < 0) {
3188 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3189 return r;
3190 }
3191
3192 if (arg_template) {
5bcd08db 3193 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3194 if (r == -EEXIST) {
3195 if (!arg_quiet)
3196 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3197 } else if (r < 0) {
83521414 3198 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3199 goto finish;
3200 } else {
3201 if (!arg_quiet)
3202 log_info("Populated %s from template %s.", arg_directory, arg_template);
3203 }
3204 }
ec16945e
LP
3205 }
3206
7732f92b 3207 if (arg_start_mode == START_BOOT) {
1b9e5b12 3208 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3209 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3210 r = -EINVAL;
1b9e5b12
LP
3211 goto finish;
3212 }
3213 } else {
3214 const char *p;
3215
16fb773e
LP
3216 p = strjoina(arg_directory, "/usr/");
3217 if (laccess(p, F_OK) < 0) {
3218 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3219 r = -EINVAL;
1b9e5b12 3220 goto finish;
1b9e5b12
LP
3221 }
3222 }
ec16945e 3223
6b9132a9 3224 } else {
1b9e5b12 3225 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3226
ec16945e
LP
3227 assert(arg_image);
3228 assert(!arg_template);
3229
30535c16
LP
3230 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3231 if (r == -EBUSY) {
3232 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3233 goto finish;
3234 }
3235 if (r < 0) {
3236 r = log_error_errno(r, "Failed to create image lock: %m");
3237 goto finish;
3238 }
3239
1b9e5b12 3240 if (!mkdtemp(template)) {
56f64d95 3241 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3242 r = -errno;
6b9132a9 3243 goto finish;
1b9e5b12 3244 }
6b9132a9 3245
1b9e5b12
LP
3246 arg_directory = strdup(template);
3247 if (!arg_directory) {
3248 r = log_oom();
3249 goto finish;
6b9132a9 3250 }
88213476 3251
1b9e5b12
LP
3252 image_fd = setup_image(&device_path, &loop_nr);
3253 if (image_fd < 0) {
3254 r = image_fd;
842f3b0f
LP
3255 goto finish;
3256 }
1b9e5b12 3257
4d9f07b4
LP
3258 r = dissect_image(image_fd,
3259 &root_device, &root_device_rw,
3260 &home_device, &home_device_rw,
3261 &srv_device, &srv_device_rw,
3262 &secondary);
1b9e5b12
LP
3263 if (r < 0)
3264 goto finish;
842f3b0f 3265 }
842f3b0f 3266
5a8af538
LP
3267 r = custom_mounts_prepare();
3268 if (r < 0)
3269 goto finish;
3270
03cfe0d5
LP
3271 interactive =
3272 isatty(STDIN_FILENO) > 0 &&
3273 isatty(STDOUT_FILENO) > 0;
9c857b9d 3274
db7feb7e
LP
3275 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3276 if (master < 0) {
ec16945e 3277 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3278 goto finish;
3279 }
3280
611b312b
LP
3281 r = ptsname_malloc(master, &console);
3282 if (r < 0) {
3283 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3284 goto finish;
3285 }
3286
a258bf26 3287 if (unlockpt(master) < 0) {
ec16945e 3288 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3289 goto finish;
3290 }
3291
9c857b9d
LP
3292 if (!arg_quiet)
3293 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3294 arg_machine, arg_image ?: arg_directory);
3295
72c0a2c2 3296 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3297
023fb90b
LP
3298 assert_se(sigemptyset(&mask_chld) == 0);
3299 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3300
03cfe0d5
LP
3301 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3302 r = log_error_errno(errno, "Failed to become subreaper: %m");
3303 goto finish;
3304 }
3305
d87be9b0 3306 for (;;) {
97044145 3307 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3308 ContainerStatus container_status;
7566e267 3309 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3310 static const struct sigaction sa = {
189d5bac 3311 .sa_handler = nop_signal_handler,
e866af3a
DH
3312 .sa_flags = SA_NOCLDSTOP,
3313 };
03cfe0d5
LP
3314 int ifi = 0;
3315 ssize_t l;
4afd3348 3316 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3317 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3318 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
dbb60d69 3319 char last_char = 0;
e866af3a 3320
7566e267 3321 r = barrier_create(&barrier);
a2da110b 3322 if (r < 0) {
da927ba9 3323 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3324 goto finish;
3325 }
3326
4610de50 3327 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3328 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3329 goto finish;
3330 }
3331
4610de50 3332 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3333 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3334 goto finish;
3335 }
3336
4610de50 3337 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3338 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3339 goto finish;
3340 }
3341
825d5287 3342 if (arg_userns)
4610de50 3343 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3344 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3345 goto finish;
3346 }
3347
e866af3a
DH
3348 /* Child can be killed before execv(), so handle SIGCHLD
3349 * in order to interrupt parent's blocking calls and
3350 * give it a chance to call wait() and terminate. */
3351 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3352 if (r < 0) {
ec16945e 3353 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3354 goto finish;
3355 }
3356
e866af3a
DH
3357 r = sigaction(SIGCHLD, &sa, NULL);
3358 if (r < 0) {
ec16945e 3359 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3360 goto finish;
3361 }
3362
03cfe0d5 3363 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3364 if (pid < 0) {
3365 if (errno == EINVAL)
ec16945e 3366 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3367 else
ec16945e 3368 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3369
d87be9b0
LP
3370 goto finish;
3371 }
a258bf26 3372
d87be9b0 3373 if (pid == 0) {
03cfe0d5 3374 /* The outer child only has a file system namespace. */
a2da110b
DH
3375 barrier_set_role(&barrier, BARRIER_CHILD);
3376
03e334a1 3377 master = safe_close(master);
a258bf26 3378
03e334a1 3379 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3380 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3381 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3382 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3383
ce30c8dc
LP
3384 (void) reset_all_signal_handlers();
3385 (void) reset_signal_mask();
f5c1b9ee 3386
03cfe0d5
LP
3387 r = outer_child(&barrier,
3388 arg_directory,
3389 console,
3390 root_device, root_device_rw,
3391 home_device, home_device_rw,
3392 srv_device, srv_device_rw,
3393 interactive,
3394 secondary,
3395 pid_socket_pair[1],
3396 kmsg_socket_pair[1],
3397 rtnl_socket_pair[1],
825d5287 3398 uid_shift_socket_pair[1],
f757855e 3399 fds);
0cb9fbcd 3400 if (r < 0)
a2da110b 3401 _exit(EXIT_FAILURE);
d87be9b0 3402
03cfe0d5 3403 _exit(EXIT_SUCCESS);
da5b3bad 3404 }
88213476 3405
a2da110b 3406 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3407
2feceb5e 3408 fds = fdset_free(fds);
842f3b0f 3409
6d0b55c2
LP
3410 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3411 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3412 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3413 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3414
03cfe0d5
LP
3415 /* Wait for the outer child. */
3416 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3417 if (r < 0)
3418 goto finish;
3419 if (r != 0) {
3420 r = -EIO;
3421 goto finish;
3422 }
3423 pid = 0;
6dac160c 3424
03cfe0d5
LP
3425 /* And now retrieve the PID of the inner child. */
3426 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3427 if (l < 0) {
3428 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3429 goto finish;
3430 }
3431 if (l != sizeof(pid)) {
76d44882 3432 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3433 r = EIO;
3434 goto finish;
3435 }
354bfd2b 3436
03cfe0d5 3437 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3438
03cfe0d5
LP
3439 if (arg_userns) {
3440 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3441 log_error("Child died too early.");
3442 r = -ESRCH;
840295fc 3443 goto finish;
03cfe0d5 3444 }
ab046dde 3445
825d5287
RM
3446 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3447 if (l < 0) {
3448 r = log_error_errno(errno, "Failed to read UID shift: %m");
3449 goto finish;
3450 }
3451 if (l != sizeof(arg_uid_shift)) {
76d44882 3452 log_error("Short read while reading UID shift.");
825d5287
RM
3453 r = EIO;
3454 goto finish;
3455 }
3456
03cfe0d5 3457 r = setup_uid_map(pid);
840295fc
LP
3458 if (r < 0)
3459 goto finish;
ab046dde 3460
03cfe0d5
LP
3461 (void) barrier_place(&barrier); /* #2 */
3462 }
c74e630d 3463
9a2a5625 3464 if (arg_private_network) {
4bbfe7ad 3465
9a2a5625
LP
3466 r = move_network_interfaces(pid, arg_network_interfaces);
3467 if (r < 0)
3468 goto finish;
5aa4bb6b 3469
9a2a5625
LP
3470 if (arg_network_veth) {
3471 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3472 if (r < 0)
3473 goto finish;
3474 else if (r > 0)
3475 ifi = r;
6dac160c 3476
9a2a5625
LP
3477 if (arg_network_bridge) {
3478 r = setup_bridge(veth_name, arg_network_bridge);
3479 if (r < 0)
3480 goto finish;
3481 if (r > 0)
3482 ifi = r;
3483 }
3484 }
6dac160c 3485
f6d6bad1
LP
3486 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3487 if (r < 0)
3488 goto finish;
3489
9a2a5625
LP
3490 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3491 if (r < 0)
3492 goto finish;
3493
3494 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3495 if (r < 0)
3496 goto finish;
3497 }
6dac160c 3498
b7103bc5
LP
3499 if (arg_register) {
3500 r = register_machine(
3501 arg_machine,
3502 pid,
3503 arg_directory,
3504 arg_uuid,
3505 ifi,
3506 arg_slice,
3507 arg_custom_mounts, arg_n_custom_mounts,
3508 arg_kill_signal,
3509 arg_property,
6aadfa4c
ILG
3510 arg_keep_unit,
3511 arg_container_service_name);
b7103bc5
LP
3512 if (r < 0)
3513 goto finish;
3514 }
6dac160c 3515
34829a32 3516 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3517 if (r < 0)
3518 goto finish;
3519
34829a32
LP
3520 if (arg_keep_unit) {
3521 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3522 if (r < 0)
3523 goto finish;
3524 }
efdb0237 3525
34829a32 3526 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3527 if (r < 0)
3528 goto finish;
6dac160c 3529
03cfe0d5
LP
3530 /* Notify the child that the parent is ready with all
3531 * its setup (including cgroup-ification), and that
3532 * the child can now hand over control to the code to
3533 * run inside the container. */
3534 (void) barrier_place(&barrier); /* #3 */
6dac160c 3535
03cfe0d5
LP
3536 /* Block SIGCHLD here, before notifying child.
3537 * process_pty() will handle it with the other signals. */
3538 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3539
03cfe0d5
LP
3540 /* Reset signal to default */
3541 r = default_signals(SIGCHLD, -1);
3542 if (r < 0) {
3543 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3544 goto finish;
3545 }
e866af3a 3546
03cfe0d5 3547 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3548 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3549 log_error("Child died too early.");
03cfe0d5
LP
3550 r = -ESRCH;
3551 goto finish;
3552 }
b12afc8c 3553
03cfe0d5
LP
3554 sd_notifyf(false,
3555 "READY=1\n"
3556 "STATUS=Container running.\n"
3557 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3558
03cfe0d5
LP
3559 r = sd_event_new(&event);
3560 if (r < 0) {
3561 log_error_errno(r, "Failed to get default event source: %m");
3562 goto finish;
3563 }
88213476 3564
03cfe0d5
LP
3565 if (arg_kill_signal > 0) {
3566 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
3567 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3568 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
3569 } else {
3570 /* Immediately exit */
3571 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3572 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3573 }
023fb90b 3574
03cfe0d5
LP
3575 /* simply exit on sigchld */
3576 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3577
03cfe0d5 3578 if (arg_expose_ports) {
7a8f6325 3579 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3580 if (r < 0)
3581 goto finish;
023fb90b 3582
7a8f6325 3583 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3584 }
023fb90b 3585
03cfe0d5 3586 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3587
ae3dde80 3588 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3589 if (r < 0) {
3590 log_error_errno(r, "Failed to create PTY forwarder: %m");
3591 goto finish;
3592 }
023fb90b 3593
03cfe0d5
LP
3594 r = sd_event_loop(event);
3595 if (r < 0) {
3596 log_error_errno(r, "Failed to run event loop: %m");
3597 goto finish;
3598 }
6d0b55c2 3599
03cfe0d5 3600 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3601
03cfe0d5 3602 forward = pty_forward_free(forward);
6d0b55c2 3603
03cfe0d5
LP
3604 if (!arg_quiet && last_char != '\n')
3605 putc('\n', stdout);
04d39279 3606
03cfe0d5 3607 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3608 if (arg_register && !arg_keep_unit)
3609 terminate_machine(pid);
1f0cd86b 3610
840295fc 3611 /* Normally redundant, but better safe than sorry */
04d39279 3612 kill(pid, SIGKILL);
a258bf26 3613
113cea80 3614 r = wait_for_container(pid, &container_status);
04d39279
LP
3615 pid = 0;
3616
ec16945e 3617 if (r < 0)
ce9f1527
LP
3618 /* We failed to wait for the container, or the
3619 * container exited abnormally */
ec16945e 3620 goto finish;
9ed794a3 3621 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
ce9f1527
LP
3622 /* The container exited with a non-zero
3623 * status, or with zero status and no reboot
3624 * was requested. */
ec16945e 3625 ret = r;
d87be9b0 3626 break;
ec16945e 3627 }
88213476 3628
113cea80 3629 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3630
3631 if (arg_keep_unit) {
3632 /* Special handling if we are running as a
3633 * service: instead of simply restarting the
3634 * machine we want to restart the entire
3635 * service, so let's inform systemd about this
3636 * with the special exit code 133. The service
3637 * file uses RestartForceExitStatus=133 so
3638 * that this results in a full nspawn
3639 * restart. This is necessary since we might
3640 * have cgroup parameters set we want to have
3641 * flushed out. */
ec16945e
LP
3642 ret = 133;
3643 r = 0;
ce38dbc8
LP
3644 break;
3645 }
6d0b55c2 3646
7a8f6325 3647 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3648 }
88213476
LP
3649
3650finish:
af4ec430
LP
3651 sd_notify(false,
3652 "STOPPING=1\n"
3653 "STATUS=Terminating...");
3654
9444b1f2
LP
3655 if (pid > 0)
3656 kill(pid, SIGKILL);
88213476 3657
503546da
LP
3658 /* Try to flush whatever is still queued in the pty */
3659 if (master >= 0)
59f448cf 3660 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3661
03cfe0d5
LP
3662 loop_remove(loop_nr, &image_fd);
3663
ec16945e
LP
3664 if (remove_subvol && arg_directory) {
3665 int k;
3666
5bcd08db 3667 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3668 if (k < 0)
3669 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3670 }
3671
785890ac
LP
3672 if (arg_machine) {
3673 const char *p;
3674
63c372cb 3675 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3676 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3677 }
3678
7a8f6325 3679 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3680
04d391da 3681 free(arg_directory);
ec16945e
LP
3682 free(arg_template);
3683 free(arg_image);
7027ff61 3684 free(arg_machine);
c74e630d 3685 free(arg_user);
5f932eb9 3686 free(arg_chdir);
c74e630d 3687 strv_free(arg_setenv);
f757855e 3688 free(arg_network_bridge);
c74e630d
LP
3689 strv_free(arg_network_interfaces);
3690 strv_free(arg_network_macvlan);
4bbfe7ad 3691 strv_free(arg_network_ipvlan);
f6d6bad1 3692 strv_free(arg_network_veth_extra);
f757855e
LP
3693 strv_free(arg_parameters);
3694 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3695 expose_port_free_all(arg_expose_ports);
6d0b55c2 3696
ec16945e 3697 return r < 0 ? EXIT_FAILURE : ret;
88213476 3698}