]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
update TODO
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <net/if.h>
69c79d3c 44#include <linux/veth.h>
6afc95b7 45#include <sys/personality.h>
1b9e5b12 46#include <linux/loop.h>
aa28aefe 47
5d63309c 48#ifdef HAVE_SELINUX
a8828ed9
DW
49#include <selinux/selinux.h>
50#endif
88213476 51
24fb1112
LP
52#ifdef HAVE_SECCOMP
53#include <seccomp.h>
54#endif
55
1b9e5b12
LP
56#ifdef HAVE_BLKID
57#include <blkid/blkid.h>
58#endif
59
1f0cd86b
LP
60#include "sd-daemon.h"
61#include "sd-bus.h"
62#include "sd-id128.h"
aa28aefe 63#include "sd-rtnl.h"
88213476
LP
64#include "log.h"
65#include "util.h"
49e942b2 66#include "mkdir.h"
6b2d0e85 67#include "macro.h"
d7832d2c 68#include "audit.h"
94d82985 69#include "missing.h"
04d391da 70#include "cgroup-util.h"
a258bf26 71#include "strv.h"
9eb977db 72#include "path-util.h"
a41fe3a2 73#include "loopback-setup.h"
4fc9982c 74#include "dev-setup.h"
842f3b0f 75#include "fdset.h"
acbeb427 76#include "build.h"
a5c32cff 77#include "fileio.h"
40ca29a1 78#include "bus-util.h"
1f0cd86b 79#include "bus-error.h"
4ba93280 80#include "ptyfwd.h"
9bd37b40 81#include "bus-kernel.h"
f4889f65 82#include "env-util.h"
7f112f50 83#include "def.h"
aa28aefe 84#include "rtnl-util.h"
7e227024 85#include "udev-util.h"
1b9e5b12
LP
86#include "blkid-util.h"
87#include "gpt.h"
01dde061 88#include "siphash24.h"
849958d1 89#include "copy.h"
3577de7a 90#include "base-filesystem.h"
a2da110b 91#include "barrier.h"
023fb90b 92#include "event-util.h"
f2d88580 93
e9642be2
LP
94#ifdef HAVE_SECCOMP
95#include "seccomp-util.h"
96#endif
97
113cea80
DH
/* Final state of a container run.
 * NOTE(review): the consumers of this enum are outside this part of the
 * file; judging by the names it distinguishes a plain exit from a reboot
 * request -- confirm against the code that waits for the container. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* = 0 */
        CONTAINER_REBOOTED      /* = 1 */
} ContainerStatus;
102
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal=MODE (or -j).
 * The "try-guest"/"try-host" spellings map to LINK_GUEST/LINK_HOST plus
 * the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest / -j */
} LinkJournal;
88213476 109
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* regular, persistent root */
        VOLATILE_YES,   /* tmpfs as root, original /usr bind mounted read-only into it */
        VOLATILE_STATE, /* root remounted read-only, tmpfs mounted over /var */
} Volatile;
115
88213476 116static char *arg_directory = NULL;
687d0825 117static char *arg_user = NULL;
9444b1f2 118static sd_id128_t arg_uuid = {};
7027ff61 119static char *arg_machine = NULL;
c74e630d
LP
120static const char *arg_selinux_context = NULL;
121static const char *arg_selinux_apifs_context = NULL;
9444b1f2 122static const char *arg_slice = NULL;
ff01d048 123static bool arg_private_network = false;
bc2f673e 124static bool arg_read_only = false;
0f0dbc46 125static bool arg_boot = false;
57fb9fb5 126static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 127static bool arg_link_journal_try = false;
5076f0cc
LP
128static uint64_t arg_retain =
129 (1ULL << CAP_CHOWN) |
130 (1ULL << CAP_DAC_OVERRIDE) |
131 (1ULL << CAP_DAC_READ_SEARCH) |
132 (1ULL << CAP_FOWNER) |
133 (1ULL << CAP_FSETID) |
134 (1ULL << CAP_IPC_OWNER) |
135 (1ULL << CAP_KILL) |
136 (1ULL << CAP_LEASE) |
137 (1ULL << CAP_LINUX_IMMUTABLE) |
138 (1ULL << CAP_NET_BIND_SERVICE) |
139 (1ULL << CAP_NET_BROADCAST) |
140 (1ULL << CAP_NET_RAW) |
141 (1ULL << CAP_SETGID) |
142 (1ULL << CAP_SETFCAP) |
143 (1ULL << CAP_SETPCAP) |
144 (1ULL << CAP_SETUID) |
145 (1ULL << CAP_SYS_ADMIN) |
146 (1ULL << CAP_SYS_CHROOT) |
147 (1ULL << CAP_SYS_NICE) |
148 (1ULL << CAP_SYS_PTRACE) |
149 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 150 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
151 (1ULL << CAP_SYS_BOOT) |
152 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
153 (1ULL << CAP_AUDIT_CONTROL) |
154 (1ULL << CAP_MKNOD);
17fe0523
LP
155static char **arg_bind = NULL;
156static char **arg_bind_ro = NULL;
06c17c39 157static char **arg_tmpfs = NULL;
f4889f65 158static char **arg_setenv = NULL;
284c0b91 159static bool arg_quiet = false;
8a96d94e 160static bool arg_share_system = false;
eb91eb18 161static bool arg_register = true;
89f7c846 162static bool arg_keep_unit = false;
aa28aefe 163static char **arg_network_interfaces = NULL;
c74e630d 164static char **arg_network_macvlan = NULL;
69c79d3c 165static bool arg_network_veth = false;
c74e630d 166static const char *arg_network_bridge = NULL;
6afc95b7 167static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 168static const char *arg_image = NULL;
4d9f07b4 169static Volatile arg_volatile = VOLATILE_NO;
88213476 170
601185b4 171static void help(void) {
88213476
LP
172 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
174 " -h --help Show this help\n"
175 " --version Print version string\n"
69c79d3c 176 " -q --quiet Do not show status information\n"
1b9e5b12
LP
177 " -D --directory=PATH Root directory for the container\n"
178 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
179 " -b --boot Boot up full system (i.e. invoke init)\n"
180 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 181 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 182 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 183 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
184 " --private-network Disable network in container\n"
185 " --network-interface=INTERFACE\n"
186 " Assign an existing network interface to the\n"
187 " container\n"
c74e630d
LP
188 " --network-macvlan=INTERFACE\n"
189 " Create a macvlan network interface based on an\n"
190 " existing network interface to the container\n"
32457153 191 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 192 " and container\n"
ab046dde 193 " --network-bridge=INTERFACE\n"
32457153 194 " Add a virtual ethernet connection between host\n"
ab046dde
TG
195 " and container and add it to an existing bridge on\n"
196 " the host\n"
82adf6af
LP
197 " -Z --selinux-context=SECLABEL\n"
198 " Set the SELinux security context to be used by\n"
199 " processes in the container\n"
200 " -L --selinux-apifs-context=SECLABEL\n"
201 " Set the SELinux security context to be used by\n"
202 " API/tmpfs file systems in the container\n"
a8828ed9
DW
203 " --capability=CAP In addition to the default, retain specified\n"
204 " capability\n"
205 " --drop-capability=CAP Drop the specified capability from the default set\n"
574edc90
MP
206 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
207 " try-guest, try-host\n"
208 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 209 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
210 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
211 " the container\n"
212 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 213 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 214 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 215 " --share-system Share system namespaces with host\n"
eb91eb18 216 " --register=BOOLEAN Register container as machine\n"
89f7c846 217 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4
LP
218 " the service unit nspawn is running in\n"
219 " --volatile[=MODE] Run the system in volatile mode\n",
88213476 220 program_invocation_short_name);
88213476
LP
221}
222
223static int parse_argv(int argc, char *argv[]) {
224
a41fe3a2 225 enum {
acbeb427
ZJS
226 ARG_VERSION = 0x100,
227 ARG_PRIVATE_NETWORK,
bc2f673e 228 ARG_UUID,
5076f0cc 229 ARG_READ_ONLY,
57fb9fb5 230 ARG_CAPABILITY,
420c7379 231 ARG_DROP_CAPABILITY,
17fe0523
LP
232 ARG_LINK_JOURNAL,
233 ARG_BIND,
f4889f65 234 ARG_BIND_RO,
06c17c39 235 ARG_TMPFS,
f4889f65 236 ARG_SETENV,
eb91eb18 237 ARG_SHARE_SYSTEM,
89f7c846 238 ARG_REGISTER,
aa28aefe 239 ARG_KEEP_UNIT,
69c79d3c 240 ARG_NETWORK_INTERFACE,
c74e630d 241 ARG_NETWORK_MACVLAN,
69c79d3c 242 ARG_NETWORK_VETH,
ab046dde 243 ARG_NETWORK_BRIDGE,
6afc95b7 244 ARG_PERSONALITY,
4d9f07b4 245 ARG_VOLATILE,
a41fe3a2
LP
246 };
247
88213476 248 static const struct option options[] = {
aa28aefe
LP
249 { "help", no_argument, NULL, 'h' },
250 { "version", no_argument, NULL, ARG_VERSION },
251 { "directory", required_argument, NULL, 'D' },
252 { "user", required_argument, NULL, 'u' },
253 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
254 { "boot", no_argument, NULL, 'b' },
255 { "uuid", required_argument, NULL, ARG_UUID },
256 { "read-only", no_argument, NULL, ARG_READ_ONLY },
257 { "capability", required_argument, NULL, ARG_CAPABILITY },
258 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
259 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
260 { "bind", required_argument, NULL, ARG_BIND },
261 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 262 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
263 { "machine", required_argument, NULL, 'M' },
264 { "slice", required_argument, NULL, 'S' },
265 { "setenv", required_argument, NULL, ARG_SETENV },
266 { "selinux-context", required_argument, NULL, 'Z' },
267 { "selinux-apifs-context", required_argument, NULL, 'L' },
268 { "quiet", no_argument, NULL, 'q' },
269 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
270 { "register", required_argument, NULL, ARG_REGISTER },
271 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
272 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 273 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
274 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
275 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 276 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 277 { "image", required_argument, NULL, 'i' },
4d9f07b4 278 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 279 {}
88213476
LP
280 };
281
9444b1f2 282 int c, r;
a42c8b54 283 uint64_t plus = 0, minus = 0;
88213476
LP
284
285 assert(argc >= 0);
286 assert(argv);
287
601185b4 288 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
88213476
LP
289
290 switch (c) {
291
292 case 'h':
601185b4
ZJS
293 help();
294 return 0;
88213476 295
acbeb427
ZJS
296 case ARG_VERSION:
297 puts(PACKAGE_STRING);
298 puts(SYSTEMD_FEATURES);
299 return 0;
300
88213476
LP
301 case 'D':
302 free(arg_directory);
3a74cea5
LP
303 arg_directory = canonicalize_file_name(optarg);
304 if (!arg_directory) {
56f64d95 305 log_error_errno(errno, "Invalid root directory: %m");
88213476
LP
306 return -ENOMEM;
307 }
308
309 break;
310
1b9e5b12
LP
311 case 'i':
312 arg_image = optarg;
313 break;
314
687d0825
MV
315 case 'u':
316 free(arg_user);
7027ff61
LP
317 arg_user = strdup(optarg);
318 if (!arg_user)
319 return log_oom();
687d0825
MV
320
321 break;
322
ab046dde 323 case ARG_NETWORK_BRIDGE:
c74e630d 324 arg_network_bridge = optarg;
ab046dde
TG
325
326 /* fall through */
327
69c79d3c
LP
328 case ARG_NETWORK_VETH:
329 arg_network_veth = true;
330 arg_private_network = true;
331 break;
332
aa28aefe 333 case ARG_NETWORK_INTERFACE:
c74e630d
LP
334 if (strv_extend(&arg_network_interfaces, optarg) < 0)
335 return log_oom();
336
337 arg_private_network = true;
338 break;
339
340 case ARG_NETWORK_MACVLAN:
341 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
342 return log_oom();
343
344 /* fall through */
345
ff01d048
LP
346 case ARG_PRIVATE_NETWORK:
347 arg_private_network = true;
a41fe3a2
LP
348 break;
349
0f0dbc46
LP
350 case 'b':
351 arg_boot = true;
352 break;
353
144f0fc0 354 case ARG_UUID:
9444b1f2
LP
355 r = sd_id128_from_string(optarg, &arg_uuid);
356 if (r < 0) {
aa96c6cb 357 log_error("Invalid UUID: %s", optarg);
9444b1f2 358 return r;
aa96c6cb 359 }
9444b1f2 360 break;
aa96c6cb 361
9444b1f2 362 case 'S':
c74e630d 363 arg_slice = optarg;
144f0fc0
LP
364 break;
365
7027ff61 366 case 'M':
eb91eb18
LP
367 if (isempty(optarg)) {
368 free(arg_machine);
369 arg_machine = NULL;
370 } else {
7027ff61 371
eb91eb18
LP
372 if (!hostname_is_valid(optarg)) {
373 log_error("Invalid machine name: %s", optarg);
374 return -EINVAL;
375 }
7027ff61 376
eb91eb18
LP
377 free(arg_machine);
378 arg_machine = strdup(optarg);
379 if (!arg_machine)
380 return log_oom();
381
382 break;
383 }
7027ff61 384
82adf6af
LP
385 case 'Z':
386 arg_selinux_context = optarg;
a8828ed9
DW
387 break;
388
82adf6af
LP
389 case 'L':
390 arg_selinux_apifs_context = optarg;
a8828ed9
DW
391 break;
392
bc2f673e
LP
393 case ARG_READ_ONLY:
394 arg_read_only = true;
395 break;
396
420c7379
LP
397 case ARG_CAPABILITY:
398 case ARG_DROP_CAPABILITY: {
a2a5291b 399 const char *state, *word;
5076f0cc
LP
400 size_t length;
401
402 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 403 _cleanup_free_ char *t;
5076f0cc 404 cap_value_t cap;
5076f0cc
LP
405
406 t = strndup(word, length);
0d0f0c50
SL
407 if (!t)
408 return log_oom();
5076f0cc 409
39ed67d1
LP
410 if (streq(t, "all")) {
411 if (c == ARG_CAPABILITY)
a42c8b54 412 plus = (uint64_t) -1;
39ed67d1 413 else
a42c8b54 414 minus = (uint64_t) -1;
39ed67d1
LP
415 } else {
416 if (cap_from_name(t, &cap) < 0) {
417 log_error("Failed to parse capability %s.", t);
418 return -EINVAL;
419 }
420
421 if (c == ARG_CAPABILITY)
a42c8b54 422 plus |= 1ULL << (uint64_t) cap;
39ed67d1 423 else
a42c8b54 424 minus |= 1ULL << (uint64_t) cap;
5076f0cc 425 }
5076f0cc
LP
426 }
427
428 break;
429 }
430
57fb9fb5
LP
431 case 'j':
432 arg_link_journal = LINK_GUEST;
574edc90 433 arg_link_journal_try = true;
57fb9fb5
LP
434 break;
435
436 case ARG_LINK_JOURNAL:
437 if (streq(optarg, "auto"))
438 arg_link_journal = LINK_AUTO;
439 else if (streq(optarg, "no"))
440 arg_link_journal = LINK_NO;
441 else if (streq(optarg, "guest"))
442 arg_link_journal = LINK_GUEST;
443 else if (streq(optarg, "host"))
444 arg_link_journal = LINK_HOST;
574edc90
MP
445 else if (streq(optarg, "try-guest")) {
446 arg_link_journal = LINK_GUEST;
447 arg_link_journal_try = true;
448 } else if (streq(optarg, "try-host")) {
449 arg_link_journal = LINK_HOST;
450 arg_link_journal_try = true;
451 } else {
57fb9fb5
LP
452 log_error("Failed to parse link journal mode %s", optarg);
453 return -EINVAL;
454 }
455
456 break;
457
17fe0523
LP
458 case ARG_BIND:
459 case ARG_BIND_RO: {
460 _cleanup_free_ char *a = NULL, *b = NULL;
461 char *e;
462 char ***x;
17fe0523
LP
463
464 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466 e = strchr(optarg, ':');
467 if (e) {
468 a = strndup(optarg, e - optarg);
469 b = strdup(e + 1);
470 } else {
471 a = strdup(optarg);
472 b = strdup(optarg);
473 }
474
475 if (!a || !b)
476 return log_oom();
477
478 if (!path_is_absolute(a) || !path_is_absolute(b)) {
479 log_error("Invalid bind mount specification: %s", optarg);
480 return -EINVAL;
481 }
482
483 r = strv_extend(x, a);
484 if (r < 0)
b3451bed 485 return log_oom();
17fe0523
LP
486
487 r = strv_extend(x, b);
488 if (r < 0)
b3451bed 489 return log_oom();
17fe0523
LP
490
491 break;
492 }
493
06c17c39
LP
494 case ARG_TMPFS: {
495 _cleanup_free_ char *a = NULL, *b = NULL;
496 char *e;
497
498 e = strchr(optarg, ':');
499 if (e) {
500 a = strndup(optarg, e - optarg);
501 b = strdup(e + 1);
502 } else {
503 a = strdup(optarg);
504 b = strdup("mode=0755");
505 }
506
507 if (!a || !b)
508 return log_oom();
509
510 if (!path_is_absolute(a)) {
511 log_error("Invalid tmpfs specification: %s", optarg);
512 return -EINVAL;
513 }
514
515 r = strv_push(&arg_tmpfs, a);
516 if (r < 0)
517 return log_oom();
518
519 a = NULL;
520
521 r = strv_push(&arg_tmpfs, b);
522 if (r < 0)
523 return log_oom();
524
525 b = NULL;
526
527 break;
528 }
529
f4889f65
LP
530 case ARG_SETENV: {
531 char **n;
532
533 if (!env_assignment_is_valid(optarg)) {
534 log_error("Environment variable assignment '%s' is not valid.", optarg);
535 return -EINVAL;
536 }
537
538 n = strv_env_set(arg_setenv, optarg);
539 if (!n)
540 return log_oom();
541
542 strv_free(arg_setenv);
543 arg_setenv = n;
544 break;
545 }
546
284c0b91
LP
547 case 'q':
548 arg_quiet = true;
549 break;
550
8a96d94e
LP
551 case ARG_SHARE_SYSTEM:
552 arg_share_system = true;
553 break;
554
eb91eb18
LP
555 case ARG_REGISTER:
556 r = parse_boolean(optarg);
557 if (r < 0) {
558 log_error("Failed to parse --register= argument: %s", optarg);
559 return r;
560 }
561
562 arg_register = r;
563 break;
564
89f7c846
LP
565 case ARG_KEEP_UNIT:
566 arg_keep_unit = true;
567 break;
568
6afc95b7
LP
569 case ARG_PERSONALITY:
570
ac45f971 571 arg_personality = personality_from_string(optarg);
6afc95b7
LP
572 if (arg_personality == 0xffffffffLU) {
573 log_error("Unknown or unsupported personality '%s'.", optarg);
574 return -EINVAL;
575 }
576
577 break;
578
4d9f07b4
LP
579 case ARG_VOLATILE:
580
581 if (!optarg)
582 arg_volatile = VOLATILE_YES;
583 else {
584 r = parse_boolean(optarg);
585 if (r < 0) {
586 if (streq(optarg, "state"))
587 arg_volatile = VOLATILE_STATE;
588 else {
589 log_error("Failed to parse --volatile= argument: %s", optarg);
590 return r;
591 }
592 } else
593 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594 }
595
596 break;
597
88213476
LP
598 case '?':
599 return -EINVAL;
600
601 default:
eb9da376 602 assert_not_reached("Unhandled option");
88213476 603 }
88213476 604
eb91eb18
LP
605 if (arg_share_system)
606 arg_register = false;
607
608 if (arg_boot && arg_share_system) {
609 log_error("--boot and --share-system may not be combined.");
610 return -EINVAL;
611 }
612
89f7c846
LP
613 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614 log_error("--keep-unit may not be used when invoked from a user session.");
615 return -EINVAL;
616 }
617
1b9e5b12
LP
618 if (arg_directory && arg_image) {
619 log_error("--directory= and --image= may not be combined.");
620 return -EINVAL;
621 }
622
4d9f07b4
LP
623 if (arg_volatile != VOLATILE_NO && arg_read_only) {
624 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625 return -EINVAL;
626 }
627
a42c8b54
LP
628 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
88213476
LP
630 return 1;
631}
632
633static int mount_all(const char *dest) {
634
635 typedef struct MountPoint {
636 const char *what;
637 const char *where;
638 const char *type;
639 const char *options;
640 unsigned long flags;
3bd66c05 641 bool fatal;
88213476
LP
642 } MountPoint;
643
644 static const MountPoint mount_table[] = {
06c17c39
LP
645 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
646 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
647 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
648 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
649 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 650 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
651 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
652 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 653#ifdef HAVE_SELINUX
06c17c39
LP
654 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
655 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 656#endif
88213476
LP
657 };
658
659 unsigned k;
660 int r = 0;
661
662 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 663 _cleanup_free_ char *where = NULL;
d002827b 664#ifdef HAVE_SELINUX
a8828ed9 665 _cleanup_free_ char *options = NULL;
d002827b
LP
666#endif
667 const char *o;
88213476
LP
668 int t;
669
17fe0523
LP
670 where = strjoin(dest, "/", mount_table[k].where, NULL);
671 if (!where)
672 return log_oom();
88213476 673
e65aec12 674 t = path_is_mount_point(where, true);
68fb0892 675 if (t < 0) {
da927ba9 676 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
677
678 if (r == 0)
679 r = t;
680
681 continue;
682 }
683
9c1c7f71
LP
684 /* Skip this entry if it is not a remount. */
685 if (mount_table[k].what && t > 0)
014a9c77
LP
686 continue;
687
79d80fc1
TG
688 t = mkdir_p(where, 0755);
689 if (t < 0) {
690 if (mount_table[k].fatal) {
da927ba9 691 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
692
693 if (r == 0)
694 r = t;
695 } else
da927ba9 696 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
697
698 continue;
699 }
88213476 700
a8828ed9 701#ifdef HAVE_SELINUX
82adf6af
LP
702 if (arg_selinux_apifs_context &&
703 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
705 if (!options)
706 return log_oom();
707
708 o = options;
709 } else
a8828ed9 710#endif
d002827b 711 o = mount_table[k].options;
a8828ed9 712
a8828ed9 713
88213476
LP
714 if (mount(mount_table[k].what,
715 where,
716 mount_table[k].type,
717 mount_table[k].flags,
79d80fc1 718 o) < 0) {
88213476 719
79d80fc1 720 if (mount_table[k].fatal) {
56f64d95 721 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 722
79d80fc1
TG
723 if (r == 0)
724 r = -errno;
725 } else
56f64d95 726 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 727 }
88213476
LP
728 }
729
e58a1277
LP
730 return r;
731}
f8440af5 732
d6797c92 733static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
734 char **x, **y;
735
736 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 737 _cleanup_free_ char *where = NULL;
d2421337 738 struct stat source_st, dest_st;
2ed4e5e0 739 int r;
d2421337 740
4a62c710
MS
741 if (stat(*x, &source_st) < 0)
742 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 743
06c17c39
LP
744 where = strappend(dest, *y);
745 if (!where)
746 return log_oom();
747
2ed4e5e0
SL
748 r = stat(where, &dest_st);
749 if (r == 0) {
d2421337 750 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 751 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
752 return -EINVAL;
753 }
2ed4e5e0
SL
754 } else if (errno == ENOENT) {
755 r = mkdir_parents_label(where, 0755);
f647962d
MS
756 if (r < 0)
757 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 758 } else {
56f64d95 759 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
760 return -errno;
761 }
06c17c39 762
2ed4e5e0 763 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 764 * and char devices. */
79d80fc1
TG
765 if (S_ISDIR(source_st.st_mode)) {
766 r = mkdir_label(where, 0755);
f647962d
MS
767 if (r < 0 && errno != EEXIST)
768 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1
TG
769 } else if (S_ISFIFO(source_st.st_mode)) {
770 r = mkfifo(where, 0644);
4a62c710
MS
771 if (r < 0 && errno != EEXIST)
772 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
773 } else if (S_ISSOCK(source_st.st_mode)) {
774 r = mknod(where, 0644 | S_IFSOCK, 0);
4a62c710
MS
775 if (r < 0 && errno != EEXIST)
776 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
777 } else if (S_ISREG(source_st.st_mode)) {
778 r = touch(where);
f647962d
MS
779 if (r < 0)
780 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1 781 } else {
2ed4e5e0
SL
782 log_error("Refusing to create mountpoint for file: %s", *x);
783 return -ENOTSUP;
d2421337 784 }
17fe0523 785
4a62c710
MS
786 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
787 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 788
d6797c92
LP
789 if (ro) {
790 r = bind_remount_recursive(where, true);
f647962d
MS
791 if (r < 0)
792 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
793 }
794 }
795
796 return 0;
797}
798
06c17c39
LP
799static int mount_tmpfs(const char *dest) {
800 char **i, **o;
801
802 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
803 _cleanup_free_ char *where = NULL;
79d80fc1 804 int r;
06c17c39
LP
805
806 where = strappend(dest, *i);
807 if (!where)
808 return log_oom();
809
79d80fc1 810 r = mkdir_label(where, 0755);
04a91939
LP
811 if (r < 0 && r != -EEXIST)
812 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 813
4a62c710
MS
814 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
815 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
816 }
817
818 return 0;
819}
820
e58a1277 821static int setup_timezone(const char *dest) {
d4036145
LP
822 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
823 char *z, *y;
824 int r;
f8440af5 825
e58a1277
LP
826 assert(dest);
827
828 /* Fix the timezone, if possible */
d4036145
LP
829 r = readlink_malloc("/etc/localtime", &p);
830 if (r < 0) {
831 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
832 return 0;
833 }
834
835 z = path_startswith(p, "../usr/share/zoneinfo/");
836 if (!z)
837 z = path_startswith(p, "/usr/share/zoneinfo/");
838 if (!z) {
839 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
840 return 0;
841 }
842
04bc4a3f
LP
843 where = strappend(dest, "/etc/localtime");
844 if (!where)
0d0f0c50 845 return log_oom();
715ac17a 846
d4036145
LP
847 r = readlink_malloc(where, &q);
848 if (r >= 0) {
849 y = path_startswith(q, "../usr/share/zoneinfo/");
850 if (!y)
851 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 852
d4036145
LP
853 /* Already pointing to the right place? Then do nothing .. */
854 if (y && streq(y, z))
855 return 0;
856 }
857
858 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
859 if (!check)
0d0f0c50 860 return log_oom();
4d1c38b8 861
d4036145
LP
862 if (access(check, F_OK) < 0) {
863 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
864 return 0;
865 }
68fb0892 866
d4036145
LP
867 what = strappend("../usr/share/zoneinfo/", z);
868 if (!what)
869 return log_oom();
870
79d80fc1
TG
871 r = mkdir_parents(where, 0755);
872 if (r < 0) {
da927ba9 873 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
874
875 return 0;
876 }
877
878 r = unlink(where);
879 if (r < 0 && errno != ENOENT) {
56f64d95 880 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
881
882 return 0;
883 }
4d9f07b4 884
d4036145 885 if (symlink(what, where) < 0) {
56f64d95 886 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
887 return 0;
888 }
e58a1277
LP
889
890 return 0;
88213476
LP
891}
892
2547bb41 893static int setup_resolv_conf(const char *dest) {
c8b32e11 894 _cleanup_free_ char *where = NULL;
79d80fc1 895 int r;
2547bb41
LP
896
897 assert(dest);
898
899 if (arg_private_network)
900 return 0;
901
902 /* Fix resolv.conf, if possible */
04bc4a3f
LP
903 where = strappend(dest, "/etc/resolv.conf");
904 if (!where)
0d0f0c50 905 return log_oom();
2547bb41 906
77e63faf
LP
907 /* We don't really care for the results of this really. If it
908 * fails, it fails, but meh... */
79d80fc1
TG
909 r = mkdir_parents(where, 0755);
910 if (r < 0) {
da927ba9 911 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
912
913 return 0;
914 }
915
916 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
917 if (r < 0) {
da927ba9 918 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
919
920 return 0;
921 }
2547bb41
LP
922
923 return 0;
924}
925
4d9f07b4
LP
926static int setup_volatile_state(const char *directory) {
927 const char *p;
928 int r;
929
930 assert(directory);
931
932 if (arg_volatile != VOLATILE_STATE)
933 return 0;
934
935 /* --volatile=state means we simply overmount /var
936 with a tmpfs, and the rest read-only. */
937
938 r = bind_remount_recursive(directory, true);
f647962d
MS
939 if (r < 0)
940 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4
LP
941
942 p = strappenda(directory, "/var");
79d80fc1 943 r = mkdir(p, 0755);
4a62c710
MS
944 if (r < 0 && errno != EEXIST)
945 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 946
4a62c710
MS
947 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
948 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
949
950 return 0;
951}
952
953static int setup_volatile(const char *directory) {
954 bool tmpfs_mounted = false, bind_mounted = false;
955 char template[] = "/tmp/nspawn-volatile-XXXXXX";
956 const char *f, *t;
957 int r;
958
959 assert(directory);
960
961 if (arg_volatile != VOLATILE_YES)
962 return 0;
963
964 /* --volatile=yes means we mount a tmpfs to the root dir, and
965 the original /usr to use inside it, and that read-only. */
966
4a62c710
MS
967 if (!mkdtemp(template))
968 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
969
970 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 971 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
972 r = -errno;
973 goto fail;
974 }
975
976 tmpfs_mounted = true;
977
978 f = strappenda(directory, "/usr");
979 t = strappenda(template, "/usr");
980
79d80fc1
TG
981 r = mkdir(t, 0755);
982 if (r < 0 && errno != EEXIST) {
56f64d95 983 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
984 r = -errno;
985 goto fail;
986 }
987
4d9f07b4 988 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 989 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
990 r = -errno;
991 goto fail;
992 }
993
994 bind_mounted = true;
995
996 r = bind_remount_recursive(t, true);
997 if (r < 0) {
da927ba9 998 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
999 goto fail;
1000 }
1001
1002 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1003 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1004 r = -errno;
1005 goto fail;
1006 }
1007
1008 rmdir(template);
1009
1010 return 0;
1011
1012fail:
1013 if (bind_mounted)
1014 umount(t);
1015 if (tmpfs_mounted)
1016 umount(template);
1017 rmdir(template);
1018 return r;
1019}
1020
9f24adc2
LP
1021static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1022
1023 snprintf(s, 37,
1024 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1025 SD_ID128_FORMAT_VAL(id));
1026
1027 return s;
1028}
1029
04bc4a3f 1030static int setup_boot_id(const char *dest) {
7fd1b19b 1031 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1032 sd_id128_t rnd = {};
04bc4a3f
LP
1033 char as_uuid[37];
1034 int r;
1035
1036 assert(dest);
1037
eb91eb18
LP
1038 if (arg_share_system)
1039 return 0;
1040
04bc4a3f
LP
1041 /* Generate a new randomized boot ID, so that each boot-up of
1042 * the container gets a new one */
1043
1044 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1045 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1046 if (!from || !to)
1047 return log_oom();
04bc4a3f
LP
1048
1049 r = sd_id128_randomize(&rnd);
f647962d
MS
1050 if (r < 0)
1051 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1052
9f24adc2 1053 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1054
574d5f2d 1055 r = write_string_file(from, as_uuid);
f647962d
MS
1056 if (r < 0)
1057 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1058
1059 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1060 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1061 r = -errno;
10d18763 1062 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1063 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1064
1065 unlink(from);
04bc4a3f
LP
1066 return r;
1067}
1068
e58a1277 1069static int copy_devnodes(const char *dest) {
88213476
LP
1070
1071 static const char devnodes[] =
1072 "null\0"
1073 "zero\0"
1074 "full\0"
1075 "random\0"
1076 "urandom\0"
85614d66
TG
1077 "tty\0"
1078 "net/tun\0";
88213476
LP
1079
1080 const char *d;
e58a1277 1081 int r = 0;
7fd1b19b 1082 _cleanup_umask_ mode_t u;
a258bf26
LP
1083
1084 assert(dest);
124640f1
LP
1085
1086 u = umask(0000);
88213476
LP
1087
1088 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1089 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1090 struct stat st;
88213476 1091
7f112f50
LP
1092 from = strappend("/dev/", d);
1093 to = strjoin(dest, "/dev/", d, NULL);
1094 if (!from || !to)
1095 return log_oom();
88213476
LP
1096
1097 if (stat(from, &st) < 0) {
1098
4a62c710
MS
1099 if (errno != ENOENT)
1100 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1101
a258bf26 1102 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1103
ed8b7a3e 1104 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1105 return -EIO;
a258bf26 1106
85614d66
TG
1107 } else {
1108 r = mkdir_parents(to, 0775);
1109 if (r < 0) {
da927ba9 1110 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1111 return -r;
1112 }
a258bf26 1113
4a62c710
MS
1114 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1115 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
88213476 1116 }
88213476
LP
1117 }
1118
e58a1277
LP
1119 return r;
1120}
88213476 1121
f2d88580
LP
1122static int setup_ptmx(const char *dest) {
1123 _cleanup_free_ char *p = NULL;
1124
1125 p = strappend(dest, "/dev/ptmx");
1126 if (!p)
1127 return log_oom();
1128
4a62c710
MS
1129 if (symlink("pts/ptmx", p) < 0)
1130 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580
LP
1131
1132 return 0;
1133}
1134
e58a1277 1135static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1136 _cleanup_umask_ mode_t u;
1137 const char *to;
e58a1277 1138 struct stat st;
e58a1277 1139 int r;
e58a1277
LP
1140
1141 assert(dest);
1142 assert(console);
1143
1144 u = umask(0000);
1145
4a62c710
MS
1146 if (stat("/dev/null", &st) < 0)
1147 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1148
e58a1277 1149 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1150 if (r < 0)
1151 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1152
a258bf26
LP
1153 /* We need to bind mount the right tty to /dev/console since
1154 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1155 * to bind mount things on we create a device node first, and
1156 * use /dev/null for that since we the cgroups device policy
1157 * allows us to create that freely, while we cannot create
1158 * /dev/console. (Note that the major minor doesn't actually
1159 * matter here, since we mount it over anyway). */
a258bf26 1160
eb0f0863 1161 to = strappenda(dest, "/dev/console");
4a62c710
MS
1162 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1163 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1164
4a62c710
MS
1165 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1166 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1167
25ea79fe 1168 return 0;
e58a1277
LP
1169}
1170
1171static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1172 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1173 int r, fd, k;
7fd1b19b 1174 _cleanup_umask_ mode_t u;
e58a1277
LP
1175 union {
1176 struct cmsghdr cmsghdr;
1177 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1178 } control = {};
1179 struct msghdr mh = {
1180 .msg_control = &control,
1181 .msg_controllen = sizeof(control),
1182 };
e58a1277
LP
1183 struct cmsghdr *cmsg;
1184
1185 assert(dest);
1186 assert(kmsg_socket >= 0);
a258bf26 1187
e58a1277 1188 u = umask(0000);
a258bf26 1189
f1e5dfe2
LP
1190 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1191 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1192 * on the reading side behave very similar to /proc/kmsg,
1193 * their writing side behaves differently from /dev/kmsg in
1194 * that writing blocks when nothing is reading. In order to
1195 * avoid any problems with containers deadlocking due to this
1196 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1197 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1198 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1199 return log_oom();
e58a1277 1200
4a62c710
MS
1201 if (mkfifo(from, 0600) < 0)
1202 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1203
1204 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1205 if (r < 0)
1206 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1207
4a62c710
MS
1208 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1209 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1210
1211 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1212 if (fd < 0)
1213 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1214
e58a1277
LP
1215 cmsg = CMSG_FIRSTHDR(&mh);
1216 cmsg->cmsg_level = SOL_SOCKET;
1217 cmsg->cmsg_type = SCM_RIGHTS;
1218 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1219 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1220
1221 mh.msg_controllen = cmsg->cmsg_len;
1222
1223 /* Store away the fd in the socket, so that it stays open as
1224 * long as we run the child */
1225 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1226 safe_close(fd);
e58a1277 1227
4a62c710
MS
1228 if (k < 0)
1229 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1230
f1e5dfe2
LP
1231 /* And now make the FIFO unavailable as /dev/kmsg... */
1232 unlink(from);
25ea79fe 1233 return 0;
88213476
LP
1234}
1235
3a74cea5 1236static int setup_hostname(void) {
3a74cea5 1237
eb91eb18
LP
1238 if (arg_share_system)
1239 return 0;
1240
605f81a8 1241 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1242 return -errno;
3a74cea5 1243
7027ff61 1244 return 0;
3a74cea5
LP
1245}
1246
57fb9fb5 1247static int setup_journal(const char *directory) {
4d680aee 1248 sd_id128_t machine_id, this_id;
7fd1b19b 1249 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1250 char *id;
57fb9fb5
LP
1251 int r;
1252
57fb9fb5 1253 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1254 if (!p)
1255 return log_oom();
57fb9fb5
LP
1256
1257 r = read_one_line_file(p, &b);
27407a01
ZJS
1258 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1259 return 0;
f647962d
MS
1260 else if (r < 0)
1261 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1262
27407a01
ZJS
1263 id = strstrip(b);
1264 if (isempty(id) && arg_link_journal == LINK_AUTO)
1265 return 0;
57fb9fb5 1266
27407a01
ZJS
1267 /* Verify validity */
1268 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1269 if (r < 0)
1270 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1271
4d680aee 1272 r = sd_id128_get_machine(&this_id);
f647962d
MS
1273 if (r < 0)
1274 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1275
1276 if (sd_id128_equal(machine_id, this_id)) {
1277 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1278 "Host and machine ids are equal (%s): refusing to link journals", id);
1279 if (arg_link_journal == LINK_AUTO)
1280 return 0;
1281 return
1282 -EEXIST;
1283 }
1284
1285 if (arg_link_journal == LINK_NO)
1286 return 0;
1287
57fb9fb5 1288 free(p);
27407a01
ZJS
1289 p = strappend("/var/log/journal/", id);
1290 q = strjoin(directory, "/var/log/journal/", id, NULL);
1291 if (!p || !q)
1292 return log_oom();
1293
1294 if (path_is_mount_point(p, false) > 0) {
1295 if (arg_link_journal != LINK_AUTO) {
1296 log_error("%s: already a mount point, refusing to use for journal", p);
1297 return -EEXIST;
1298 }
1299
1300 return 0;
57fb9fb5
LP
1301 }
1302
27407a01 1303 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1304 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1305 log_error("%s: already a mount point, refusing to use for journal", q);
1306 return -EEXIST;
57fb9fb5
LP
1307 }
1308
27407a01 1309 return 0;
57fb9fb5
LP
1310 }
1311
1312 r = readlink_and_make_absolute(p, &d);
1313 if (r >= 0) {
1314 if ((arg_link_journal == LINK_GUEST ||
1315 arg_link_journal == LINK_AUTO) &&
1316 path_equal(d, q)) {
1317
27407a01
ZJS
1318 r = mkdir_p(q, 0755);
1319 if (r < 0)
56f64d95 1320 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1321 return 0;
57fb9fb5
LP
1322 }
1323
4a62c710
MS
1324 if (unlink(p) < 0)
1325 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1326 } else if (r == -EINVAL) {
1327
1328 if (arg_link_journal == LINK_GUEST &&
1329 rmdir(p) < 0) {
1330
27407a01
ZJS
1331 if (errno == ENOTDIR) {
1332 log_error("%s already exists and is neither a symlink nor a directory", p);
1333 return r;
1334 } else {
56f64d95 1335 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1336 return -errno;
57fb9fb5 1337 }
57fb9fb5
LP
1338 }
1339 } else if (r != -ENOENT) {
56f64d95 1340 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1341 return r;
57fb9fb5
LP
1342 }
1343
1344 if (arg_link_journal == LINK_GUEST) {
1345
1346 if (symlink(q, p) < 0) {
574edc90 1347 if (arg_link_journal_try) {
56f64d95 1348 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1349 return 0;
1350 } else {
56f64d95 1351 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1352 return -errno;
1353 }
57fb9fb5
LP
1354 }
1355
27407a01
ZJS
1356 r = mkdir_p(q, 0755);
1357 if (r < 0)
56f64d95 1358 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1359 return 0;
57fb9fb5
LP
1360 }
1361
1362 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1363 /* don't create parents here -- if the host doesn't have
1364 * permanent journal set up, don't force it here */
1365 r = mkdir(p, 0755);
57fb9fb5 1366 if (r < 0) {
574edc90 1367 if (arg_link_journal_try) {
56f64d95 1368 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1369 return 0;
1370 } else {
56f64d95 1371 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1372 return r;
1373 }
57fb9fb5
LP
1374 }
1375
27407a01
ZJS
1376 } else if (access(p, F_OK) < 0)
1377 return 0;
57fb9fb5 1378
cdb2b9d0
LP
1379 if (dir_is_empty(q) == 0)
1380 log_warning("%s is not empty, proceeding anyway.", q);
1381
57fb9fb5
LP
1382 r = mkdir_p(q, 0755);
1383 if (r < 0) {
56f64d95 1384 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1385 return r;
57fb9fb5
LP
1386 }
1387
4a62c710
MS
1388 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1389 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1390
27407a01 1391 return 0;
57fb9fb5
LP
1392}
1393
88213476 1394static int drop_capabilities(void) {
5076f0cc 1395 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1396}
1397
5aa4bb6b 1398static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1399 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1400 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1401 int r;
1402
eb91eb18
LP
1403 if (!arg_register)
1404 return 0;
1405
1c03020c 1406 r = sd_bus_default_system(&bus);
f647962d
MS
1407 if (r < 0)
1408 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1409
89f7c846
LP
1410 if (arg_keep_unit) {
1411 r = sd_bus_call_method(
1412 bus,
1413 "org.freedesktop.machine1",
1414 "/org/freedesktop/machine1",
1415 "org.freedesktop.machine1.Manager",
5aa4bb6b 1416 "RegisterMachineWithNetwork",
89f7c846
LP
1417 &error,
1418 NULL,
5aa4bb6b 1419 "sayssusai",
89f7c846
LP
1420 arg_machine,
1421 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1422 "nspawn",
1423 "container",
1424 (uint32_t) pid,
5aa4bb6b
LP
1425 strempty(arg_directory),
1426 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1427 } else {
9457ac5b
LP
1428 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1429
1430 r = sd_bus_message_new_method_call(
89f7c846 1431 bus,
9457ac5b 1432 &m,
89f7c846
LP
1433 "org.freedesktop.machine1",
1434 "/org/freedesktop/machine1",
1435 "org.freedesktop.machine1.Manager",
5aa4bb6b 1436 "CreateMachineWithNetwork");
f647962d
MS
1437 if (r < 0)
1438 return log_error_errno(r, "Failed to create message: %m");
9457ac5b
LP
1439
1440 r = sd_bus_message_append(
1441 m,
5aa4bb6b 1442 "sayssusai",
89f7c846
LP
1443 arg_machine,
1444 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1445 "nspawn",
1446 "container",
1447 (uint32_t) pid,
5aa4bb6b
LP
1448 strempty(arg_directory),
1449 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d
MS
1450 if (r < 0)
1451 return log_error_errno(r, "Failed to append message arguments: %m");
9457ac5b
LP
1452
1453 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d
MS
1454 if (r < 0)
1455 return log_error_errno(r, "Failed to open container: %m");
9457ac5b
LP
1456
1457 if (!isempty(arg_slice)) {
1458 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d
MS
1459 if (r < 0)
1460 return log_error_errno(r, "Failed to append slice: %m");
9457ac5b
LP
1461 }
1462
1463 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d
MS
1464 if (r < 0)
1465 return log_error_errno(r, "Failed to add device policy: %m");
9457ac5b 1466
63cc4c31 1467 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1468 /* Allow the container to
1469 * access and create the API
1470 * device nodes, so that
1471 * PrivateDevices= in the
1472 * container can work
1473 * fine */
1474 "/dev/null", "rwm",
1475 "/dev/zero", "rwm",
1476 "/dev/full", "rwm",
1477 "/dev/random", "rwm",
1478 "/dev/urandom", "rwm",
1479 "/dev/tty", "rwm",
864e1706 1480 "/dev/net/tun", "rwm",
9457ac5b
LP
1481 /* Allow the container
1482 * access to ptys. However,
1483 * do not permit the
1484 * container to ever create
1485 * these device nodes. */
1486 "/dev/pts/ptmx", "rw",
63cc4c31 1487 "char-pts", "rw");
f647962d
MS
1488 if (r < 0)
1489 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b
LP
1490
1491 r = sd_bus_message_close_container(m);
f647962d
MS
1492 if (r < 0)
1493 return log_error_errno(r, "Failed to close container: %m");
9457ac5b
LP
1494
1495 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1496 }
1497
9444b1f2 1498 if (r < 0) {
1f0cd86b
LP
1499 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1500 return r;
1501 }
1502
1503 return 0;
1504}
1505
1506static int terminate_machine(pid_t pid) {
1507 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1508 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1509 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1510 const char *path;
1511 int r;
1512
eb91eb18
LP
1513 if (!arg_register)
1514 return 0;
1515
76b54375 1516 r = sd_bus_default_system(&bus);
f647962d
MS
1517 if (r < 0)
1518 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1519
1520 r = sd_bus_call_method(
1521 bus,
1522 "org.freedesktop.machine1",
1523 "/org/freedesktop/machine1",
1524 "org.freedesktop.machine1.Manager",
1525 "GetMachineByPID",
1526 &error,
1527 &reply,
1528 "u",
1529 (uint32_t) pid);
1530 if (r < 0) {
1531 /* Note that the machine might already have been
1532 * cleaned up automatically, hence don't consider it a
1533 * failure if we cannot get the machine object. */
1534 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1535 return 0;
1536 }
1537
1538 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1539 if (r < 0)
1540 return bus_log_parse_error(r);
9444b1f2 1541
1f0cd86b
LP
1542 r = sd_bus_call_method(
1543 bus,
1544 "org.freedesktop.machine1",
1545 path,
1546 "org.freedesktop.machine1.Machine",
1547 "Terminate",
1548 &error,
1549 NULL,
1550 NULL);
1551 if (r < 0) {
1552 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1553 return 0;
1554 }
1555
9444b1f2
LP
1556 return 0;
1557}
1558
db999e0f
LP
1559static int reset_audit_loginuid(void) {
1560 _cleanup_free_ char *p = NULL;
1561 int r;
1562
1563 if (arg_share_system)
1564 return 0;
1565
1566 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1567 if (r == -ENOENT)
db999e0f 1568 return 0;
f647962d
MS
1569 if (r < 0)
1570 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1571
1572 /* Already reset? */
1573 if (streq(p, "4294967295"))
1574 return 0;
1575
1576 r = write_string_file("/proc/self/loginuid", "4294967295");
1577 if (r < 0) {
1578 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1579 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1580 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1581 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1582 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1583
db999e0f 1584 sleep(5);
77b6e194 1585 }
db999e0f
LP
1586
1587 return 0;
77b6e194
LP
1588}
1589
4f758c23
LP
1590#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1591#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 1592#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 1593
a90e2305 1594static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
1595 uint8_t result[8];
1596 size_t l, sz;
a90e2305
LP
1597 uint8_t *v, *i;
1598 int r;
01dde061
TG
1599
1600 l = strlen(arg_machine);
1601 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
1602 if (idx > 0)
1603 sz += sizeof(idx);
a90e2305 1604
01dde061
TG
1605 v = alloca(sz);
1606
1607 /* fetch some persistent data unique to the host */
1608 r = sd_id128_get_machine((sd_id128_t*) v);
1609 if (r < 0)
1610 return r;
1611
1612 /* combine with some data unique (on this host) to this
1613 * container instance */
a90e2305
LP
1614 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1615 if (idx > 0) {
1616 idx = htole64(idx);
1617 memcpy(i, &idx, sizeof(idx));
1618 }
01dde061
TG
1619
1620 /* Let's hash the host machine ID plus the container name. We
1621 * use a fixed, but originally randomly created hash key here. */
4f758c23 1622 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
1623
1624 assert_cc(ETH_ALEN <= sizeof(result));
1625 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1626
1627 /* see eth_random_addr in the kernel */
1628 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1629 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1630
1631 return 0;
1632}
1633
5aa4bb6b 1634static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1635 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1636 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 1637 struct ether_addr mac_host, mac_container;
5aa4bb6b 1638 int r, i;
69c79d3c
LP
1639
1640 if (!arg_private_network)
1641 return 0;
1642
1643 if (!arg_network_veth)
1644 return 0;
1645
08af0da2
LP
1646 /* Use two different interface name prefixes depending whether
1647 * we are in bridge mode or not. */
c00524c9 1648 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 1649 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1650
e867ceb6
LP
1651 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1652 if (r < 0)
1653 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 1654
e867ceb6
LP
1655 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1656 if (r < 0)
1657 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 1658
151b9b96 1659 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1660 if (r < 0)
1661 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 1662
151b9b96 1663 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1664 if (r < 0)
1665 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 1666
ab046dde 1667 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
1668 if (r < 0)
1669 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 1670
4f758c23 1671 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
1672 if (r < 0)
1673 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 1674
ee3a6a51 1675 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1676 if (r < 0)
1677 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1678
d8e538ec 1679 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
1680 if (r < 0)
1681 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1682
ee3a6a51 1683 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
1684 if (r < 0)
1685 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1686
ab046dde 1687 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
1688 if (r < 0)
1689 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 1690
4f758c23 1691 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
1692 if (r < 0)
1693 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 1694
ab046dde 1695 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1696 if (r < 0)
1697 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
1698
1699 r = sd_rtnl_message_close_container(m);
f647962d
MS
1700 if (r < 0)
1701 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1702
1703 r = sd_rtnl_message_close_container(m);
f647962d
MS
1704 if (r < 0)
1705 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1706
1707 r = sd_rtnl_message_close_container(m);
f647962d
MS
1708 if (r < 0)
1709 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1710
1711 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1712 if (r < 0)
1713 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 1714
5aa4bb6b 1715 i = (int) if_nametoindex(iface_name);
4a62c710
MS
1716 if (i <= 0)
1717 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
1718
1719 *ifi = i;
1720
69c79d3c
LP
1721 return 0;
1722}
1723
5aa4bb6b 1724static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1725 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1726 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1727 int r, bridge;
1728
1729 if (!arg_private_network)
1730 return 0;
1731
1732 if (!arg_network_veth)
1733 return 0;
1734
1735 if (!arg_network_bridge)
1736 return 0;
1737
1738 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
1739 if (bridge <= 0)
1740 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 1741
5aa4bb6b
LP
1742 *ifi = bridge;
1743
151b9b96 1744 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1745 if (r < 0)
1746 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 1747
151b9b96 1748 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
1749 if (r < 0)
1750 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 1751
039dd4af 1752 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
1753 if (r < 0)
1754 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 1755
ab046dde 1756 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
1757 if (r < 0)
1758 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
1759
1760 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
1763
1764 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1765 if (r < 0)
1766 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
1767
1768 return 0;
1769}
1770
c74e630d
LP
1771static int parse_interface(struct udev *udev, const char *name) {
1772 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1773 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1774 int ifi;
1775
1776 ifi = (int) if_nametoindex(name);
4a62c710
MS
1777 if (ifi <= 0)
1778 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
1779
1780 sprintf(ifi_str, "n%i", ifi);
1781 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
1782 if (!d)
1783 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
1784
1785 if (udev_device_get_is_initialized(d) <= 0) {
1786 log_error("Network interface %s is not initialized yet.", name);
1787 return -EBUSY;
1788 }
1789
1790 return ifi;
1791}
1792
69c79d3c 1793static int move_network_interfaces(pid_t pid) {
7e227024 1794 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1795 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1796 char **i;
1797 int r;
1798
1799 if (!arg_private_network)
1800 return 0;
1801
1802 if (strv_isempty(arg_network_interfaces))
1803 return 0;
1804
151b9b96 1805 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1806 if (r < 0)
1807 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 1808
7e227024
LP
1809 udev = udev_new();
1810 if (!udev) {
1811 log_error("Failed to connect to udev.");
1812 return -ENOMEM;
1813 }
1814
aa28aefe 1815 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1816 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1817 int ifi;
aa28aefe 1818
c74e630d
LP
1819 ifi = parse_interface(udev, *i);
1820 if (ifi < 0)
1821 return ifi;
1822
3125b3ef 1823 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
1824 if (r < 0)
1825 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1826
c74e630d 1827 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1828 if (r < 0)
1829 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 1830
c74e630d 1831 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 1834 }
7e227024 1835
c74e630d
LP
1836 return 0;
1837}
1838
1839static int setup_macvlan(pid_t pid) {
1840 _cleanup_udev_unref_ struct udev *udev = NULL;
1841 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 1842 unsigned idx = 0;
c74e630d
LP
1843 char **i;
1844 int r;
1845
1846 if (!arg_private_network)
1847 return 0;
1848
1849 if (strv_isempty(arg_network_macvlan))
1850 return 0;
1851
1852 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1853 if (r < 0)
1854 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
1855
1856 udev = udev_new();
1857 if (!udev) {
1858 log_error("Failed to connect to udev.");
1859 return -ENOMEM;
1860 }
1861
1862 STRV_FOREACH(i, arg_network_macvlan) {
1863 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1864 _cleanup_free_ char *n = NULL;
e867ceb6 1865 struct ether_addr mac;
c74e630d
LP
1866 int ifi;
1867
1868 ifi = parse_interface(udev, *i);
1869 if (ifi < 0)
1870 return ifi;
1871
e867ceb6
LP
1872 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1873 if (r < 0)
1874 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1875
c74e630d 1876 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1877 if (r < 0)
1878 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1879
c74e630d 1880 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
1881 if (r < 0)
1882 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
1883
1884 n = strappend("mv-", *i);
1885 if (!n)
1886 return log_oom();
1887
1888 strshorten(n, IFNAMSIZ-1);
1889
1890 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
1891 if (r < 0)
1892 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 1893
e867ceb6
LP
1894 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1895 if (r < 0)
1896 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1897
aa28aefe 1898 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
1901
1902 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1903 if (r < 0)
1904 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 1905
d8e538ec 1906 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
1907 if (r < 0)
1908 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
1909
1910 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
1911 if (r < 0)
1912 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
1913
1914 r = sd_rtnl_message_close_container(m);
f647962d
MS
1915 if (r < 0)
1916 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
1917
1918 r = sd_rtnl_message_close_container(m);
f647962d
MS
1919 if (r < 0)
1920 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
1921
1922 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1923 if (r < 0)
1924 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
1925 }
1926
1927 return 0;
1928}
1929
/* Installs a seccomp filter that allows everything by default, but makes a
 * fixed list of system calls fail with EPERM and the creation of
 * NETLINK_AUDIT sockets fail with EAFNOSUPPORT. Without HAVE_SECCOMP this is
 * a no-op that returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers: much of the userspace audit
           hookup fails when run inside of a container. We don't care
           and simply turn off the creation of audit sockets.

           socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
           EAFNOSUPPORT, which audit userspace takes as indication that
           audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2009
1b9e5b12
LP
2010static int setup_image(char **device_path, int *loop_nr) {
2011 struct loop_info64 info = {
2012 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2013 };
2014 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2015 _cleanup_free_ char* loopdev = NULL;
2016 struct stat st;
2017 int r, nr;
2018
2019 assert(device_path);
2020 assert(loop_nr);
2021
2022 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2023 if (fd < 0)
2024 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2025
4a62c710
MS
2026 if (fstat(fd, &st) < 0)
2027 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2028
2029 if (S_ISBLK(st.st_mode)) {
2030 char *p;
2031
2032 p = strdup(arg_image);
2033 if (!p)
2034 return log_oom();
2035
2036 *device_path = p;
2037
2038 *loop_nr = -1;
2039
2040 r = fd;
2041 fd = -1;
2042
2043 return r;
2044 }
2045
2046 if (!S_ISREG(st.st_mode)) {
56f64d95 2047 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2048 return -EINVAL;
2049 }
2050
2051 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2052 if (control < 0)
2053 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2054
2055 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2056 if (nr < 0)
2057 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2058
2059 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2060 return log_oom();
2061
2062 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2063 if (loop < 0)
2064 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2065
4a62c710
MS
2066 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2067 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2068
2069 if (arg_read_only)
2070 info.lo_flags |= LO_FLAGS_READ_ONLY;
2071
4a62c710
MS
2072 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2073 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2074
2075 *device_path = loopdev;
2076 loopdev = NULL;
2077
2078 *loop_nr = nr;
2079
2080 r = loop;
2081 loop = -1;
2082
2083 return r;
2084}
2085
#ifdef HAVE_BLKID
/* Remembers node as the candidate for one partition role, unless a candidate with a lower or equal partition number is already held. */
static int dissect_pick_partition(
                char **candidate, int *candidate_nr, bool *candidate_rw,
                const char *node, int nr, unsigned long long flags) {

        char *copy;

        if (*candidate && nr >= *candidate_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*candidate);
        *candidate = copy;
        *candidate_nr = nr;
        *candidate_rw = !(flags & GPT_FLAG_READ_ONLY);

        return 0;
}
#endif

/* Looks for root, /home and /srv partitions in the GPT partition table of the
 * block device referred to by fd.
 *
 * The partition table is read with libblkid, the partition device nodes are
 * enumerated through udev. Partitions flagged GPT_FLAG_NO_AUTO are ignored,
 * and for each role the matching partition with the lowest partition number
 * wins. If no native root partition is found but a secondary one is, the
 * latter is used and *secondary is set to true.
 *
 * On success returns 0 and stores newly allocated device node paths (owned by
 * the caller) in *root_device and, if such partitions exist, *home_device and
 * *srv_device; the matching *_rw flag is false if the partition carries
 * GPT_FLAG_READ_ONLY. Output parameters of partitions that were not found are
 * left untouched. Returns a negative errno-style value on failure, -EINVAL in
 * particular if no GPT or no root partition is found, and -ENOTSUP when
 * compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(root_device_rw);
        assert(home_device);
        assert(home_device_rw);
        assert(srv_device);
        assert(srv_device_rw);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* A failure that left errno unset is reported as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails, which streq_ptr() then treats as a mismatch */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        /* Restrict the enumeration to the whole-disk device and its children */
        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees an indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, flags);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
#endif
                else
                        r = 0;

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* From here on ownership of the strings moves to the caller; the locals are
         * reset to NULL so that the cleanup handlers do not free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2338
/* Mounts the block device "what" below "where".
 *
 * The mount point is "where" with "directory" appended, or just "where" if
 * "directory" is NULL. The file system type is determined by probing the
 * device's superblock with libblkid. The mount always carries MS_NODEV, and
 * MS_RDONLY is added if "rw" is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure: -EINVAL if
 * the file system type cannot be determined, -ENOTSUP for LUKS volumes or
 * when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* 1 means nothing was detected and -2 an ambivalent result; both mean the
         * type is unknown. -1 is a genuine probing error and is reported with
         * errno below, consistent with the check in dissect_image(). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2402
727fd4fd
LP
2403static int mount_devices(
2404 const char *where,
2405 const char *root_device, bool root_device_rw,
2406 const char *home_device, bool home_device_rw,
2407 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2408 int r;
2409
2410 assert(where);
2411
2412 if (root_device) {
727fd4fd 2413 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2414 if (r < 0)
2415 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2416 }
2417
2418 if (home_device) {
727fd4fd 2419 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2420 if (r < 0)
2421 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2422 }
2423
2424 if (srv_device) {
727fd4fd 2425 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2426 if (r < 0)
2427 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2428 }
2429
2430 return 0;
2431}
2432
2433static void loop_remove(int nr, int *image_fd) {
2434 _cleanup_close_ int control = -1;
e8c8ddcc 2435 int r;
1b9e5b12
LP
2436
2437 if (nr < 0)
2438 return;
2439
2440 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2441 r = ioctl(*image_fd, LOOP_CLR_FD);
2442 if (r < 0)
56f64d95 2443 log_warning_errno(errno, "Failed to close loop image: %m");
03e334a1 2444 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2445 }
2446
2447 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2448 if (control < 0) {
56f64d95 2449 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2450 return;
e8c8ddcc 2451 }
1b9e5b12 2452
e8c8ddcc
TG
2453 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2454 if (r < 0)
56f64d95 2455 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2456}
2457
0cb9fbcd
LP
2458static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2459 int pipe_fds[2];
2460 pid_t pid;
2461
2462 assert(database);
2463 assert(key);
2464 assert(rpid);
2465
4a62c710
MS
2466 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2467 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
2468
2469 pid = fork();
4a62c710
MS
2470 if (pid < 0)
2471 return log_error_errno(errno, "Failed to fork getent child: %m");
2472 else if (pid == 0) {
0cb9fbcd
LP
2473 int nullfd;
2474 char *empty_env = NULL;
2475
2476 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2477 _exit(EXIT_FAILURE);
2478
2479 if (pipe_fds[0] > 2)
03e334a1 2480 safe_close(pipe_fds[0]);
0cb9fbcd 2481 if (pipe_fds[1] > 2)
03e334a1 2482 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2483
2484 nullfd = open("/dev/null", O_RDWR);
2485 if (nullfd < 0)
2486 _exit(EXIT_FAILURE);
2487
2488 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2489 _exit(EXIT_FAILURE);
2490
2491 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2492 _exit(EXIT_FAILURE);
2493
2494 if (nullfd > 2)
03e334a1 2495 safe_close(nullfd);
0cb9fbcd
LP
2496
2497 reset_all_signal_handlers();
2498 close_all_fds(NULL, 0);
2499
4de82926
MM
2500 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2501 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2502 _exit(EXIT_FAILURE);
2503 }
2504
03e334a1 2505 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2506
2507 *rpid = pid;
2508
2509 return pipe_fds[0];
2510}
2511
2512static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2513 char line[LINE_MAX], *x, *u, *g, *h;
2514 const char *word, *state;
0cb9fbcd
LP
2515 _cleanup_free_ uid_t *uids = NULL;
2516 _cleanup_free_ char *home = NULL;
2517 _cleanup_fclose_ FILE *f = NULL;
2518 _cleanup_close_ int fd = -1;
2519 unsigned n_uids = 0;
70f539ca 2520 size_t sz = 0, l;
0cb9fbcd
LP
2521 uid_t uid;
2522 gid_t gid;
2523 pid_t pid;
2524 int r;
2525
2526 assert(_home);
2527
2528 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2529 /* Reset everything fully to 0, just in case */
2530
4a62c710
MS
2531 if (setgroups(0, NULL) < 0)
2532 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 2533
4a62c710
MS
2534 if (setresgid(0, 0, 0) < 0)
2535 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 2536
4a62c710
MS
2537 if (setresuid(0, 0, 0) < 0)
2538 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2539
2540 *_home = NULL;
2541 return 0;
2542 }
2543
2544 /* First, get user credentials */
2545 fd = spawn_getent("passwd", arg_user, &pid);
2546 if (fd < 0)
2547 return fd;
2548
2549 f = fdopen(fd, "r");
2550 if (!f)
2551 return log_oom();
2552 fd = -1;
2553
2554 if (!fgets(line, sizeof(line), f)) {
2555
2556 if (!ferror(f)) {
2557 log_error("Failed to resolve user %s.", arg_user);
2558 return -ESRCH;
2559 }
2560
56f64d95 2561 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2562 return -errno;
2563 }
2564
2565 truncate_nl(line);
2566
820d3acf 2567 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
2568
2569 x = strchr(line, ':');
2570 if (!x) {
2571 log_error("/etc/passwd entry has invalid user field.");
2572 return -EIO;
2573 }
2574
2575 u = strchr(x+1, ':');
2576 if (!u) {
2577 log_error("/etc/passwd entry has invalid password field.");
2578 return -EIO;
2579 }
2580
2581 u++;
2582 g = strchr(u, ':');
2583 if (!g) {
2584 log_error("/etc/passwd entry has invalid UID field.");
2585 return -EIO;
2586 }
2587
2588 *g = 0;
2589 g++;
2590 x = strchr(g, ':');
2591 if (!x) {
2592 log_error("/etc/passwd entry has invalid GID field.");
2593 return -EIO;
2594 }
2595
2596 *x = 0;
2597 h = strchr(x+1, ':');
2598 if (!h) {
2599 log_error("/etc/passwd entry has invalid GECOS field.");
2600 return -EIO;
2601 }
2602
2603 h++;
2604 x = strchr(h, ':');
2605 if (!x) {
2606 log_error("/etc/passwd entry has invalid home directory field.");
2607 return -EIO;
2608 }
2609
2610 *x = 0;
2611
2612 r = parse_uid(u, &uid);
2613 if (r < 0) {
2614 log_error("Failed to parse UID of user.");
2615 return -EIO;
2616 }
2617
2618 r = parse_gid(g, &gid);
2619 if (r < 0) {
2620 log_error("Failed to parse GID of user.");
2621 return -EIO;
2622 }
2623
2624 home = strdup(h);
2625 if (!home)
2626 return log_oom();
2627
2628 /* Second, get group memberships */
2629 fd = spawn_getent("initgroups", arg_user, &pid);
2630 if (fd < 0)
2631 return fd;
2632
2633 fclose(f);
2634 f = fdopen(fd, "r");
2635 if (!f)
2636 return log_oom();
2637 fd = -1;
2638
2639 if (!fgets(line, sizeof(line), f)) {
2640 if (!ferror(f)) {
2641 log_error("Failed to resolve user %s.", arg_user);
2642 return -ESRCH;
2643 }
2644
56f64d95 2645 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2646 return -errno;
2647 }
2648
2649 truncate_nl(line);
2650
820d3acf 2651 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
2652
2653 /* Skip over the username and subsequent separator whitespace */
2654 x = line;
2655 x += strcspn(x, WHITESPACE);
2656 x += strspn(x, WHITESPACE);
2657
a2a5291b 2658 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2659 char c[l+1];
2660
a2a5291b 2661 memcpy(c, word, l);
0cb9fbcd
LP
2662 c[l] = 0;
2663
2664 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2665 return log_oom();
2666
2667 r = parse_uid(c, &uids[n_uids++]);
2668 if (r < 0) {
2669 log_error("Failed to parse group data from getent.");
2670 return -EIO;
2671 }
2672 }
2673
2674 r = mkdir_parents(home, 0775);
f647962d
MS
2675 if (r < 0)
2676 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
2677
2678 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
2679 if (r < 0 && r != -EEXIST)
2680 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
2681
2682 fchown(STDIN_FILENO, uid, gid);
2683 fchown(STDOUT_FILENO, uid, gid);
2684 fchown(STDERR_FILENO, uid, gid);
2685
4a62c710
MS
2686 if (setgroups(n_uids, uids) < 0)
2687 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 2688
4a62c710
MS
2689 if (setresgid(gid, gid, gid) < 0)
2690 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 2691
4a62c710
MS
2692 if (setresuid(uid, uid, uid) < 0)
2693 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2694
2695 if (_home) {
2696 *_home = home;
2697 home = NULL;
2698 }
2699
2700 return 0;
2701}
2702
113cea80 2703/*
6d416b9c
LS
2704 * Return values:
2705 * < 0 : wait_for_terminate() failed to get the state of the
2706 * container, the container was terminated by a signal, or
2707 * failed for an unknown reason. No change is made to the
2708 * container argument.
2709 * > 0 : The program executed in the container terminated with an
2710 * error. The exit code of the program executed in the
919699ec
LP
2711 * container is returned. The container argument has been set
2712 * to CONTAINER_TERMINATED.
6d416b9c
LS
2713 * 0 : The container is being rebooted, has been shut down or exited
2714 * successfully. The container argument has been set to either
2715 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2716 *
6d416b9c
LS
2717 * That is, success is indicated by a return value of zero, and an
2718 * error is indicated by a non-zero value.
113cea80
DH
2719 */
2720static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2721 siginfo_t status;
919699ec 2722 int r;
113cea80
DH
2723
2724 r = wait_for_terminate(pid, &status);
f647962d
MS
2725 if (r < 0)
2726 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2727
2728 switch (status.si_code) {
fddbb89c 2729
113cea80 2730 case CLD_EXITED:
919699ec
LP
2731 if (status.si_status == 0) {
2732 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2733
fddbb89c 2734 } else
919699ec 2735 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2736
919699ec
LP
2737 *container = CONTAINER_TERMINATED;
2738 return status.si_status;
113cea80
DH
2739
2740 case CLD_KILLED:
2741 if (status.si_status == SIGINT) {
113cea80 2742
919699ec 2743 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2744 *container = CONTAINER_TERMINATED;
919699ec
LP
2745 return 0;
2746
113cea80 2747 } else if (status.si_status == SIGHUP) {
113cea80 2748
919699ec 2749 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2750 *container = CONTAINER_REBOOTED;
919699ec 2751 return 0;
113cea80 2752 }
919699ec 2753
113cea80
DH
2754 /* CLD_KILLED fallthrough */
2755
2756 case CLD_DUMPED:
fddbb89c 2757 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2758 return -EIO;
113cea80
DH
2759
2760 default:
fddbb89c 2761 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2762 return -EIO;
113cea80
DH
2763 }
2764
2765 return r;
2766}
2767
e866af3a
DH
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls. */
static void nop_handler(int sig) {}
2769
023fb90b
LP
2770static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2771 pid_t pid;
2772
2773 pid = PTR_TO_UINT32(userdata);
2774 if (pid > 0) {
2775 if (kill(pid, SIGRTMIN+3) >= 0) {
2776 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2777 sd_event_source_set_userdata(s, NULL);
2778 return 0;
2779 }
2780 }
2781
2782 sd_event_exit(sd_event_source_get_event(s), 0);
2783 return 0;
2784}
2785
88213476 2786int main(int argc, char *argv[]) {
69c79d3c 2787
63cc4c31 2788 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2789 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 2790 _cleanup_close_ int master = -1, image_fd = -1;
3d94f76c 2791 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2792 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2793 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2794 const char *console = NULL;
1b9e5b12
LP
2795 char veth_name[IFNAMSIZ];
2796 bool secondary = false;
e866af3a 2797 sigset_t mask, mask_chld;
69c79d3c 2798 pid_t pid = 0;
88213476
LP
2799
2800 log_parse_environment();
2801 log_open();
2802
05947bef
LP
2803 k = parse_argv(argc, argv);
2804 if (k < 0)
88213476 2805 goto finish;
05947bef
LP
2806 else if (k == 0) {
2807 r = EXIT_SUCCESS;
2808 goto finish;
2809 }
88213476 2810
1b9e5b12
LP
2811 if (!arg_image) {
2812 if (arg_directory) {
2813 char *p;
88213476 2814
1b9e5b12
LP
2815 p = path_make_absolute_cwd(arg_directory);
2816 free(arg_directory);
2817 arg_directory = p;
2818 } else
2819 arg_directory = get_current_dir_name();
88213476 2820
1b9e5b12
LP
2821 if (!arg_directory) {
2822 log_error("Failed to determine path, please use -D.");
2823 goto finish;
2824 }
2825 path_kill_slashes(arg_directory);
88213476
LP
2826 }
2827
7027ff61 2828 if (!arg_machine) {
1b9e5b12 2829 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2830 if (!arg_machine) {
2831 log_oom();
2832 goto finish;
2833 }
2834
e724b063 2835 hostname_cleanup(arg_machine, false);
7027ff61
LP
2836 if (isempty(arg_machine)) {
2837 log_error("Failed to determine machine name automatically, please use -M.");
2838 goto finish;
2839 }
2840 }
2841
88213476
LP
2842 if (geteuid() != 0) {
2843 log_error("Need to be root.");
2844 goto finish;
2845 }
2846
04d391da
LP
2847 if (sd_booted() <= 0) {
2848 log_error("Not running on a systemd system.");
2849 goto finish;
2850 }
2851
1b9e5b12
LP
2852 log_close();
2853 n_fd_passed = sd_listen_fds(false);
2854 if (n_fd_passed > 0) {
2855 k = fdset_new_listen_fds(&fds, false);
2856 if (k < 0) {
da927ba9 2857 log_error_errno(k, "Failed to collect file descriptors: %m");
1b9e5b12
LP
2858 goto finish;
2859 }
88213476 2860 }
1b9e5b12
LP
2861 fdset_close_others(fds);
2862 log_open();
88213476 2863
1b9e5b12
LP
2864 if (arg_directory) {
2865 if (path_equal(arg_directory, "/")) {
2866 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2867 goto finish;
2868 }
1b9e5b12
LP
2869
2870 if (arg_boot) {
2871 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 2872 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
2873 goto finish;
2874 }
2875 } else {
2876 const char *p;
2877
2878 p = strappenda(arg_directory,
2879 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2880 if (access(p, F_OK) < 0) {
2881 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2882 goto finish;
2883
2884 }
2885 }
6b9132a9 2886 } else {
1b9e5b12 2887 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2888
1b9e5b12 2889 if (!mkdtemp(template)) {
56f64d95 2890 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 2891 r = -errno;
6b9132a9 2892 goto finish;
1b9e5b12 2893 }
6b9132a9 2894
1b9e5b12
LP
2895 arg_directory = strdup(template);
2896 if (!arg_directory) {
2897 r = log_oom();
2898 goto finish;
6b9132a9 2899 }
88213476 2900
1b9e5b12
LP
2901 image_fd = setup_image(&device_path, &loop_nr);
2902 if (image_fd < 0) {
2903 r = image_fd;
842f3b0f
LP
2904 goto finish;
2905 }
1b9e5b12 2906
4d9f07b4
LP
2907 r = dissect_image(image_fd,
2908 &root_device, &root_device_rw,
2909 &home_device, &home_device_rw,
2910 &srv_device, &srv_device_rw,
2911 &secondary);
1b9e5b12
LP
2912 if (r < 0)
2913 goto finish;
842f3b0f 2914 }
842f3b0f 2915
db7feb7e
LP
2916 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2917 if (master < 0) {
56f64d95 2918 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
2919 goto finish;
2920 }
2921
db7feb7e
LP
2922 console = ptsname(master);
2923 if (!console) {
56f64d95 2924 log_error_errno(errno, "Failed to determine tty name: %m");
a258bf26
LP
2925 goto finish;
2926 }
2927
284c0b91 2928 if (!arg_quiet)
45f1386c
ZJS
2929 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2930 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2931
2932 if (unlockpt(master) < 0) {
56f64d95 2933 log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
2934 goto finish;
2935 }
2936
e58a1277 2937 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
56f64d95 2938 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
354bfd2b
LP
2939 goto finish;
2940 }
2941
af4ec430
LP
2942 sd_notify(false,
2943 "READY=1\n"
2944 "STATUS=Container running.");
05947bef 2945
a258bf26
LP
2946 assert_se(sigemptyset(&mask) == 0);
2947 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2948 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2949
023fb90b
LP
2950 assert_se(sigemptyset(&mask_chld) == 0);
2951 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2952
d87be9b0 2953 for (;;) {
113cea80 2954 ContainerStatus container_status;
7566e267 2955 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
2956 struct sigaction sa = {
2957 .sa_handler = nop_handler,
2958 .sa_flags = SA_NOCLDSTOP,
2959 };
2960
7566e267 2961 r = barrier_create(&barrier);
a2da110b 2962 if (r < 0) {
da927ba9 2963 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
2964 goto finish;
2965 }
2966
e866af3a
DH
2967 /* Child can be killed before execv(), so handle SIGCHLD
2968 * in order to interrupt parent's blocking calls and
2969 * give it a chance to call wait() and terminate. */
2970 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2971 if (r < 0) {
56f64d95 2972 log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
2973 goto finish;
2974 }
2975
e866af3a
DH
2976 r = sigaction(SIGCHLD, &sa, NULL);
2977 if (r < 0) {
56f64d95 2978 log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
2979 goto finish;
2980 }
2981
a2da110b
DH
2982 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2983 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2984 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
2985 if (pid < 0) {
2986 if (errno == EINVAL)
56f64d95 2987 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 2988 else
56f64d95 2989 log_error_errno(errno, "clone() failed: %m");
a258bf26 2990
e866af3a 2991 r = pid;
d87be9b0
LP
2992 goto finish;
2993 }
a258bf26 2994
d87be9b0
LP
2995 if (pid == 0) {
2996 /* child */
0cb9fbcd 2997 _cleanup_free_ char *home = NULL;
5674767e 2998 unsigned n_env = 2;
d87be9b0 2999 const char *envp[] = {
e10a55fd 3000 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3001 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3002 NULL, /* TERM */
3003 NULL, /* HOME */
3004 NULL, /* USER */
3005 NULL, /* LOGNAME */
3006 NULL, /* container_uuid */
842f3b0f
LP
3007 NULL, /* LISTEN_FDS */
3008 NULL, /* LISTEN_PID */
d87be9b0
LP
3009 NULL
3010 };
f4889f65 3011 char **env_use;
a258bf26 3012
a2da110b
DH
3013 barrier_set_role(&barrier, BARRIER_CHILD);
3014
5674767e
ZJS
3015 envp[n_env] = strv_find_prefix(environ, "TERM=");
3016 if (envp[n_env])
3017 n_env ++;
a258bf26 3018
03e334a1 3019 master = safe_close(master);
a258bf26 3020
d87be9b0
LP
3021 close_nointr(STDIN_FILENO);
3022 close_nointr(STDOUT_FILENO);
3023 close_nointr(STDERR_FILENO);
db7feb7e 3024
03e334a1 3025 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3026
d87be9b0 3027 reset_all_signal_handlers();
1b6d7fa7 3028 reset_signal_mask();
f5c1b9ee 3029
842f3b0f
LP
3030 k = open_terminal(console, O_RDWR);
3031 if (k != STDIN_FILENO) {
3032 if (k >= 0) {
03e334a1 3033 safe_close(k);
842f3b0f
LP
3034 k = -EINVAL;
3035 }
3036
da927ba9 3037 log_error_errno(k, "Failed to open console: %m");
a2da110b 3038 _exit(EXIT_FAILURE);
842f3b0f
LP
3039 }
3040
3041 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3042 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
56f64d95 3043 log_error_errno(errno, "Failed to duplicate console: %m");
a2da110b 3044 _exit(EXIT_FAILURE);
842f3b0f 3045 }
bc2f673e 3046
d87be9b0 3047 if (setsid() < 0) {
56f64d95 3048 log_error_errno(errno, "setsid() failed: %m");
a2da110b 3049 _exit(EXIT_FAILURE);
bc2f673e
LP
3050 }
3051
db999e0f 3052 if (reset_audit_loginuid() < 0)
a2da110b 3053 _exit(EXIT_FAILURE);
db999e0f 3054
d87be9b0 3055 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 3056 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 3057 _exit(EXIT_FAILURE);
d87be9b0 3058 }
e58a1277 3059
d87be9b0
LP
3060 /* Mark everything as slave, so that we still
3061 * receive mounts from the real root, but don't
3062 * propagate mounts to the real root. */
3063 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 3064 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 3065 _exit(EXIT_FAILURE);
d87be9b0 3066 }
04bc4a3f 3067
727fd4fd
LP
3068 if (mount_devices(arg_directory,
3069 root_device, root_device_rw,
3070 home_device, home_device_rw,
3071 srv_device, srv_device_rw) < 0)
a2da110b 3072 _exit(EXIT_FAILURE);
1b9e5b12 3073
d87be9b0
LP
3074 /* Turn directory into bind mount */
3075 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 3076 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 3077 _exit(EXIT_FAILURE);
d87be9b0 3078 }
88213476 3079
4d9f07b4
LP
3080 r = setup_volatile(arg_directory);
3081 if (r < 0)
a2da110b 3082 _exit(EXIT_FAILURE);
4d9f07b4
LP
3083
3084 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3085 _exit(EXIT_FAILURE);
4d9f07b4
LP
3086
3087 r = base_filesystem_create(arg_directory);
3088 if (r < 0)
a2da110b 3089 _exit(EXIT_FAILURE);
4d9f07b4 3090
d6797c92
LP
3091 if (arg_read_only) {
3092 k = bind_remount_recursive(arg_directory, true);
3093 if (k < 0) {
da927ba9 3094 log_error_errno(k, "Failed to make tree read-only: %m");
a2da110b 3095 _exit(EXIT_FAILURE);
d87be9b0 3096 }
d6797c92 3097 }
2547bb41 3098
d87be9b0 3099 if (mount_all(arg_directory) < 0)
a2da110b 3100 _exit(EXIT_FAILURE);
57fb9fb5 3101
d87be9b0 3102 if (copy_devnodes(arg_directory) < 0)
a2da110b 3103 _exit(EXIT_FAILURE);
a258bf26 3104
f2d88580 3105 if (setup_ptmx(arg_directory) < 0)
a2da110b 3106 _exit(EXIT_FAILURE);
f2d88580 3107
d87be9b0 3108 dev_setup(arg_directory);
88213476 3109
28650077 3110 if (setup_seccomp() < 0)
a2da110b 3111 _exit(EXIT_FAILURE);
24fb1112 3112
d87be9b0 3113 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3114 _exit(EXIT_FAILURE);
88213476 3115
d87be9b0 3116 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3117 _exit(EXIT_FAILURE);
88213476 3118
03e334a1 3119 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3120
d87be9b0 3121 if (setup_boot_id(arg_directory) < 0)
a2da110b 3122 _exit(EXIT_FAILURE);
a41fe3a2 3123
d87be9b0 3124 if (setup_timezone(arg_directory) < 0)
a2da110b 3125 _exit(EXIT_FAILURE);
88213476 3126
d87be9b0 3127 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3128 _exit(EXIT_FAILURE);
687d0825 3129
d87be9b0 3130 if (setup_journal(arg_directory) < 0)
a2da110b 3131 _exit(EXIT_FAILURE);
687d0825 3132
d6797c92 3133 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3134 _exit(EXIT_FAILURE);
17fe0523 3135
d6797c92 3136 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3137 _exit(EXIT_FAILURE);
17fe0523 3138
06c17c39 3139 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3140 _exit(EXIT_FAILURE);
06c17c39 3141
d96c1ecf
LP
3142 /* Tell the parent that we are ready, and that
3143 * it can cgroupify us so that we lack access
3144 * to certain devices and resources. */
dfb05a1c 3145 (void)barrier_place(&barrier);
d96c1ecf 3146
d87be9b0 3147 if (chdir(arg_directory) < 0) {
56f64d95 3148 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 3149 _exit(EXIT_FAILURE);
687d0825
MV
3150 }
3151
d87be9b0 3152 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 3153 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 3154 _exit(EXIT_FAILURE);
687d0825
MV
3155 }
3156
d87be9b0 3157 if (chroot(".") < 0) {
56f64d95 3158 log_error_errno(errno, "chroot() failed: %m");
a2da110b 3159 _exit(EXIT_FAILURE);
687d0825
MV
3160 }
3161
d87be9b0 3162 if (chdir("/") < 0) {
56f64d95 3163 log_error_errno(errno, "chdir() failed: %m");
a2da110b 3164 _exit(EXIT_FAILURE);
687d0825
MV
3165 }
3166
d87be9b0
LP
3167 umask(0022);
3168
eb91eb18
LP
3169 if (arg_private_network)
3170 loopback_setup();
d87be9b0
LP
3171
3172 if (drop_capabilities() < 0) {
56f64d95 3173 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 3174 _exit(EXIT_FAILURE);
687d0825 3175 }
687d0825 3176
0cb9fbcd
LP
3177 r = change_uid_gid(&home);
3178 if (r < 0)
a2da110b 3179 _exit(EXIT_FAILURE);
d87be9b0 3180
842f3b0f
LP
3181 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3182 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3183 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3184 log_oom();
a2da110b 3185 _exit(EXIT_FAILURE);
144f0fc0 3186 }
687d0825 3187
9444b1f2 3188 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3189 char as_uuid[37];
3190
3191 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3192 log_oom();
a2da110b 3193 _exit(EXIT_FAILURE);
842f3b0f
LP
3194 }
3195 }
3196
3197 if (fdset_size(fds) > 0) {
3198 k = fdset_cloexec(fds, false);
3199 if (k < 0) {
3200 log_error("Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3201 _exit(EXIT_FAILURE);
842f3b0f
LP
3202 }
3203
3204 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3205 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3206 log_oom();
a2da110b 3207 _exit(EXIT_FAILURE);
d87be9b0
LP
3208 }
3209 }
3210
3211 setup_hostname();
3212
6afc95b7
LP
3213 if (arg_personality != 0xffffffffLU) {
3214 if (personality(arg_personality) < 0) {
56f64d95 3215 log_error_errno(errno, "personality() failed: %m");
a2da110b 3216 _exit(EXIT_FAILURE);
6afc95b7 3217 }
1b9e5b12
LP
3218 } else if (secondary) {
3219 if (personality(PER_LINUX32) < 0) {
56f64d95 3220 log_error_errno(errno, "personality() failed: %m");
a2da110b 3221 _exit(EXIT_FAILURE);
1b9e5b12 3222 }
6afc95b7
LP
3223 }
3224
d96c1ecf
LP
3225#ifdef HAVE_SELINUX
3226 if (arg_selinux_context)
0cb9fbcd 3227 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
56f64d95 3228 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3229 _exit(EXIT_FAILURE);
0cb9fbcd 3230 }
d96c1ecf 3231#endif
354bfd2b 3232
f4889f65
LP
3233 if (!strv_isempty(arg_setenv)) {
3234 char **n;
3235
3236 n = strv_env_merge(2, envp, arg_setenv);
3237 if (!n) {
3238 log_oom();
a2da110b 3239 _exit(EXIT_FAILURE);
f4889f65
LP
3240 }
3241
3242 env_use = n;
3243 } else
3244 env_use = (char**) envp;
3245
d96c1ecf 3246 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3247 if (!barrier_place_and_sync(&barrier))
3248 _exit(EXIT_FAILURE);
d96c1ecf 3249
d87be9b0
LP
3250 if (arg_boot) {
3251 char **a;
3252 size_t l;
88213476 3253
d87be9b0 3254 /* Automatically search for the init system */
0f0dbc46 3255
d87be9b0
LP
3256 l = 1 + argc - optind;
3257 a = newa(char*, l + 1);
3258 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3259
d87be9b0 3260 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3261 execve(a[0], a, env_use);
0f0dbc46 3262
d87be9b0 3263 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3264 execve(a[0], a, env_use);
0f0dbc46 3265
d87be9b0 3266 a[0] = (char*) "/sbin/init";
f4889f65 3267 execve(a[0], a, env_use);
d87be9b0 3268 } else if (argc > optind)
f4889f65 3269 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3270 else {
3271 chdir(home ? home : "/root");
f4889f65 3272 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3273 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3274 }
3275
56f64d95 3276 log_error_errno(errno, "execv() failed: %m");
d87be9b0 3277 _exit(EXIT_FAILURE);
da5b3bad 3278 }
88213476 3279
a2da110b 3280 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3281 fdset_free(fds);
3282 fds = NULL;
3283
a2da110b
DH
3284 /* wait for child-setup to be done */
3285 if (barrier_place_and_sync(&barrier)) {
023fb90b
LP
3286 _cleanup_event_unref_ sd_event *event = NULL;
3287 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
5aa4bb6b 3288 int ifi = 0;
354bfd2b 3289
840295fc
LP
3290 r = move_network_interfaces(pid);
3291 if (r < 0)
3292 goto finish;
aa28aefe 3293
5aa4bb6b 3294 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3295 if (r < 0)
3296 goto finish;
ab046dde 3297
5aa4bb6b 3298 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3299 if (r < 0)
3300 goto finish;
ab046dde 3301
840295fc
LP
3302 r = setup_macvlan(pid);
3303 if (r < 0)
3304 goto finish;
c74e630d 3305
5aa4bb6b
LP
3306 r = register_machine(pid, ifi);
3307 if (r < 0)
3308 goto finish;
3309
840295fc
LP
3310 /* Block SIGCHLD here, before notifying child.
3311 * process_pty() will handle it with the other signals. */
3312 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3313 if (r < 0)
3314 goto finish;
e866af3a 3315
840295fc
LP
3316 /* Reset signal to default */
3317 r = default_signals(SIGCHLD, -1);
3318 if (r < 0)
3319 goto finish;
e866af3a 3320
840295fc
LP
3321 /* Notify the child that the parent is ready with all
3322 * its setup, and that the child can now hand over
3323 * control to the code to run inside the container. */
dfb05a1c 3324 (void)barrier_place(&barrier);
354bfd2b 3325
023fb90b
LP
3326 r = sd_event_new(&event);
3327 if (r < 0) {
da927ba9 3328 log_error_errno(r, "Failed to get default event source: %m");
023fb90b 3329 goto finish;
840295fc 3330 }
88213476 3331
023fb90b
LP
3332 if (arg_boot) {
3333 /* Try to kill the init system on SIGINT or SIGTERM */
3334 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3335 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3336 } else {
3337 /* Immediately exit */
3338 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3339 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3340 }
3341
3342 /* simply exit on sigchld */
3343 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3344
3345 r = pty_forward_new(event, master, &forward);
3346 if (r < 0) {
da927ba9 3347 log_error_errno(r, "Failed to create PTY forwarder: %m");
023fb90b
LP
3348 goto finish;
3349 }
3350
3351 r = sd_event_loop(event);
f647962d
MS
3352 if (r < 0)
3353 return log_error_errno(r, "Failed to run event loop: %m");
023fb90b
LP
3354
3355 forward = pty_forward_free(forward);
3356
840295fc
LP
3357 if (!arg_quiet)
3358 putc('\n', stdout);
04d39279 3359
840295fc
LP
3360 /* Kill if it is not dead yet anyway */
3361 terminate_machine(pid);
3362 }
1f0cd86b 3363
840295fc 3364 /* Normally redundant, but better safe than sorry */
04d39279 3365 kill(pid, SIGKILL);
a258bf26 3366
113cea80 3367 r = wait_for_container(pid, &container_status);
04d39279
LP
3368 pid = 0;
3369
ce9f1527
LP
3370 if (r < 0) {
3371 /* We failed to wait for the container, or the
3372 * container exited abnormally */
3373 r = EXIT_FAILURE;
d87be9b0 3374 break;
ce9f1527
LP
3375 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3376 /* The container exited with a non-zero
3377 * status, or with zero status and no reboot
3378 * was requested. */
d87be9b0 3379 break;
88213476 3380
113cea80 3381 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3382
3383 if (arg_keep_unit) {
3384 /* Special handling if we are running as a
3385 * service: instead of simply restarting the
3386 * machine we want to restart the entire
3387 * service, so let's inform systemd about this
3388 * with the special exit code 133. The service
3389 * file uses RestartForceExitStatus=133 so
3390 * that this results in a full nspawn
3391 * restart. This is necessary since we might
3392 * have cgroup parameters set we want to have
3393 * flushed out. */
3394 r = 133;
3395 break;
3396 }
d87be9b0 3397 }
88213476
LP
3398
3399finish:
af4ec430
LP
3400 sd_notify(false,
3401 "STOPPING=1\n"
3402 "STATUS=Terminating...");
3403
1b9e5b12
LP
3404 loop_remove(loop_nr, &image_fd);
3405
9444b1f2
LP
3406 if (pid > 0)
3407 kill(pid, SIGKILL);
88213476 3408
04d391da 3409 free(arg_directory);
7027ff61 3410 free(arg_machine);
c74e630d
LP
3411 free(arg_user);
3412 strv_free(arg_setenv);
3413 strv_free(arg_network_interfaces);
3414 strv_free(arg_network_macvlan);
3415 strv_free(arg_bind);
3416 strv_free(arg_bind_ro);
06c17c39 3417 strv_free(arg_tmpfs);
88213476
LP
3418
3419 return r;
3420}