]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
journald: fix flushing
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <net/if.h>
69c79d3c 44#include <linux/veth.h>
6afc95b7 45#include <sys/personality.h>
1b9e5b12 46#include <linux/loop.h>
aa28aefe 47
5d63309c 48#ifdef HAVE_SELINUX
a8828ed9
DW
49#include <selinux/selinux.h>
50#endif
88213476 51
24fb1112
LP
52#ifdef HAVE_SECCOMP
53#include <seccomp.h>
54#endif
55
1b9e5b12
LP
56#ifdef HAVE_BLKID
57#include <blkid/blkid.h>
58#endif
59
1f0cd86b
LP
60#include "sd-daemon.h"
61#include "sd-bus.h"
62#include "sd-id128.h"
aa28aefe 63#include "sd-rtnl.h"
88213476
LP
64#include "log.h"
65#include "util.h"
49e942b2 66#include "mkdir.h"
6b2d0e85 67#include "macro.h"
d7832d2c 68#include "audit.h"
94d82985 69#include "missing.h"
04d391da 70#include "cgroup-util.h"
a258bf26 71#include "strv.h"
9eb977db 72#include "path-util.h"
a41fe3a2 73#include "loopback-setup.h"
4fc9982c 74#include "dev-setup.h"
842f3b0f 75#include "fdset.h"
acbeb427 76#include "build.h"
a5c32cff 77#include "fileio.h"
40ca29a1 78#include "bus-util.h"
1f0cd86b 79#include "bus-error.h"
4ba93280 80#include "ptyfwd.h"
9bd37b40 81#include "bus-kernel.h"
f4889f65 82#include "env-util.h"
7f112f50 83#include "def.h"
aa28aefe 84#include "rtnl-util.h"
7e227024 85#include "udev-util.h"
1b9e5b12
LP
86#include "blkid-util.h"
87#include "gpt.h"
01dde061 88#include "siphash24.h"
849958d1 89#include "copy.h"
3577de7a 90#include "base-filesystem.h"
a2da110b 91#include "barrier.h"
f2d88580 92
e9642be2
LP
93#ifdef HAVE_SECCOMP
94#include "seccomp-util.h"
95#endif
96
113cea80
DH
/* End states of a container run.
 * NOTE(review): not referenced in the visible part of this file; presumably
 * tells a plain termination apart from a reboot request -- confirm at the
 * use site. */
97typedef enum ContainerStatus {
98 CONTAINER_TERMINATED,
99 CONTAINER_REBOOTED
100} ContainerStatus;
101
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= ("no", "auto", "host",
 * "guest") in parse_argv(); the -j switch selects LINK_GUEST. */
102typedef enum LinkJournal {
103 LINK_NO,
104 LINK_AUTO,
105 LINK_HOST,
106 LINK_GUEST
107} LinkJournal;
88213476 108
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE] in parse_argv():
 *   VOLATILE_NO    - nothing special is done
 *   VOLATILE_YES   - setup_volatile() puts a tmpfs at the root and bind mounts
 *                    the original /usr into it read-only
 *   VOLATILE_STATE - setup_volatile_state() remounts the tree read-only and
 *                    mounts a tmpfs on /var */
109typedef enum Volatile {
110 VOLATILE_NO,
111 VOLATILE_YES,
112 VOLATILE_STATE,
113} Volatile;
114
/* Run-time settings with their defaults; parse_argv() overrides them from the
 * command line. */
88213476 115static char *arg_directory = NULL;
687d0825 116static char *arg_user = NULL;
9444b1f2 117static sd_id128_t arg_uuid = {};
7027ff61 118static char *arg_machine = NULL;
c74e630d
LP
119static const char *arg_selinux_context = NULL;
120static const char *arg_selinux_apifs_context = NULL;
9444b1f2 121static const char *arg_slice = NULL;
ff01d048 122static bool arg_private_network = false;
bc2f673e 123static bool arg_read_only = false;
0f0dbc46 124static bool arg_boot = false;
57fb9fb5 125static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
/* Default capability bit mask. parse_argv() ORs in the --capability= bits
 * (plus CAP_NET_ADMIN when a private network is requested) and then clears
 * the --drop-capability= bits. */
126static uint64_t arg_retain =
127 (1ULL << CAP_CHOWN) |
128 (1ULL << CAP_DAC_OVERRIDE) |
129 (1ULL << CAP_DAC_READ_SEARCH) |
130 (1ULL << CAP_FOWNER) |
131 (1ULL << CAP_FSETID) |
132 (1ULL << CAP_IPC_OWNER) |
133 (1ULL << CAP_KILL) |
134 (1ULL << CAP_LEASE) |
135 (1ULL << CAP_LINUX_IMMUTABLE) |
136 (1ULL << CAP_NET_BIND_SERVICE) |
137 (1ULL << CAP_NET_BROADCAST) |
138 (1ULL << CAP_NET_RAW) |
139 (1ULL << CAP_SETGID) |
140 (1ULL << CAP_SETFCAP) |
141 (1ULL << CAP_SETPCAP) |
142 (1ULL << CAP_SETUID) |
143 (1ULL << CAP_SYS_ADMIN) |
144 (1ULL << CAP_SYS_CHROOT) |
145 (1ULL << CAP_SYS_NICE) |
146 (1ULL << CAP_SYS_PTRACE) |
147 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 148 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
149 (1ULL << CAP_SYS_BOOT) |
150 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
151 (1ULL << CAP_AUDIT_CONTROL) |
152 (1ULL << CAP_MKNOD);
17fe0523
LP
/* arg_bind, arg_bind_ro and arg_tmpfs are filled pairwise by parse_argv():
 * (source, destination) for the bind lists, (path, mount options) for tmpfs. */
153static char **arg_bind = NULL;
154static char **arg_bind_ro = NULL;
06c17c39 155static char **arg_tmpfs = NULL;
f4889f65 156static char **arg_setenv = NULL;
284c0b91 157static bool arg_quiet = false;
8a96d94e 158static bool arg_share_system = false;
eb91eb18 159static bool arg_register = true;
89f7c846 160static bool arg_keep_unit = false;
aa28aefe 161static char **arg_network_interfaces = NULL;
c74e630d 162static char **arg_network_macvlan = NULL;
69c79d3c 163static bool arg_network_veth = false;
c74e630d 164static const char *arg_network_bridge = NULL;
/* 0xffffffff is also the value parse_argv() treats as a failed
 * personality_from_string() lookup. */
6afc95b7 165static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 166static const char *arg_image = NULL;
4d9f07b4 167static Volatile arg_volatile = VOLATILE_NO;
88213476 168
601185b4 169static void help(void) {
88213476
LP
170 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
172 " -h --help Show this help\n"
173 " --version Print version string\n"
69c79d3c 174 " -q --quiet Do not show status information\n"
1b9e5b12
LP
175 " -D --directory=PATH Root directory for the container\n"
176 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
177 " -b --boot Boot up full system (i.e. invoke init)\n"
178 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 179 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 180 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 181 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
182 " --private-network Disable network in container\n"
183 " --network-interface=INTERFACE\n"
184 " Assign an existing network interface to the\n"
185 " container\n"
c74e630d
LP
186 " --network-macvlan=INTERFACE\n"
187 " Create a macvlan network interface based on an\n"
188 " existing network interface to the container\n"
32457153 189 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 190 " and container\n"
ab046dde 191 " --network-bridge=INTERFACE\n"
32457153 192 " Add a virtual ethernet connection between host\n"
ab046dde
TG
193 " and container and add it to an existing bridge on\n"
194 " the host\n"
82adf6af
LP
195 " -Z --selinux-context=SECLABEL\n"
196 " Set the SELinux security context to be used by\n"
197 " processes in the container\n"
198 " -L --selinux-apifs-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " API/tmpfs file systems in the container\n"
a8828ed9
DW
201 " --capability=CAP In addition to the default, retain specified\n"
202 " capability\n"
203 " --drop-capability=CAP Drop the specified capability from the default set\n"
204 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
205 " -j Equivalent to --link-journal=host\n"
69c79d3c 206 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
207 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
208 " the container\n"
209 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 210 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 211 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 212 " --share-system Share system namespaces with host\n"
eb91eb18 213 " --register=BOOLEAN Register container as machine\n"
89f7c846 214 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4
LP
215 " the service unit nspawn is running in\n"
216 " --volatile[=MODE] Run the system in volatile mode\n",
88213476 217 program_invocation_short_name);
88213476
LP
218}
219
220static int parse_argv(int argc, char *argv[]) {
221
a41fe3a2 222 enum {
acbeb427
ZJS
223 ARG_VERSION = 0x100,
224 ARG_PRIVATE_NETWORK,
bc2f673e 225 ARG_UUID,
5076f0cc 226 ARG_READ_ONLY,
57fb9fb5 227 ARG_CAPABILITY,
420c7379 228 ARG_DROP_CAPABILITY,
17fe0523
LP
229 ARG_LINK_JOURNAL,
230 ARG_BIND,
f4889f65 231 ARG_BIND_RO,
06c17c39 232 ARG_TMPFS,
f4889f65 233 ARG_SETENV,
eb91eb18 234 ARG_SHARE_SYSTEM,
89f7c846 235 ARG_REGISTER,
aa28aefe 236 ARG_KEEP_UNIT,
69c79d3c 237 ARG_NETWORK_INTERFACE,
c74e630d 238 ARG_NETWORK_MACVLAN,
69c79d3c 239 ARG_NETWORK_VETH,
ab046dde 240 ARG_NETWORK_BRIDGE,
6afc95b7 241 ARG_PERSONALITY,
4d9f07b4 242 ARG_VOLATILE,
a41fe3a2
LP
243 };
244
88213476 245 static const struct option options[] = {
aa28aefe
LP
246 { "help", no_argument, NULL, 'h' },
247 { "version", no_argument, NULL, ARG_VERSION },
248 { "directory", required_argument, NULL, 'D' },
249 { "user", required_argument, NULL, 'u' },
250 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
251 { "boot", no_argument, NULL, 'b' },
252 { "uuid", required_argument, NULL, ARG_UUID },
253 { "read-only", no_argument, NULL, ARG_READ_ONLY },
254 { "capability", required_argument, NULL, ARG_CAPABILITY },
255 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
256 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
257 { "bind", required_argument, NULL, ARG_BIND },
258 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 259 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
260 { "machine", required_argument, NULL, 'M' },
261 { "slice", required_argument, NULL, 'S' },
262 { "setenv", required_argument, NULL, ARG_SETENV },
263 { "selinux-context", required_argument, NULL, 'Z' },
264 { "selinux-apifs-context", required_argument, NULL, 'L' },
265 { "quiet", no_argument, NULL, 'q' },
266 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
267 { "register", required_argument, NULL, ARG_REGISTER },
268 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
269 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 270 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
271 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
272 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 273 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 274 { "image", required_argument, NULL, 'i' },
4d9f07b4 275 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 276 {}
88213476
LP
277 };
278
9444b1f2 279 int c, r;
a42c8b54 280 uint64_t plus = 0, minus = 0;
88213476
LP
281
282 assert(argc >= 0);
283 assert(argv);
284
601185b4 285 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
88213476
LP
286
287 switch (c) {
288
289 case 'h':
601185b4
ZJS
290 help();
291 return 0;
88213476 292
acbeb427
ZJS
293 case ARG_VERSION:
294 puts(PACKAGE_STRING);
295 puts(SYSTEMD_FEATURES);
296 return 0;
297
88213476
LP
298 case 'D':
299 free(arg_directory);
3a74cea5
LP
300 arg_directory = canonicalize_file_name(optarg);
301 if (!arg_directory) {
898d5c91 302 log_error("Invalid root directory: %m");
88213476
LP
303 return -ENOMEM;
304 }
305
306 break;
307
1b9e5b12
LP
308 case 'i':
309 arg_image = optarg;
310 break;
311
687d0825
MV
312 case 'u':
313 free(arg_user);
7027ff61
LP
314 arg_user = strdup(optarg);
315 if (!arg_user)
316 return log_oom();
687d0825
MV
317
318 break;
319
ab046dde 320 case ARG_NETWORK_BRIDGE:
c74e630d 321 arg_network_bridge = optarg;
ab046dde
TG
322
323 /* fall through */
324
69c79d3c
LP
325 case ARG_NETWORK_VETH:
326 arg_network_veth = true;
327 arg_private_network = true;
328 break;
329
aa28aefe 330 case ARG_NETWORK_INTERFACE:
c74e630d
LP
331 if (strv_extend(&arg_network_interfaces, optarg) < 0)
332 return log_oom();
333
334 arg_private_network = true;
335 break;
336
337 case ARG_NETWORK_MACVLAN:
338 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
339 return log_oom();
340
341 /* fall through */
342
ff01d048
LP
343 case ARG_PRIVATE_NETWORK:
344 arg_private_network = true;
a41fe3a2
LP
345 break;
346
0f0dbc46
LP
347 case 'b':
348 arg_boot = true;
349 break;
350
144f0fc0 351 case ARG_UUID:
9444b1f2
LP
352 r = sd_id128_from_string(optarg, &arg_uuid);
353 if (r < 0) {
aa96c6cb 354 log_error("Invalid UUID: %s", optarg);
9444b1f2 355 return r;
aa96c6cb 356 }
9444b1f2 357 break;
aa96c6cb 358
9444b1f2 359 case 'S':
c74e630d 360 arg_slice = optarg;
144f0fc0
LP
361 break;
362
7027ff61 363 case 'M':
eb91eb18
LP
364 if (isempty(optarg)) {
365 free(arg_machine);
366 arg_machine = NULL;
367 } else {
7027ff61 368
eb91eb18
LP
369 if (!hostname_is_valid(optarg)) {
370 log_error("Invalid machine name: %s", optarg);
371 return -EINVAL;
372 }
7027ff61 373
eb91eb18
LP
374 free(arg_machine);
375 arg_machine = strdup(optarg);
376 if (!arg_machine)
377 return log_oom();
378
379 break;
380 }
7027ff61 381
82adf6af
LP
382 case 'Z':
383 arg_selinux_context = optarg;
a8828ed9
DW
384 break;
385
82adf6af
LP
386 case 'L':
387 arg_selinux_apifs_context = optarg;
a8828ed9
DW
388 break;
389
bc2f673e
LP
390 case ARG_READ_ONLY:
391 arg_read_only = true;
392 break;
393
420c7379
LP
394 case ARG_CAPABILITY:
395 case ARG_DROP_CAPABILITY: {
a2a5291b 396 const char *state, *word;
5076f0cc
LP
397 size_t length;
398
399 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 400 _cleanup_free_ char *t;
5076f0cc 401 cap_value_t cap;
5076f0cc
LP
402
403 t = strndup(word, length);
0d0f0c50
SL
404 if (!t)
405 return log_oom();
5076f0cc 406
39ed67d1
LP
407 if (streq(t, "all")) {
408 if (c == ARG_CAPABILITY)
a42c8b54 409 plus = (uint64_t) -1;
39ed67d1 410 else
a42c8b54 411 minus = (uint64_t) -1;
39ed67d1
LP
412 } else {
413 if (cap_from_name(t, &cap) < 0) {
414 log_error("Failed to parse capability %s.", t);
415 return -EINVAL;
416 }
417
418 if (c == ARG_CAPABILITY)
a42c8b54 419 plus |= 1ULL << (uint64_t) cap;
39ed67d1 420 else
a42c8b54 421 minus |= 1ULL << (uint64_t) cap;
5076f0cc 422 }
5076f0cc
LP
423 }
424
425 break;
426 }
427
57fb9fb5
LP
428 case 'j':
429 arg_link_journal = LINK_GUEST;
430 break;
431
432 case ARG_LINK_JOURNAL:
433 if (streq(optarg, "auto"))
434 arg_link_journal = LINK_AUTO;
435 else if (streq(optarg, "no"))
436 arg_link_journal = LINK_NO;
437 else if (streq(optarg, "guest"))
438 arg_link_journal = LINK_GUEST;
439 else if (streq(optarg, "host"))
440 arg_link_journal = LINK_HOST;
441 else {
442 log_error("Failed to parse link journal mode %s", optarg);
443 return -EINVAL;
444 }
445
446 break;
447
17fe0523
LP
448 case ARG_BIND:
449 case ARG_BIND_RO: {
450 _cleanup_free_ char *a = NULL, *b = NULL;
451 char *e;
452 char ***x;
17fe0523
LP
453
454 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456 e = strchr(optarg, ':');
457 if (e) {
458 a = strndup(optarg, e - optarg);
459 b = strdup(e + 1);
460 } else {
461 a = strdup(optarg);
462 b = strdup(optarg);
463 }
464
465 if (!a || !b)
466 return log_oom();
467
468 if (!path_is_absolute(a) || !path_is_absolute(b)) {
469 log_error("Invalid bind mount specification: %s", optarg);
470 return -EINVAL;
471 }
472
473 r = strv_extend(x, a);
474 if (r < 0)
b3451bed 475 return log_oom();
17fe0523
LP
476
477 r = strv_extend(x, b);
478 if (r < 0)
b3451bed 479 return log_oom();
17fe0523
LP
480
481 break;
482 }
483
06c17c39
LP
484 case ARG_TMPFS: {
485 _cleanup_free_ char *a = NULL, *b = NULL;
486 char *e;
487
488 e = strchr(optarg, ':');
489 if (e) {
490 a = strndup(optarg, e - optarg);
491 b = strdup(e + 1);
492 } else {
493 a = strdup(optarg);
494 b = strdup("mode=0755");
495 }
496
497 if (!a || !b)
498 return log_oom();
499
500 if (!path_is_absolute(a)) {
501 log_error("Invalid tmpfs specification: %s", optarg);
502 return -EINVAL;
503 }
504
505 r = strv_push(&arg_tmpfs, a);
506 if (r < 0)
507 return log_oom();
508
509 a = NULL;
510
511 r = strv_push(&arg_tmpfs, b);
512 if (r < 0)
513 return log_oom();
514
515 b = NULL;
516
517 break;
518 }
519
f4889f65
LP
520 case ARG_SETENV: {
521 char **n;
522
523 if (!env_assignment_is_valid(optarg)) {
524 log_error("Environment variable assignment '%s' is not valid.", optarg);
525 return -EINVAL;
526 }
527
528 n = strv_env_set(arg_setenv, optarg);
529 if (!n)
530 return log_oom();
531
532 strv_free(arg_setenv);
533 arg_setenv = n;
534 break;
535 }
536
284c0b91
LP
537 case 'q':
538 arg_quiet = true;
539 break;
540
8a96d94e
LP
541 case ARG_SHARE_SYSTEM:
542 arg_share_system = true;
543 break;
544
eb91eb18
LP
545 case ARG_REGISTER:
546 r = parse_boolean(optarg);
547 if (r < 0) {
548 log_error("Failed to parse --register= argument: %s", optarg);
549 return r;
550 }
551
552 arg_register = r;
553 break;
554
89f7c846
LP
555 case ARG_KEEP_UNIT:
556 arg_keep_unit = true;
557 break;
558
6afc95b7
LP
559 case ARG_PERSONALITY:
560
ac45f971 561 arg_personality = personality_from_string(optarg);
6afc95b7
LP
562 if (arg_personality == 0xffffffffLU) {
563 log_error("Unknown or unsupported personality '%s'.", optarg);
564 return -EINVAL;
565 }
566
567 break;
568
4d9f07b4
LP
569 case ARG_VOLATILE:
570
571 if (!optarg)
572 arg_volatile = VOLATILE_YES;
573 else {
574 r = parse_boolean(optarg);
575 if (r < 0) {
576 if (streq(optarg, "state"))
577 arg_volatile = VOLATILE_STATE;
578 else {
579 log_error("Failed to parse --volatile= argument: %s", optarg);
580 return r;
581 }
582 } else
583 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584 }
585
586 break;
587
88213476
LP
588 case '?':
589 return -EINVAL;
590
591 default:
eb9da376 592 assert_not_reached("Unhandled option");
88213476 593 }
88213476 594
eb91eb18
LP
595 if (arg_share_system)
596 arg_register = false;
597
598 if (arg_boot && arg_share_system) {
599 log_error("--boot and --share-system may not be combined.");
600 return -EINVAL;
601 }
602
89f7c846
LP
603 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604 log_error("--keep-unit may not be used when invoked from a user session.");
605 return -EINVAL;
606 }
607
1b9e5b12
LP
608 if (arg_directory && arg_image) {
609 log_error("--directory= and --image= may not be combined.");
610 return -EINVAL;
611 }
612
4d9f07b4
LP
613 if (arg_volatile != VOLATILE_NO && arg_read_only) {
614 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615 return -EINVAL;
616 }
617
a42c8b54
LP
618 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
88213476
LP
620 return 1;
621}
622
623static int mount_all(const char *dest) {
624
625 typedef struct MountPoint {
626 const char *what;
627 const char *where;
628 const char *type;
629 const char *options;
630 unsigned long flags;
3bd66c05 631 bool fatal;
88213476
LP
632 } MountPoint;
633
634 static const MountPoint mount_table[] = {
06c17c39
LP
635 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
636 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
637 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
638 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
639 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 640 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
641 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
642 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 643#ifdef HAVE_SELINUX
06c17c39
LP
644 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
645 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 646#endif
88213476
LP
647 };
648
649 unsigned k;
650 int r = 0;
651
652 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 653 _cleanup_free_ char *where = NULL;
d002827b 654#ifdef HAVE_SELINUX
a8828ed9 655 _cleanup_free_ char *options = NULL;
d002827b
LP
656#endif
657 const char *o;
88213476
LP
658 int t;
659
17fe0523
LP
660 where = strjoin(dest, "/", mount_table[k].where, NULL);
661 if (!where)
662 return log_oom();
88213476 663
e65aec12 664 t = path_is_mount_point(where, true);
68fb0892 665 if (t < 0) {
88213476 666 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
667
668 if (r == 0)
669 r = t;
670
671 continue;
672 }
673
9c1c7f71
LP
674 /* Skip this entry if it is not a remount. */
675 if (mount_table[k].what && t > 0)
014a9c77
LP
676 continue;
677
79d80fc1
TG
678 t = mkdir_p(where, 0755);
679 if (t < 0) {
680 if (mount_table[k].fatal) {
681 log_error("Failed to create directory %s: %s", where, strerror(-t));
682
683 if (r == 0)
684 r = t;
685 } else
686 log_warning("Failed to create directory %s: %s", where, strerror(-t));
687
688 continue;
689 }
88213476 690
a8828ed9 691#ifdef HAVE_SELINUX
82adf6af
LP
692 if (arg_selinux_apifs_context &&
693 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
694 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
695 if (!options)
696 return log_oom();
697
698 o = options;
699 } else
a8828ed9 700#endif
d002827b 701 o = mount_table[k].options;
a8828ed9 702
a8828ed9 703
88213476
LP
704 if (mount(mount_table[k].what,
705 where,
706 mount_table[k].type,
707 mount_table[k].flags,
79d80fc1 708 o) < 0) {
88213476 709
79d80fc1
TG
710 if (mount_table[k].fatal) {
711 log_error("mount(%s) failed: %m", where);
88213476 712
79d80fc1
TG
713 if (r == 0)
714 r = -errno;
715 } else
716 log_warning("mount(%s) failed: %m", where);
88213476 717 }
88213476
LP
718 }
719
e58a1277
LP
720 return r;
721}
f8440af5 722
d6797c92 723static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
724 char **x, **y;
725
726 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 727 _cleanup_free_ char *where = NULL;
d2421337 728 struct stat source_st, dest_st;
2ed4e5e0 729 int r;
d2421337
DR
730
731 if (stat(*x, &source_st) < 0) {
1b9e5b12 732 log_error("Failed to stat %s: %m", *x);
d2421337
DR
733 return -errno;
734 }
17fe0523 735
06c17c39
LP
736 where = strappend(dest, *y);
737 if (!where)
738 return log_oom();
739
2ed4e5e0
SL
740 r = stat(where, &dest_st);
741 if (r == 0) {
d2421337 742 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 743 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
744 return -EINVAL;
745 }
2ed4e5e0
SL
746 } else if (errno == ENOENT) {
747 r = mkdir_parents_label(where, 0755);
748 if (r < 0) {
749 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
750 return r;
d2421337 751 }
2ed4e5e0 752 } else {
590b6b91 753 log_error("Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
754 return -errno;
755 }
06c17c39 756
2ed4e5e0 757 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 758 * and char devices. */
79d80fc1
TG
759 if (S_ISDIR(source_st.st_mode)) {
760 r = mkdir_label(where, 0755);
761 if (r < 0) {
762 log_error("Failed to create mount point %s: %s", where, strerror(-r));
763
764 return r;
765 }
766 } else if (S_ISFIFO(source_st.st_mode)) {
767 r = mkfifo(where, 0644);
768 if (r < 0 && errno != EEXIST) {
769 log_error("Failed to create mount point %s: %m", where);
770
771 return -errno;
772 }
773 } else if (S_ISSOCK(source_st.st_mode)) {
774 r = mknod(where, 0644 | S_IFSOCK, 0);
775 if (r < 0 && errno != EEXIST) {
776 log_error("Failed to create mount point %s: %m", where);
777
778 return -errno;
779 }
780 } else if (S_ISREG(source_st.st_mode)) {
781 r = touch(where);
782 if (r < 0) {
783 log_error("Failed to create mount point %s: %s", where, strerror(-r));
784
785 return r;
786 }
787 } else {
2ed4e5e0
SL
788 log_error("Refusing to create mountpoint for file: %s", *x);
789 return -ENOTSUP;
d2421337 790 }
17fe0523
LP
791
792 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
793 log_error("mount(%s) failed: %m", where);
794 return -errno;
795 }
796
d6797c92
LP
797 if (ro) {
798 r = bind_remount_recursive(where, true);
799 if (r < 0) {
800 log_error("Read-Only bind mount failed: %s", strerror(-r));
801 return r;
802 }
17fe0523
LP
803 }
804 }
805
806 return 0;
807}
808
06c17c39
LP
809static int mount_tmpfs(const char *dest) {
810 char **i, **o;
811
812 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813 _cleanup_free_ char *where = NULL;
79d80fc1 814 int r;
06c17c39
LP
815
816 where = strappend(dest, *i);
817 if (!where)
818 return log_oom();
819
79d80fc1
TG
820 r = mkdir_label(where, 0755);
821 if (r < 0) {
822 log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
823
824 return r;
825 }
06c17c39
LP
826
827 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
828 log_error("tmpfs mount to %s failed: %m", where);
829 return -errno;
830 }
831 }
832
833 return 0;
834}
835
e58a1277 836static int setup_timezone(const char *dest) {
d4036145
LP
837 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
838 char *z, *y;
839 int r;
f8440af5 840
e58a1277
LP
841 assert(dest);
842
843 /* Fix the timezone, if possible */
d4036145
LP
844 r = readlink_malloc("/etc/localtime", &p);
845 if (r < 0) {
846 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
847 return 0;
848 }
849
850 z = path_startswith(p, "../usr/share/zoneinfo/");
851 if (!z)
852 z = path_startswith(p, "/usr/share/zoneinfo/");
853 if (!z) {
854 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
855 return 0;
856 }
857
04bc4a3f
LP
858 where = strappend(dest, "/etc/localtime");
859 if (!where)
0d0f0c50 860 return log_oom();
715ac17a 861
d4036145
LP
862 r = readlink_malloc(where, &q);
863 if (r >= 0) {
864 y = path_startswith(q, "../usr/share/zoneinfo/");
865 if (!y)
866 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 867
d4036145
LP
868 /* Already pointing to the right place? Then do nothing .. */
869 if (y && streq(y, z))
870 return 0;
871 }
872
873 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
874 if (!check)
0d0f0c50 875 return log_oom();
4d1c38b8 876
d4036145
LP
877 if (access(check, F_OK) < 0) {
878 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
879 return 0;
880 }
68fb0892 881
d4036145
LP
882 what = strappend("../usr/share/zoneinfo/", z);
883 if (!what)
884 return log_oom();
885
79d80fc1
TG
886 r = mkdir_parents(where, 0755);
887 if (r < 0) {
888 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
889
890 return 0;
891 }
892
893 r = unlink(where);
894 if (r < 0 && errno != ENOENT) {
895 log_error("Failed to remove existing timezone info %s in container: %m", where);
896
897 return 0;
898 }
4d9f07b4 899
d4036145
LP
900 if (symlink(what, where) < 0) {
901 log_error("Failed to correct timezone of container: %m");
902 return 0;
903 }
e58a1277
LP
904
905 return 0;
88213476
LP
906}
907
2547bb41 908static int setup_resolv_conf(const char *dest) {
c8b32e11 909 _cleanup_free_ char *where = NULL;
79d80fc1 910 int r;
2547bb41
LP
911
912 assert(dest);
913
914 if (arg_private_network)
915 return 0;
916
917 /* Fix resolv.conf, if possible */
04bc4a3f
LP
918 where = strappend(dest, "/etc/resolv.conf");
919 if (!where)
0d0f0c50 920 return log_oom();
2547bb41 921
77e63faf
LP
922 /* We don't really care for the results of this really. If it
923 * fails, it fails, but meh... */
79d80fc1
TG
924 r = mkdir_parents(where, 0755);
925 if (r < 0) {
926 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
927
928 return 0;
929 }
930
931 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
932 if (r < 0) {
933 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
934
935 return 0;
936 }
2547bb41
LP
937
938 return 0;
939}
940
4d9f07b4
LP
941static int setup_volatile_state(const char *directory) {
942 const char *p;
943 int r;
944
945 assert(directory);
946
947 if (arg_volatile != VOLATILE_STATE)
948 return 0;
949
950 /* --volatile=state means we simply overmount /var
951 with a tmpfs, and the rest read-only. */
952
953 r = bind_remount_recursive(directory, true);
954 if (r < 0) {
955 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
956 return r;
957 }
958
959 p = strappenda(directory, "/var");
79d80fc1
TG
960 r = mkdir(p, 0755);
961 if (r < 0 && errno != EEXIST) {
962 log_error("Failed to create %s: %m", directory);
963 return -errno;
964 }
4d9f07b4
LP
965
966 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
967 log_error("Failed to mount tmpfs to /var: %m");
968 return -errno;
969 }
970
971 return 0;
972}
973
974static int setup_volatile(const char *directory) {
975 bool tmpfs_mounted = false, bind_mounted = false;
976 char template[] = "/tmp/nspawn-volatile-XXXXXX";
977 const char *f, *t;
978 int r;
979
980 assert(directory);
981
982 if (arg_volatile != VOLATILE_YES)
983 return 0;
984
985 /* --volatile=yes means we mount a tmpfs to the root dir, and
986 the original /usr to use inside it, and that read-only. */
987
988 if (!mkdtemp(template)) {
989 log_error("Failed to create temporary directory: %m");
990 return -errno;
991 }
992
993 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
994 log_error("Failed to mount tmpfs for root directory: %m");
995 r = -errno;
996 goto fail;
997 }
998
999 tmpfs_mounted = true;
1000
1001 f = strappenda(directory, "/usr");
1002 t = strappenda(template, "/usr");
1003
79d80fc1
TG
1004 r = mkdir(t, 0755);
1005 if (r < 0 && errno != EEXIST) {
1006 log_error("Failed to create %s: %m", t);
1007 r = -errno;
1008 goto fail;
1009 }
1010
4d9f07b4
LP
1011 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1012 log_error("Failed to create /usr bind mount: %m");
1013 r = -errno;
1014 goto fail;
1015 }
1016
1017 bind_mounted = true;
1018
1019 r = bind_remount_recursive(t, true);
1020 if (r < 0) {
1021 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1022 goto fail;
1023 }
1024
1025 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1026 log_error("Failed to move root mount: %m");
1027 r = -errno;
1028 goto fail;
1029 }
1030
1031 rmdir(template);
1032
1033 return 0;
1034
1035fail:
1036 if (bind_mounted)
1037 umount(t);
1038 if (tmpfs_mounted)
1039 umount(template);
1040 rmdir(template);
1041 return r;
1042}
1043
9f24adc2
LP
1044static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1045
1046 snprintf(s, 37,
1047 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1048 SD_ID128_FORMAT_VAL(id));
1049
1050 return s;
1051}
1052
04bc4a3f 1053static int setup_boot_id(const char *dest) {
7fd1b19b 1054 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1055 sd_id128_t rnd = {};
04bc4a3f
LP
1056 char as_uuid[37];
1057 int r;
1058
1059 assert(dest);
1060
eb91eb18
LP
1061 if (arg_share_system)
1062 return 0;
1063
04bc4a3f
LP
1064 /* Generate a new randomized boot ID, so that each boot-up of
1065 * the container gets a new one */
1066
1067 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1068 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1069 if (!from || !to)
1070 return log_oom();
04bc4a3f
LP
1071
1072 r = sd_id128_randomize(&rnd);
1073 if (r < 0) {
1074 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 1075 return r;
04bc4a3f
LP
1076 }
1077
9f24adc2 1078 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1079
574d5f2d 1080 r = write_string_file(from, as_uuid);
04bc4a3f
LP
1081 if (r < 0) {
1082 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 1083 return r;
04bc4a3f
LP
1084 }
1085
1086 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1087 log_error("Failed to bind mount boot id: %m");
1088 r = -errno;
10d18763
ZJS
1089 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1090 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
1091
1092 unlink(from);
04bc4a3f
LP
1093 return r;
1094}
1095
e58a1277 1096static int copy_devnodes(const char *dest) {
88213476
LP
1097
1098 static const char devnodes[] =
1099 "null\0"
1100 "zero\0"
1101 "full\0"
1102 "random\0"
1103 "urandom\0"
85614d66
TG
1104 "tty\0"
1105 "net/tun\0";
88213476
LP
1106
1107 const char *d;
e58a1277 1108 int r = 0;
7fd1b19b 1109 _cleanup_umask_ mode_t u;
a258bf26
LP
1110
1111 assert(dest);
124640f1
LP
1112
1113 u = umask(0000);
88213476
LP
1114
1115 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1116 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1117 struct stat st;
88213476 1118
7f112f50
LP
1119 from = strappend("/dev/", d);
1120 to = strjoin(dest, "/dev/", d, NULL);
1121 if (!from || !to)
1122 return log_oom();
88213476
LP
1123
1124 if (stat(from, &st) < 0) {
1125
1126 if (errno != ENOENT) {
1127 log_error("Failed to stat %s: %m", from);
7f112f50 1128 return -errno;
88213476
LP
1129 }
1130
a258bf26 1131 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1132
ed8b7a3e 1133 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1134 return -EIO;
a258bf26 1135
85614d66
TG
1136 } else {
1137 r = mkdir_parents(to, 0775);
1138 if (r < 0) {
1139 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1140 return -r;
1141 }
a258bf26 1142
85614d66
TG
1143 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144 log_error("mknod(%s) failed: %m", dest);
1145 return -errno;
1146 }
88213476 1147 }
88213476
LP
1148 }
1149
e58a1277
LP
1150 return r;
1151}
88213476 1152
f2d88580
LP
1153static int setup_ptmx(const char *dest) {
1154 _cleanup_free_ char *p = NULL;
1155
1156 p = strappend(dest, "/dev/ptmx");
1157 if (!p)
1158 return log_oom();
1159
1160 if (symlink("pts/ptmx", p) < 0) {
1161 log_error("Failed to create /dev/ptmx symlink: %m");
1162 return -errno;
1163 }
1164
1165 return 0;
1166}
1167
e58a1277 1168static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1169 _cleanup_umask_ mode_t u;
1170 const char *to;
e58a1277 1171 struct stat st;
e58a1277 1172 int r;
e58a1277
LP
1173
1174 assert(dest);
1175 assert(console);
1176
1177 u = umask(0000);
1178
eb0f0863
LP
1179 if (stat("/dev/null", &st) < 0) {
1180 log_error("Failed to stat /dev/null: %m");
25ea79fe 1181 return -errno;
e58a1277 1182 }
88213476 1183
e58a1277
LP
1184 r = chmod_and_chown(console, 0600, 0, 0);
1185 if (r < 0) {
1186 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 1187 return r;
a258bf26 1188 }
88213476 1189
a258bf26
LP
1190 /* We need to bind mount the right tty to /dev/console since
1191 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1192 * to bind mount things on we create a device node first, and
1193 * use /dev/null for that since we the cgroups device policy
1194 * allows us to create that freely, while we cannot create
1195 * /dev/console. (Note that the major minor doesn't actually
1196 * matter here, since we mount it over anyway). */
a258bf26 1197
eb0f0863 1198 to = strappenda(dest, "/dev/console");
e58a1277
LP
1199 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1200 log_error("mknod() for /dev/console failed: %m");
25ea79fe 1201 return -errno;
e58a1277 1202 }
a258bf26
LP
1203
1204 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 1205 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 1206 return -errno;
a258bf26
LP
1207 }
1208
25ea79fe 1209 return 0;
e58a1277
LP
1210}
1211
1212static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1213 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1214 int r, fd, k;
7fd1b19b 1215 _cleanup_umask_ mode_t u;
e58a1277
LP
1216 union {
1217 struct cmsghdr cmsghdr;
1218 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1219 } control = {};
1220 struct msghdr mh = {
1221 .msg_control = &control,
1222 .msg_controllen = sizeof(control),
1223 };
e58a1277
LP
1224 struct cmsghdr *cmsg;
1225
1226 assert(dest);
1227 assert(kmsg_socket >= 0);
a258bf26 1228
e58a1277 1229 u = umask(0000);
a258bf26 1230
f1e5dfe2
LP
1231 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1232 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1233 * on the reading side behave very similar to /proc/kmsg,
1234 * their writing side behaves differently from /dev/kmsg in
1235 * that writing blocks when nothing is reading. In order to
1236 * avoid any problems with containers deadlocking due to this
1237 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1238 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1239 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1240 return log_oom();
e58a1277
LP
1241
1242 if (mkfifo(from, 0600) < 0) {
1243 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 1244 return -errno;
e58a1277
LP
1245 }
1246
1247 r = chmod_and_chown(from, 0600, 0, 0);
1248 if (r < 0) {
1249 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 1250 return r;
e58a1277
LP
1251 }
1252
1253 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1254 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 1255 return -errno;
e58a1277
LP
1256 }
1257
1258 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1259 if (fd < 0) {
1260 log_error("Failed to open fifo: %m");
25ea79fe 1261 return -errno;
e58a1277
LP
1262 }
1263
e58a1277
LP
1264 cmsg = CMSG_FIRSTHDR(&mh);
1265 cmsg->cmsg_level = SOL_SOCKET;
1266 cmsg->cmsg_type = SCM_RIGHTS;
1267 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1268 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1269
1270 mh.msg_controllen = cmsg->cmsg_len;
1271
1272 /* Store away the fd in the socket, so that it stays open as
1273 * long as we run the child */
1274 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1275 safe_close(fd);
e58a1277
LP
1276
1277 if (k < 0) {
1278 log_error("Failed to send FIFO fd: %m");
25ea79fe 1279 return -errno;
a258bf26
LP
1280 }
1281
f1e5dfe2
LP
1282 /* And now make the FIFO unavailable as /dev/kmsg... */
1283 unlink(from);
25ea79fe 1284 return 0;
88213476
LP
1285}
1286
3a74cea5 1287static int setup_hostname(void) {
3a74cea5 1288
eb91eb18
LP
1289 if (arg_share_system)
1290 return 0;
1291
7027ff61
LP
1292 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1293 return -errno;
3a74cea5 1294
7027ff61 1295 return 0;
3a74cea5
LP
1296}
1297
57fb9fb5 1298static int setup_journal(const char *directory) {
4d680aee 1299 sd_id128_t machine_id, this_id;
7fd1b19b 1300 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1301 char *id;
57fb9fb5
LP
1302 int r;
1303
57fb9fb5 1304 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1305 if (!p)
1306 return log_oom();
57fb9fb5
LP
1307
1308 r = read_one_line_file(p, &b);
27407a01
ZJS
1309 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1310 return 0;
1311 else if (r < 0) {
1312 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1313 return r;
1314 }
1315
27407a01
ZJS
1316 id = strstrip(b);
1317 if (isempty(id) && arg_link_journal == LINK_AUTO)
1318 return 0;
57fb9fb5 1319
27407a01
ZJS
1320 /* Verify validity */
1321 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1322 if (r < 0) {
27407a01
ZJS
1323 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1324 return r;
57fb9fb5
LP
1325 }
1326
4d680aee
ZJS
1327 r = sd_id128_get_machine(&this_id);
1328 if (r < 0) {
1329 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1330 return r;
1331 }
1332
1333 if (sd_id128_equal(machine_id, this_id)) {
1334 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335 "Host and machine ids are equal (%s): refusing to link journals", id);
1336 if (arg_link_journal == LINK_AUTO)
1337 return 0;
1338 return
1339 -EEXIST;
1340 }
1341
1342 if (arg_link_journal == LINK_NO)
1343 return 0;
1344
57fb9fb5 1345 free(p);
27407a01
ZJS
1346 p = strappend("/var/log/journal/", id);
1347 q = strjoin(directory, "/var/log/journal/", id, NULL);
1348 if (!p || !q)
1349 return log_oom();
1350
1351 if (path_is_mount_point(p, false) > 0) {
1352 if (arg_link_journal != LINK_AUTO) {
1353 log_error("%s: already a mount point, refusing to use for journal", p);
1354 return -EEXIST;
1355 }
1356
1357 return 0;
57fb9fb5
LP
1358 }
1359
27407a01 1360 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1361 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1362 log_error("%s: already a mount point, refusing to use for journal", q);
1363 return -EEXIST;
57fb9fb5
LP
1364 }
1365
27407a01 1366 return 0;
57fb9fb5
LP
1367 }
1368
1369 r = readlink_and_make_absolute(p, &d);
1370 if (r >= 0) {
1371 if ((arg_link_journal == LINK_GUEST ||
1372 arg_link_journal == LINK_AUTO) &&
1373 path_equal(d, q)) {
1374
27407a01
ZJS
1375 r = mkdir_p(q, 0755);
1376 if (r < 0)
79d80fc1 1377 log_warning("Failed to create directory %s: %m", q);
27407a01 1378 return 0;
57fb9fb5
LP
1379 }
1380
1381 if (unlink(p) < 0) {
1382 log_error("Failed to remove symlink %s: %m", p);
27407a01 1383 return -errno;
57fb9fb5
LP
1384 }
1385 } else if (r == -EINVAL) {
1386
1387 if (arg_link_journal == LINK_GUEST &&
1388 rmdir(p) < 0) {
1389
27407a01
ZJS
1390 if (errno == ENOTDIR) {
1391 log_error("%s already exists and is neither a symlink nor a directory", p);
1392 return r;
1393 } else {
57fb9fb5 1394 log_error("Failed to remove %s: %m", p);
27407a01 1395 return -errno;
57fb9fb5 1396 }
57fb9fb5
LP
1397 }
1398 } else if (r != -ENOENT) {
1399 log_error("readlink(%s) failed: %m", p);
27407a01 1400 return r;
57fb9fb5
LP
1401 }
1402
1403 if (arg_link_journal == LINK_GUEST) {
1404
1405 if (symlink(q, p) < 0) {
1406 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1407 return -errno;
57fb9fb5
LP
1408 }
1409
27407a01
ZJS
1410 r = mkdir_p(q, 0755);
1411 if (r < 0)
79d80fc1 1412 log_warning("Failed to create directory %s: %m", q);
27407a01 1413 return 0;
57fb9fb5
LP
1414 }
1415
1416 if (arg_link_journal == LINK_HOST) {
1417 r = mkdir_p(p, 0755);
1418 if (r < 0) {
1419 log_error("Failed to create %s: %m", p);
27407a01 1420 return r;
57fb9fb5
LP
1421 }
1422
27407a01
ZJS
1423 } else if (access(p, F_OK) < 0)
1424 return 0;
57fb9fb5 1425
cdb2b9d0
LP
1426 if (dir_is_empty(q) == 0)
1427 log_warning("%s is not empty, proceeding anyway.", q);
1428
57fb9fb5
LP
1429 r = mkdir_p(q, 0755);
1430 if (r < 0) {
1431 log_error("Failed to create %s: %m", q);
27407a01 1432 return r;
57fb9fb5
LP
1433 }
1434
1435 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1436 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1437 return -errno;
57fb9fb5
LP
1438 }
1439
27407a01 1440 return 0;
57fb9fb5
LP
1441}
1442
9bd37b40
LP
1443static int setup_kdbus(const char *dest, const char *path) {
1444 const char *p;
1445
1446 if (!path)
1447 return 0;
1448
1449 p = strappenda(dest, "/dev/kdbus");
1450 if (mkdir(p, 0755) < 0) {
1451 log_error("Failed to create kdbus path: %m");
1452 return -errno;
1453 }
1454
1455 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1456 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1457 return -errno;
1458 }
1459
1460 return 0;
1461}
1462
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result unchanged, i.e.
 * presumably every capability not listed in arg_retain is dropped from the
 * bounding set. */
88213476 1463 static int drop_capabilities(void) {
5076f0cc 1464 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1465 }
1466
5aa4bb6b 1467static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1468 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1469 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1470 int r;
1471
eb91eb18
LP
1472 if (!arg_register)
1473 return 0;
1474
1c03020c 1475 r = sd_bus_default_system(&bus);
9444b1f2
LP
1476 if (r < 0) {
1477 log_error("Failed to open system bus: %s", strerror(-r));
1478 return r;
1479 }
1480
89f7c846
LP
1481 if (arg_keep_unit) {
1482 r = sd_bus_call_method(
1483 bus,
1484 "org.freedesktop.machine1",
1485 "/org/freedesktop/machine1",
1486 "org.freedesktop.machine1.Manager",
5aa4bb6b 1487 "RegisterMachineWithNetwork",
89f7c846
LP
1488 &error,
1489 NULL,
5aa4bb6b 1490 "sayssusai",
89f7c846
LP
1491 arg_machine,
1492 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1493 "nspawn",
1494 "container",
1495 (uint32_t) pid,
5aa4bb6b
LP
1496 strempty(arg_directory),
1497 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1498 } else {
9457ac5b
LP
1499 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1500
1501 r = sd_bus_message_new_method_call(
89f7c846 1502 bus,
9457ac5b 1503 &m,
89f7c846
LP
1504 "org.freedesktop.machine1",
1505 "/org/freedesktop/machine1",
1506 "org.freedesktop.machine1.Manager",
5aa4bb6b 1507 "CreateMachineWithNetwork");
9457ac5b
LP
1508 if (r < 0) {
1509 log_error("Failed to create message: %s", strerror(-r));
1510 return r;
1511 }
1512
1513 r = sd_bus_message_append(
1514 m,
5aa4bb6b 1515 "sayssusai",
89f7c846
LP
1516 arg_machine,
1517 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1518 "nspawn",
1519 "container",
1520 (uint32_t) pid,
5aa4bb6b
LP
1521 strempty(arg_directory),
1522 local_ifindex > 0 ? 1 : 0, local_ifindex);
9457ac5b
LP
1523 if (r < 0) {
1524 log_error("Failed to append message arguments: %s", strerror(-r));
1525 return r;
1526 }
1527
1528 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529 if (r < 0) {
1530 log_error("Failed to open container: %s", strerror(-r));
1531 return r;
1532 }
1533
1534 if (!isempty(arg_slice)) {
1535 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1536 if (r < 0) {
1537 log_error("Failed to append slice: %s", strerror(-r));
1538 return r;
1539 }
1540 }
1541
1542 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1543 if (r < 0) {
1544 log_error("Failed to add device policy: %s", strerror(-r));
1545 return r;
1546 }
1547
317cde8b 1548 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 11,
9457ac5b
LP
1549 /* Allow the container to
1550 * access and create the API
1551 * device nodes, so that
1552 * PrivateDevices= in the
1553 * container can work
1554 * fine */
1555 "/dev/null", "rwm",
1556 "/dev/zero", "rwm",
1557 "/dev/full", "rwm",
1558 "/dev/random", "rwm",
1559 "/dev/urandom", "rwm",
1560 "/dev/tty", "rwm",
864e1706 1561 "/dev/net/tun", "rwm",
9457ac5b
LP
1562 /* Allow the container
1563 * access to ptys. However,
1564 * do not permit the
1565 * container to ever create
1566 * these device nodes. */
1567 "/dev/pts/ptmx", "rw",
a07f961e
LP
1568 "char-pts", "rw",
1569 /* Allow the container
1570 * access to all kdbus
1571 * devices. Again, the
1572 * container cannot create
1573 * these nodes, only use
1574 * them. We use a pretty
1575 * open match here, so that
1576 * the kernel API can still
1577 * change. */
1578 "char-kdbus", "rw",
1579 "char-kdbus/*", "rw");
9457ac5b
LP
1580 if (r < 0) {
1581 log_error("Failed to add device whitelist: %s", strerror(-r));
1582 return r;
1583 }
1584
1585 r = sd_bus_message_close_container(m);
1586 if (r < 0) {
1587 log_error("Failed to close container: %s", strerror(-r));
1588 return r;
1589 }
1590
1591 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1592 }
1593
9444b1f2 1594 if (r < 0) {
1f0cd86b
LP
1595 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1596 return r;
1597 }
1598
1599 return 0;
1600}
1601
1602static int terminate_machine(pid_t pid) {
1603 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1604 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1605 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1606 const char *path;
1607 int r;
1608
eb91eb18
LP
1609 if (!arg_register)
1610 return 0;
1611
76b54375 1612 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1613 if (r < 0) {
1614 log_error("Failed to open system bus: %s", strerror(-r));
1615 return r;
1616 }
1617
1618 r = sd_bus_call_method(
1619 bus,
1620 "org.freedesktop.machine1",
1621 "/org/freedesktop/machine1",
1622 "org.freedesktop.machine1.Manager",
1623 "GetMachineByPID",
1624 &error,
1625 &reply,
1626 "u",
1627 (uint32_t) pid);
1628 if (r < 0) {
1629 /* Note that the machine might already have been
1630 * cleaned up automatically, hence don't consider it a
1631 * failure if we cannot get the machine object. */
1632 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1633 return 0;
1634 }
1635
1636 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1637 if (r < 0)
1638 return bus_log_parse_error(r);
9444b1f2 1639
1f0cd86b
LP
1640 r = sd_bus_call_method(
1641 bus,
1642 "org.freedesktop.machine1",
1643 path,
1644 "org.freedesktop.machine1.Machine",
1645 "Terminate",
1646 &error,
1647 NULL,
1648 NULL);
1649 if (r < 0) {
1650 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1651 return 0;
1652 }
1653
9444b1f2
LP
1654 return 0;
1655}
1656
db999e0f
LP
1657static int reset_audit_loginuid(void) {
1658 _cleanup_free_ char *p = NULL;
1659 int r;
1660
1661 if (arg_share_system)
1662 return 0;
1663
1664 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1665 if (r == -ENOENT)
db999e0f
LP
1666 return 0;
1667 if (r < 0) {
1668 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1669 return r;
1670 }
1671
1672 /* Already reset? */
1673 if (streq(p, "4294967295"))
1674 return 0;
1675
1676 r = write_string_file("/proc/self/loginuid", "4294967295");
1677 if (r < 0) {
1678 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1679 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1680 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1681 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1682 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1683
db999e0f 1684 sleep(5);
77b6e194 1685 }
db999e0f
LP
1686
1687 return 0;
77b6e194
LP
1688}
1689
4f758c23
LP
/* Fixed 128-bit keys handed to generate_mac() as siphash key when deriving
 * the MAC address of the host side and of the container side of the veth
 * link, respectively. Two distinct keys are used so that the two ends get
 * different addresses from the same input. */
1690#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1691#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
01dde061 1692
4f758c23 1693static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
01dde061
TG
1694 int r;
1695
1696 uint8_t result[8];
1697 size_t l, sz;
1698 uint8_t *v;
1699
1700 l = strlen(arg_machine);
1701 sz = sizeof(sd_id128_t) + l;
1702 v = alloca(sz);
1703
1704 /* fetch some persistent data unique to the host */
1705 r = sd_id128_get_machine((sd_id128_t*) v);
1706 if (r < 0)
1707 return r;
1708
1709 /* combine with some data unique (on this host) to this
1710 * container instance */
1711 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1712
1713 /* Let's hash the host machine ID plus the container name. We
1714 * use a fixed, but originally randomly created hash key here. */
4f758c23 1715 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
1716
1717 assert_cc(ETH_ALEN <= sizeof(result));
1718 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1719
1720 /* see eth_random_addr in the kernel */
1721 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1722 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1723
1724 return 0;
1725}
1726
5aa4bb6b 1727static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1728 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 1730 struct ether_addr mac_host, mac_container;
5aa4bb6b 1731 int r, i;
69c79d3c
LP
1732
1733 if (!arg_private_network)
1734 return 0;
1735
1736 if (!arg_network_veth)
1737 return 0;
1738
08af0da2
LP
1739 /* Use two different interface name prefixes depending whether
1740 * we are in bridge mode or not. */
c00524c9 1741 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 1742 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1743
4f758c23 1744 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
01dde061 1745 if (r < 0) {
4f758c23
LP
1746 log_error("Failed to generate predictable MAC address for container side");
1747 return r;
1748 }
1749
1750 r = generate_mac(&mac_host, HOST_HASH_KEY);
1751 if (r < 0) {
1752 log_error("Failed to generate predictable MAC address for host side");
01dde061
TG
1753 return r;
1754 }
1755
151b9b96 1756 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1757 if (r < 0) {
1758 log_error("Failed to connect to netlink: %s", strerror(-r));
1759 return r;
1760 }
1761
151b9b96 1762 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1763 if (r < 0) {
1764 log_error("Failed to allocate netlink message: %s", strerror(-r));
1765 return r;
1766 }
1767
ab046dde 1768 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1769 if (r < 0) {
ab046dde 1770 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1771 return r;
1772 }
1773
4f758c23
LP
1774 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1775 if (r < 0) {
1776 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1777 return r;
1778 }
1779
ee3a6a51 1780 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1781 if (r < 0) {
1782 log_error("Failed to open netlink container: %s", strerror(-r));
1783 return r;
1784 }
1785
d8e538ec 1786 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
69c79d3c
LP
1787 if (r < 0) {
1788 log_error("Failed to open netlink container: %s", strerror(-r));
1789 return r;
1790 }
1791
ee3a6a51 1792 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1793 if (r < 0) {
ab046dde 1794 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1795 return r;
1796 }
1797
ab046dde 1798 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1799 if (r < 0) {
ab046dde 1800 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1801 return r;
1802 }
01dde061 1803
4f758c23 1804 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
01dde061
TG
1805 if (r < 0) {
1806 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1807 return r;
1808 }
69c79d3c 1809
ab046dde 1810 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1811 if (r < 0) {
1812 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1813 return r;
1814 }
1815
1816 r = sd_rtnl_message_close_container(m);
1817 if (r < 0) {
1818 log_error("Failed to close netlink container: %s", strerror(-r));
1819 return r;
1820 }
1821
1822 r = sd_rtnl_message_close_container(m);
1823 if (r < 0) {
1824 log_error("Failed to close netlink container: %s", strerror(-r));
1825 return r;
1826 }
1827
1828 r = sd_rtnl_message_close_container(m);
1829 if (r < 0) {
1830 log_error("Failed to close netlink container: %s", strerror(-r));
1831 return r;
1832 }
1833
1834 r = sd_rtnl_call(rtnl, m, 0, NULL);
1835 if (r < 0) {
1836 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1837 return r;
1838 }
1839
5aa4bb6b
LP
1840 i = (int) if_nametoindex(iface_name);
1841 if (i <= 0) {
1842 log_error("Failed to resolve interface %s: %m", iface_name);
1843 return -errno;
1844 }
1845
1846 *ifi = i;
1847
69c79d3c
LP
1848 return 0;
1849}
1850
5aa4bb6b 1851static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1852 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1853 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1854 int r, bridge;
1855
1856 if (!arg_private_network)
1857 return 0;
1858
1859 if (!arg_network_veth)
1860 return 0;
1861
1862 if (!arg_network_bridge)
1863 return 0;
1864
1865 bridge = (int) if_nametoindex(arg_network_bridge);
1866 if (bridge <= 0) {
1867 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1868 return -errno;
1869 }
1870
5aa4bb6b
LP
1871 *ifi = bridge;
1872
151b9b96 1873 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1874 if (r < 0) {
1875 log_error("Failed to connect to netlink: %s", strerror(-r));
1876 return r;
1877 }
1878
151b9b96 1879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1880 if (r < 0) {
1881 log_error("Failed to allocate netlink message: %s", strerror(-r));
1882 return r;
1883 }
1884
039dd4af
TG
1885 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1886 if (r < 0) {
1887 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1888 return r;
1889 }
1890
ab046dde
TG
1891 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1892 if (r < 0) {
1893 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1894 return r;
1895 }
1896
1897 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1898 if (r < 0) {
1899 log_error("Failed to add netlink master field: %s", strerror(-r));
1900 return r;
1901 }
1902
1903 r = sd_rtnl_call(rtnl, m, 0, NULL);
1904 if (r < 0) {
1905 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1906 return r;
1907 }
1908
1909 return 0;
1910}
1911
c74e630d
LP
1912static int parse_interface(struct udev *udev, const char *name) {
1913 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1914 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1915 int ifi;
1916
1917 ifi = (int) if_nametoindex(name);
1918 if (ifi <= 0) {
1919 log_error("Failed to resolve interface %s: %m", name);
1920 return -errno;
1921 }
1922
1923 sprintf(ifi_str, "n%i", ifi);
1924 d = udev_device_new_from_device_id(udev, ifi_str);
1925 if (!d) {
1926 log_error("Failed to get udev device for interface %s: %m", name);
1927 return -errno;
1928 }
1929
1930 if (udev_device_get_is_initialized(d) <= 0) {
1931 log_error("Network interface %s is not initialized yet.", name);
1932 return -EBUSY;
1933 }
1934
1935 return ifi;
1936}
1937
69c79d3c 1938static int move_network_interfaces(pid_t pid) {
7e227024 1939 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1940 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1941 char **i;
1942 int r;
1943
1944 if (!arg_private_network)
1945 return 0;
1946
1947 if (strv_isempty(arg_network_interfaces))
1948 return 0;
1949
151b9b96 1950 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1951 if (r < 0) {
1952 log_error("Failed to connect to netlink: %s", strerror(-r));
1953 return r;
1954 }
1955
7e227024
LP
1956 udev = udev_new();
1957 if (!udev) {
1958 log_error("Failed to connect to udev.");
1959 return -ENOMEM;
1960 }
1961
aa28aefe 1962 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1963 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1964 int ifi;
aa28aefe 1965
c74e630d
LP
1966 ifi = parse_interface(udev, *i);
1967 if (ifi < 0)
1968 return ifi;
1969
3125b3ef 1970 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
c74e630d
LP
1971 if (r < 0) {
1972 log_error("Failed to allocate netlink message: %s", strerror(-r));
1973 return r;
aa28aefe
LP
1974 }
1975
c74e630d
LP
1976 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1977 if (r < 0) {
1978 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1979 return r;
7e227024
LP
1980 }
1981
c74e630d
LP
1982 r = sd_rtnl_call(rtnl, m, 0, NULL);
1983 if (r < 0) {
1984 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1985 return r;
7e227024 1986 }
c74e630d 1987 }
7e227024 1988
c74e630d
LP
1989 return 0;
1990}
1991
1992static int setup_macvlan(pid_t pid) {
1993 _cleanup_udev_unref_ struct udev *udev = NULL;
1994 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1995 char **i;
1996 int r;
1997
1998 if (!arg_private_network)
1999 return 0;
2000
2001 if (strv_isempty(arg_network_macvlan))
2002 return 0;
2003
2004 r = sd_rtnl_open(&rtnl, 0);
2005 if (r < 0) {
2006 log_error("Failed to connect to netlink: %s", strerror(-r));
2007 return r;
2008 }
2009
2010 udev = udev_new();
2011 if (!udev) {
2012 log_error("Failed to connect to udev.");
2013 return -ENOMEM;
2014 }
2015
2016 STRV_FOREACH(i, arg_network_macvlan) {
2017 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2018 _cleanup_free_ char *n = NULL;
2019 int ifi;
2020
2021 ifi = parse_interface(udev, *i);
2022 if (ifi < 0)
2023 return ifi;
2024
2025 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
2026 if (r < 0) {
2027 log_error("Failed to allocate netlink message: %s", strerror(-r));
2028 return r;
2029 }
2030
c74e630d
LP
2031 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2032 if (r < 0) {
2033 log_error("Failed to add netlink interface index: %s", strerror(-r));
2034 return r;
2035 }
2036
2037 n = strappend("mv-", *i);
2038 if (!n)
2039 return log_oom();
2040
2041 strshorten(n, IFNAMSIZ-1);
2042
2043 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2044 if (r < 0) {
2045 log_error("Failed to add netlink interface name: %s", strerror(-r));
2046 return r;
2047 }
2048
aa28aefe
LP
2049 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2050 if (r < 0) {
c74e630d
LP
2051 log_error("Failed to add netlink namespace field: %s", strerror(-r));
2052 return r;
2053 }
2054
2055 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2056 if (r < 0) {
2057 log_error("Failed to open netlink container: %s", strerror(-r));
2058 return r;
2059 }
2060
d8e538ec 2061 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
c74e630d
LP
2062 if (r < 0) {
2063 log_error("Failed to open netlink container: %s", strerror(-r));
2064 return r;
2065 }
2066
2067 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2068 if (r < 0) {
2069 log_error("Failed to append macvlan mode: %s", strerror(-r));
2070 return r;
2071 }
2072
2073 r = sd_rtnl_message_close_container(m);
2074 if (r < 0) {
2075 log_error("Failed to close netlink container: %s", strerror(-r));
2076 return r;
2077 }
2078
2079 r = sd_rtnl_message_close_container(m);
2080 if (r < 0) {
2081 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
2082 return r;
2083 }
2084
2085 r = sd_rtnl_call(rtnl, m, 0, NULL);
2086 if (r < 0) {
c74e630d 2087 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
2088 return r;
2089 }
2090 }
2091
2092 return 0;
2093}
2094
/* Installs a seccomp filter that allows everything by default, except that
 *
 *  - the system calls on the blacklist below fail with EPERM, and
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. Otherwise returns the
 * result of the last libseccomp call, negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);

                /* -EFAULT indicates an unknown syscall, which is skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path once it exists */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2174
1b9e5b12
LP
2175static int setup_image(char **device_path, int *loop_nr) {
2176 struct loop_info64 info = {
2177 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2178 };
2179 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2180 _cleanup_free_ char* loopdev = NULL;
2181 struct stat st;
2182 int r, nr;
2183
2184 assert(device_path);
2185 assert(loop_nr);
2186
2187 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2188 if (fd < 0) {
2189 log_error("Failed to open %s: %m", arg_image);
2190 return -errno;
2191 }
2192
2193 if (fstat(fd, &st) < 0) {
2194 log_error("Failed to stat %s: %m", arg_image);
2195 return -errno;
2196 }
2197
2198 if (S_ISBLK(st.st_mode)) {
2199 char *p;
2200
2201 p = strdup(arg_image);
2202 if (!p)
2203 return log_oom();
2204
2205 *device_path = p;
2206
2207 *loop_nr = -1;
2208
2209 r = fd;
2210 fd = -1;
2211
2212 return r;
2213 }
2214
2215 if (!S_ISREG(st.st_mode)) {
2216 log_error("%s is not a regular file or block device: %m", arg_image);
2217 return -EINVAL;
2218 }
2219
2220 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2221 if (control < 0) {
2222 log_error("Failed to open /dev/loop-control: %m");
2223 return -errno;
2224 }
2225
2226 nr = ioctl(control, LOOP_CTL_GET_FREE);
2227 if (nr < 0) {
2228 log_error("Failed to allocate loop device: %m");
2229 return -errno;
2230 }
2231
2232 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2233 return log_oom();
2234
2235 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2236 if (loop < 0) {
2237 log_error("Failed to open loop device %s: %m", loopdev);
2238 return -errno;
2239 }
2240
2241 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2242 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2243 return -errno;
2244 }
2245
2246 if (arg_read_only)
2247 info.lo_flags |= LO_FLAGS_READ_ONLY;
2248
2249 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2250 log_error("Failed to set loopback settings on %s: %m", loopdev);
2251 return -errno;
2252 }
2253
2254 *device_path = loopdev;
2255 loopdev = NULL;
2256
2257 *loop_nr = nr;
2258
2259 r = loop;
2260 loop = -1;
2261
2262 return r;
2263}
2264
#ifdef HAVE_BLKID
/* Records 'node' as the device for one partition role, provided that no
 * candidate has been recorded yet or that partition number 'nr' is lower
 * than the one recorded so far (i.e. the lowest-numbered partition wins).
 *
 * Returns 0 on success, including when the candidate is ignored, and -ENOMEM
 * if the node path cannot be duplicated; the previous pick is then kept. */
static int dissect_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return -ENOMEM;

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Looks for root, /home and /srv partitions in the GPT partition table of
 * the block device open as 'fd', matching them by partition type UUID.
 *
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored. For each role the
 * partition with the lowest partition number is used. The native root type
 * is preferred over the secondary one; *secondary tells which was chosen.
 *
 * On success returns 0 and stores newly allocated device node paths in
 * *root_device and, if found, *home_device and *srv_device, together with
 * whether each may be mounted writable (GPT_FLAG_READ_ONLY not set).
 * *home_device/*srv_device and their flags are left untouched when no such
 * partition exists. On failure a negative errno-style value is returned;
 * -EINVAL if there is no GPT or no root partition, -ENOTSUP when built
 * without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(root_device_rw);
        assert(home_device);
        assert(home_device_rw);
        assert(srv_device);
        assert(srv_device_rw);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate the udev devices below the whole-disk device; these are
         * matched against the blkid partition list by device number. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so the cleanup handler never sees garbage */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        r = 0;

                if (r < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Transfer ownership of the chosen paths to the caller */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2514
/* Mounts the block device 'what' on 'where' (with 'directory' appended if it
 * is non-NULL), after probing the file system type with libblkid.
 *
 * The mount is always MS_NODEV, and read-only if 'rw' is false or
 * arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure;
 * -ENOTSUP when built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Actual probe
                 * errors (-1) are reported with errno below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2580
727fd4fd
LP
/* Mounts the dissected image partitions below 'where': the root device on
 * 'where' itself, the home device on <where>/home and the srv device on
 * <where>/srv. Devices passed as NULL are skipped. Each *_rw flag is passed
 * on to mount_device() to select a writable or read-only mount.
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* The root must be mounted first: the other two go on top of it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2616
2617static void loop_remove(int nr, int *image_fd) {
2618 _cleanup_close_ int control = -1;
e8c8ddcc 2619 int r;
1b9e5b12
LP
2620
2621 if (nr < 0)
2622 return;
2623
2624 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2625 r = ioctl(*image_fd, LOOP_CLR_FD);
2626 if (r < 0)
2627 log_warning("Failed to close loop image: %m");
03e334a1 2628 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2629 }
2630
2631 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc
TG
2632 if (control < 0) {
2633 log_warning("Failed to open /dev/loop-control: %m");
1b9e5b12 2634 return;
e8c8ddcc 2635 }
1b9e5b12 2636
e8c8ddcc
TG
2637 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2638 if (r < 0)
2639 log_warning("Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2640}
2641
0cb9fbcd
LP
2642static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2643 int pipe_fds[2];
2644 pid_t pid;
2645
2646 assert(database);
2647 assert(key);
2648 assert(rpid);
2649
2650 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2651 log_error("Failed to allocate pipe: %m");
2652 return -errno;
2653 }
2654
2655 pid = fork();
2656 if (pid < 0) {
2657 log_error("Failed to fork getent child: %m");
2658 return -errno;
2659 } else if (pid == 0) {
2660 int nullfd;
2661 char *empty_env = NULL;
2662
2663 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2664 _exit(EXIT_FAILURE);
2665
2666 if (pipe_fds[0] > 2)
03e334a1 2667 safe_close(pipe_fds[0]);
0cb9fbcd 2668 if (pipe_fds[1] > 2)
03e334a1 2669 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2670
2671 nullfd = open("/dev/null", O_RDWR);
2672 if (nullfd < 0)
2673 _exit(EXIT_FAILURE);
2674
2675 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2676 _exit(EXIT_FAILURE);
2677
2678 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2679 _exit(EXIT_FAILURE);
2680
2681 if (nullfd > 2)
03e334a1 2682 safe_close(nullfd);
0cb9fbcd
LP
2683
2684 reset_all_signal_handlers();
2685 close_all_fds(NULL, 0);
2686
4de82926
MM
2687 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2688 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2689 _exit(EXIT_FAILURE);
2690 }
2691
03e334a1 2692 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2693
2694 *rpid = pid;
2695
2696 return pipe_fds[0];
2697}
2698
2699static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2700 char line[LINE_MAX], *x, *u, *g, *h;
2701 const char *word, *state;
0cb9fbcd
LP
2702 _cleanup_free_ uid_t *uids = NULL;
2703 _cleanup_free_ char *home = NULL;
2704 _cleanup_fclose_ FILE *f = NULL;
2705 _cleanup_close_ int fd = -1;
2706 unsigned n_uids = 0;
70f539ca 2707 size_t sz = 0, l;
0cb9fbcd
LP
2708 uid_t uid;
2709 gid_t gid;
2710 pid_t pid;
2711 int r;
2712
2713 assert(_home);
2714
2715 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2716 /* Reset everything fully to 0, just in case */
2717
2718 if (setgroups(0, NULL) < 0) {
2719 log_error("setgroups() failed: %m");
2720 return -errno;
2721 }
2722
2723 if (setresgid(0, 0, 0) < 0) {
2724 log_error("setregid() failed: %m");
2725 return -errno;
2726 }
2727
2728 if (setresuid(0, 0, 0) < 0) {
2729 log_error("setreuid() failed: %m");
2730 return -errno;
2731 }
2732
2733 *_home = NULL;
2734 return 0;
2735 }
2736
2737 /* First, get user credentials */
2738 fd = spawn_getent("passwd", arg_user, &pid);
2739 if (fd < 0)
2740 return fd;
2741
2742 f = fdopen(fd, "r");
2743 if (!f)
2744 return log_oom();
2745 fd = -1;
2746
2747 if (!fgets(line, sizeof(line), f)) {
2748
2749 if (!ferror(f)) {
2750 log_error("Failed to resolve user %s.", arg_user);
2751 return -ESRCH;
2752 }
2753
2754 log_error("Failed to read from getent: %m");
2755 return -errno;
2756 }
2757
2758 truncate_nl(line);
2759
2760 wait_for_terminate_and_warn("getent passwd", pid);
2761
2762 x = strchr(line, ':');
2763 if (!x) {
2764 log_error("/etc/passwd entry has invalid user field.");
2765 return -EIO;
2766 }
2767
2768 u = strchr(x+1, ':');
2769 if (!u) {
2770 log_error("/etc/passwd entry has invalid password field.");
2771 return -EIO;
2772 }
2773
2774 u++;
2775 g = strchr(u, ':');
2776 if (!g) {
2777 log_error("/etc/passwd entry has invalid UID field.");
2778 return -EIO;
2779 }
2780
2781 *g = 0;
2782 g++;
2783 x = strchr(g, ':');
2784 if (!x) {
2785 log_error("/etc/passwd entry has invalid GID field.");
2786 return -EIO;
2787 }
2788
2789 *x = 0;
2790 h = strchr(x+1, ':');
2791 if (!h) {
2792 log_error("/etc/passwd entry has invalid GECOS field.");
2793 return -EIO;
2794 }
2795
2796 h++;
2797 x = strchr(h, ':');
2798 if (!x) {
2799 log_error("/etc/passwd entry has invalid home directory field.");
2800 return -EIO;
2801 }
2802
2803 *x = 0;
2804
2805 r = parse_uid(u, &uid);
2806 if (r < 0) {
2807 log_error("Failed to parse UID of user.");
2808 return -EIO;
2809 }
2810
2811 r = parse_gid(g, &gid);
2812 if (r < 0) {
2813 log_error("Failed to parse GID of user.");
2814 return -EIO;
2815 }
2816
2817 home = strdup(h);
2818 if (!home)
2819 return log_oom();
2820
2821 /* Second, get group memberships */
2822 fd = spawn_getent("initgroups", arg_user, &pid);
2823 if (fd < 0)
2824 return fd;
2825
2826 fclose(f);
2827 f = fdopen(fd, "r");
2828 if (!f)
2829 return log_oom();
2830 fd = -1;
2831
2832 if (!fgets(line, sizeof(line), f)) {
2833 if (!ferror(f)) {
2834 log_error("Failed to resolve user %s.", arg_user);
2835 return -ESRCH;
2836 }
2837
2838 log_error("Failed to read from getent: %m");
2839 return -errno;
2840 }
2841
2842 truncate_nl(line);
2843
2844 wait_for_terminate_and_warn("getent initgroups", pid);
2845
2846 /* Skip over the username and subsequent separator whitespace */
2847 x = line;
2848 x += strcspn(x, WHITESPACE);
2849 x += strspn(x, WHITESPACE);
2850
a2a5291b 2851 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2852 char c[l+1];
2853
a2a5291b 2854 memcpy(c, word, l);
0cb9fbcd
LP
2855 c[l] = 0;
2856
2857 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2858 return log_oom();
2859
2860 r = parse_uid(c, &uids[n_uids++]);
2861 if (r < 0) {
2862 log_error("Failed to parse group data from getent.");
2863 return -EIO;
2864 }
2865 }
2866
2867 r = mkdir_parents(home, 0775);
2868 if (r < 0) {
2869 log_error("Failed to make home root directory: %s", strerror(-r));
2870 return r;
2871 }
2872
2873 r = mkdir_safe(home, 0755, uid, gid);
f418f31d 2874 if (r < 0 && r != -EEXIST) {
0cb9fbcd
LP
2875 log_error("Failed to make home directory: %s", strerror(-r));
2876 return r;
2877 }
2878
2879 fchown(STDIN_FILENO, uid, gid);
2880 fchown(STDOUT_FILENO, uid, gid);
2881 fchown(STDERR_FILENO, uid, gid);
2882
2883 if (setgroups(n_uids, uids) < 0) {
2884 log_error("Failed to set auxiliary groups: %m");
2885 return -errno;
2886 }
2887
2888 if (setresgid(gid, gid, gid) < 0) {
2889 log_error("setregid() failed: %m");
2890 return -errno;
2891 }
2892
2893 if (setresuid(uid, uid, uid) < 0) {
2894 log_error("setreuid() failed: %m");
2895 return -errno;
2896 }
2897
2898 if (_home) {
2899 *_home = home;
2900 home = NULL;
2901 }
2902
2903 return 0;
2904}
2905
113cea80 2906/*
6d416b9c
LS
2907 * Return values:
2908 * < 0 : wait_for_terminate() failed to get the state of the
2909 * container, the container was terminated by a signal, or
2910 * failed for an unknown reason. No change is made to the
2911 * container argument.
2912 * > 0 : The program executed in the container terminated with an
2913 * error. The exit code of the program executed in the
2914 * container is returned. No change is made to the container
2915 * argument.
2916 * 0 : The container is being rebooted, has been shut down or exited
2917 * successfully. The container argument has been set to either
2918 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2919 *
6d416b9c
LS
2920 * That is, success is indicated by a return value of zero, and an
2921 * error is indicated by a non-zero value.
113cea80
DH
2922 */
2923static int wait_for_container(pid_t pid, ContainerStatus *container) {
2924 int r;
2925 siginfo_t status;
2926
2927 r = wait_for_terminate(pid, &status);
8baaf7a3
LS
2928 if (r < 0) {
2929 log_warning("Failed to wait for container: %s", strerror(-r));
113cea80 2930 return r;
8baaf7a3 2931 }
113cea80
DH
2932
2933 switch (status.si_code) {
2934 case CLD_EXITED:
2935 r = status.si_status;
2936 if (r == 0) {
2937 if (!arg_quiet)
2938 log_debug("Container %s exited successfully.",
2939 arg_machine);
2940
2941 *container = CONTAINER_TERMINATED;
2942 } else {
2943 log_error("Container %s failed with error code %i.",
2944 arg_machine, status.si_status);
113cea80
DH
2945 }
2946 break;
2947
2948 case CLD_KILLED:
2949 if (status.si_status == SIGINT) {
2950 if (!arg_quiet)
2951 log_info("Container %s has been shut down.",
2952 arg_machine);
2953
2954 *container = CONTAINER_TERMINATED;
2955 r = 0;
2956 break;
2957 } else if (status.si_status == SIGHUP) {
2958 if (!arg_quiet)
2959 log_info("Container %s is being rebooted.",
2960 arg_machine);
2961
2962 *container = CONTAINER_REBOOTED;
2963 r = 0;
2964 break;
2965 }
2966 /* CLD_KILLED fallthrough */
2967
2968 case CLD_DUMPED:
2969 log_error("Container %s terminated by signal %s.",
2970 arg_machine, signal_to_string(status.si_status));
2971 r = -1;
2972 break;
2973
2974 default:
2975 log_error("Container %s failed due to unknown reason.",
2976 arg_machine);
2977 r = -1;
2978 break;
2979 }
2980
2981 return r;
2982}
2983
e866af3a
DH
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls. */
static void nop_handler(int sig) {}
2985
88213476 2986int main(int argc, char *argv[]) {
69c79d3c 2987
1b9e5b12 2988 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2989 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
d96c1ecf 2990 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
3d94f76c 2991 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2992 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2993 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2994 const char *console = NULL;
1b9e5b12
LP
2995 char veth_name[IFNAMSIZ];
2996 bool secondary = false;
e866af3a 2997 sigset_t mask, mask_chld;
69c79d3c 2998 pid_t pid = 0;
88213476
LP
2999
3000 log_parse_environment();
3001 log_open();
3002
05947bef
LP
3003 k = parse_argv(argc, argv);
3004 if (k < 0)
88213476 3005 goto finish;
05947bef
LP
3006 else if (k == 0) {
3007 r = EXIT_SUCCESS;
3008 goto finish;
3009 }
88213476 3010
1b9e5b12
LP
3011 if (!arg_image) {
3012 if (arg_directory) {
3013 char *p;
88213476 3014
1b9e5b12
LP
3015 p = path_make_absolute_cwd(arg_directory);
3016 free(arg_directory);
3017 arg_directory = p;
3018 } else
3019 arg_directory = get_current_dir_name();
88213476 3020
1b9e5b12
LP
3021 if (!arg_directory) {
3022 log_error("Failed to determine path, please use -D.");
3023 goto finish;
3024 }
3025 path_kill_slashes(arg_directory);
88213476
LP
3026 }
3027
7027ff61 3028 if (!arg_machine) {
1b9e5b12 3029 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
3030 if (!arg_machine) {
3031 log_oom();
3032 goto finish;
3033 }
3034
e724b063 3035 hostname_cleanup(arg_machine, false);
7027ff61
LP
3036 if (isempty(arg_machine)) {
3037 log_error("Failed to determine machine name automatically, please use -M.");
3038 goto finish;
3039 }
3040 }
3041
88213476
LP
3042 if (geteuid() != 0) {
3043 log_error("Need to be root.");
3044 goto finish;
3045 }
3046
04d391da
LP
3047 if (sd_booted() <= 0) {
3048 log_error("Not running on a systemd system.");
3049 goto finish;
3050 }
3051
1b9e5b12
LP
3052 log_close();
3053 n_fd_passed = sd_listen_fds(false);
3054 if (n_fd_passed > 0) {
3055 k = fdset_new_listen_fds(&fds, false);
3056 if (k < 0) {
3057 log_error("Failed to collect file descriptors: %s", strerror(-k));
3058 goto finish;
3059 }
88213476 3060 }
1b9e5b12
LP
3061 fdset_close_others(fds);
3062 log_open();
88213476 3063
1b9e5b12
LP
3064 if (arg_directory) {
3065 if (path_equal(arg_directory, "/")) {
3066 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
3067 goto finish;
3068 }
1b9e5b12
LP
3069
3070 if (arg_boot) {
3071 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3072 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
3073 goto finish;
3074 }
3075 } else {
3076 const char *p;
3077
3078 p = strappenda(arg_directory,
3079 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3080 if (access(p, F_OK) < 0) {
3081 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3082 goto finish;
3083
3084 }
3085 }
6b9132a9 3086 } else {
1b9e5b12 3087 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3088
1b9e5b12
LP
3089 if (!mkdtemp(template)) {
3090 log_error("Failed to create temporary directory: %m");
3091 r = -errno;
6b9132a9 3092 goto finish;
1b9e5b12 3093 }
6b9132a9 3094
1b9e5b12
LP
3095 arg_directory = strdup(template);
3096 if (!arg_directory) {
3097 r = log_oom();
3098 goto finish;
6b9132a9 3099 }
88213476 3100
1b9e5b12
LP
3101 image_fd = setup_image(&device_path, &loop_nr);
3102 if (image_fd < 0) {
3103 r = image_fd;
842f3b0f
LP
3104 goto finish;
3105 }
1b9e5b12 3106
4d9f07b4
LP
3107 r = dissect_image(image_fd,
3108 &root_device, &root_device_rw,
3109 &home_device, &home_device_rw,
3110 &srv_device, &srv_device_rw,
3111 &secondary);
1b9e5b12
LP
3112 if (r < 0)
3113 goto finish;
842f3b0f 3114 }
842f3b0f 3115
db7feb7e
LP
3116 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3117 if (master < 0) {
a258bf26
LP
3118 log_error("Failed to acquire pseudo tty: %m");
3119 goto finish;
3120 }
3121
db7feb7e
LP
3122 console = ptsname(master);
3123 if (!console) {
a258bf26
LP
3124 log_error("Failed to determine tty name: %m");
3125 goto finish;
3126 }
3127
284c0b91 3128 if (!arg_quiet)
45f1386c
ZJS
3129 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3130 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
3131
3132 if (unlockpt(master) < 0) {
3133 log_error("Failed to unlock tty: %m");
3134 goto finish;
3135 }
3136
eb91eb18
LP
3137 if (access("/dev/kdbus/control", F_OK) >= 0) {
3138
3139 if (arg_share_system) {
3140 kdbus_domain = strdup("/dev/kdbus");
3141 if (!kdbus_domain) {
3142 log_oom();
3143 goto finish;
3144 }
3145 } else {
3146 const char *ns;
3147
3148 ns = strappenda("machine-", arg_machine);
3149 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3150 if (r < 0)
3151 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3152 else
3153 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3154 }
3155 }
9bd37b40 3156
e58a1277 3157 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
3158 log_error("Failed to create kmsg socket pair: %m");
3159 goto finish;
3160 }
3161
af4ec430
LP
3162 sd_notify(false,
3163 "READY=1\n"
3164 "STATUS=Container running.");
05947bef 3165
a258bf26 3166 assert_se(sigemptyset(&mask) == 0);
e866af3a
DH
3167 assert_se(sigemptyset(&mask_chld) == 0);
3168 sigaddset(&mask_chld, SIGCHLD);
a258bf26
LP
3169 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3170 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3171
d87be9b0 3172 for (;;) {
113cea80 3173 ContainerStatus container_status;
7566e267 3174 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3175 struct sigaction sa = {
3176 .sa_handler = nop_handler,
3177 .sa_flags = SA_NOCLDSTOP,
3178 };
3179
7566e267 3180 r = barrier_create(&barrier);
a2da110b
DH
3181 if (r < 0) {
3182 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3183 goto finish;
3184 }
3185
e866af3a
DH
3186 /* Child can be killed before execv(), so handle SIGCHLD
3187 * in order to interrupt parent's blocking calls and
3188 * give it a chance to call wait() and terminate. */
3189 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3190 if (r < 0) {
3191 log_error("Failed to change the signal mask: %m");
d96c1ecf
LP
3192 goto finish;
3193 }
3194
e866af3a
DH
3195 r = sigaction(SIGCHLD, &sa, NULL);
3196 if (r < 0) {
3197 log_error("Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3198 goto finish;
3199 }
3200
a2da110b
DH
3201 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3202 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3203 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3204 if (pid < 0) {
3205 if (errno == EINVAL)
3206 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3207 else
3208 log_error("clone() failed: %m");
a258bf26 3209
e866af3a 3210 r = pid;
d87be9b0
LP
3211 goto finish;
3212 }
a258bf26 3213
d87be9b0
LP
3214 if (pid == 0) {
3215 /* child */
0cb9fbcd 3216 _cleanup_free_ char *home = NULL;
5674767e 3217 unsigned n_env = 2;
d87be9b0 3218 const char *envp[] = {
e10a55fd 3219 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3220 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3221 NULL, /* TERM */
3222 NULL, /* HOME */
3223 NULL, /* USER */
3224 NULL, /* LOGNAME */
3225 NULL, /* container_uuid */
842f3b0f
LP
3226 NULL, /* LISTEN_FDS */
3227 NULL, /* LISTEN_PID */
d87be9b0
LP
3228 NULL
3229 };
f4889f65 3230 char **env_use;
a258bf26 3231
a2da110b
DH
3232 barrier_set_role(&barrier, BARRIER_CHILD);
3233
5674767e
ZJS
3234 envp[n_env] = strv_find_prefix(environ, "TERM=");
3235 if (envp[n_env])
3236 n_env ++;
a258bf26 3237
03e334a1 3238 master = safe_close(master);
a258bf26 3239
d87be9b0
LP
3240 close_nointr(STDIN_FILENO);
3241 close_nointr(STDOUT_FILENO);
3242 close_nointr(STDERR_FILENO);
db7feb7e 3243
03e334a1 3244 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3245
d87be9b0 3246 reset_all_signal_handlers();
1b6d7fa7 3247 reset_signal_mask();
f5c1b9ee 3248
842f3b0f
LP
3249 k = open_terminal(console, O_RDWR);
3250 if (k != STDIN_FILENO) {
3251 if (k >= 0) {
03e334a1 3252 safe_close(k);
842f3b0f
LP
3253 k = -EINVAL;
3254 }
3255
3256 log_error("Failed to open console: %s", strerror(-k));
a2da110b 3257 _exit(EXIT_FAILURE);
842f3b0f
LP
3258 }
3259
3260 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3261 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3262 log_error("Failed to duplicate console: %m");
a2da110b 3263 _exit(EXIT_FAILURE);
842f3b0f 3264 }
bc2f673e 3265
d87be9b0
LP
3266 if (setsid() < 0) {
3267 log_error("setsid() failed: %m");
a2da110b 3268 _exit(EXIT_FAILURE);
bc2f673e
LP
3269 }
3270
db999e0f 3271 if (reset_audit_loginuid() < 0)
a2da110b 3272 _exit(EXIT_FAILURE);
db999e0f 3273
d87be9b0
LP
3274 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3275 log_error("PR_SET_PDEATHSIG failed: %m");
a2da110b 3276 _exit(EXIT_FAILURE);
d87be9b0 3277 }
e58a1277 3278
d87be9b0
LP
3279 /* Mark everything as slave, so that we still
3280 * receive mounts from the real root, but don't
3281 * propagate mounts to the real root. */
3282 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3283 log_error("MS_SLAVE|MS_REC failed: %m");
a2da110b 3284 _exit(EXIT_FAILURE);
d87be9b0 3285 }
04bc4a3f 3286
727fd4fd
LP
3287 if (mount_devices(arg_directory,
3288 root_device, root_device_rw,
3289 home_device, home_device_rw,
3290 srv_device, srv_device_rw) < 0)
a2da110b 3291 _exit(EXIT_FAILURE);
1b9e5b12 3292
d87be9b0
LP
3293 /* Turn directory into bind mount */
3294 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
d6797c92 3295 log_error("Failed to make bind mount: %m");
a2da110b 3296 _exit(EXIT_FAILURE);
d87be9b0 3297 }
88213476 3298
4d9f07b4
LP
3299 r = setup_volatile(arg_directory);
3300 if (r < 0)
a2da110b 3301 _exit(EXIT_FAILURE);
4d9f07b4
LP
3302
3303 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3304 _exit(EXIT_FAILURE);
4d9f07b4
LP
3305
3306 r = base_filesystem_create(arg_directory);
3307 if (r < 0)
a2da110b 3308 _exit(EXIT_FAILURE);
4d9f07b4 3309
d6797c92
LP
3310 if (arg_read_only) {
3311 k = bind_remount_recursive(arg_directory, true);
3312 if (k < 0) {
3313 log_error("Failed to make tree read-only: %s", strerror(-k));
a2da110b 3314 _exit(EXIT_FAILURE);
d87be9b0 3315 }
d6797c92 3316 }
2547bb41 3317
d87be9b0 3318 if (mount_all(arg_directory) < 0)
a2da110b 3319 _exit(EXIT_FAILURE);
57fb9fb5 3320
d87be9b0 3321 if (copy_devnodes(arg_directory) < 0)
a2da110b 3322 _exit(EXIT_FAILURE);
a258bf26 3323
f2d88580 3324 if (setup_ptmx(arg_directory) < 0)
a2da110b 3325 _exit(EXIT_FAILURE);
f2d88580 3326
d87be9b0 3327 dev_setup(arg_directory);
88213476 3328
28650077 3329 if (setup_seccomp() < 0)
a2da110b 3330 _exit(EXIT_FAILURE);
24fb1112 3331
d87be9b0 3332 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3333 _exit(EXIT_FAILURE);
88213476 3334
d87be9b0 3335 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3336 _exit(EXIT_FAILURE);
88213476 3337
03e334a1 3338 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3339
d87be9b0 3340 if (setup_boot_id(arg_directory) < 0)
a2da110b 3341 _exit(EXIT_FAILURE);
a41fe3a2 3342
d87be9b0 3343 if (setup_timezone(arg_directory) < 0)
a2da110b 3344 _exit(EXIT_FAILURE);
88213476 3345
d87be9b0 3346 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3347 _exit(EXIT_FAILURE);
687d0825 3348
d87be9b0 3349 if (setup_journal(arg_directory) < 0)
a2da110b 3350 _exit(EXIT_FAILURE);
687d0825 3351
d6797c92 3352 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3353 _exit(EXIT_FAILURE);
17fe0523 3354
d6797c92 3355 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3356 _exit(EXIT_FAILURE);
17fe0523 3357
06c17c39 3358 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3359 _exit(EXIT_FAILURE);
06c17c39 3360
486e99a3 3361 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
a2da110b 3362 _exit(EXIT_FAILURE);
9bd37b40 3363
d96c1ecf
LP
3364 /* Tell the parent that we are ready, and that
3365 * it can cgroupify us so that we lack access
3366 * to certain devices and resources. */
a2da110b 3367 barrier_place(&barrier);
d96c1ecf 3368
d87be9b0
LP
3369 if (chdir(arg_directory) < 0) {
3370 log_error("chdir(%s) failed: %m", arg_directory);
a2da110b 3371 _exit(EXIT_FAILURE);
687d0825
MV
3372 }
3373
d87be9b0
LP
3374 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3375 log_error("mount(MS_MOVE) failed: %m");
a2da110b 3376 _exit(EXIT_FAILURE);
687d0825
MV
3377 }
3378
d87be9b0
LP
3379 if (chroot(".") < 0) {
3380 log_error("chroot() failed: %m");
a2da110b 3381 _exit(EXIT_FAILURE);
687d0825
MV
3382 }
3383
d87be9b0
LP
3384 if (chdir("/") < 0) {
3385 log_error("chdir() failed: %m");
a2da110b 3386 _exit(EXIT_FAILURE);
687d0825
MV
3387 }
3388
d87be9b0
LP
3389 umask(0022);
3390
eb91eb18
LP
3391 if (arg_private_network)
3392 loopback_setup();
d87be9b0
LP
3393
3394 if (drop_capabilities() < 0) {
3395 log_error("drop_capabilities() failed: %m");
a2da110b 3396 _exit(EXIT_FAILURE);
687d0825 3397 }
687d0825 3398
0cb9fbcd
LP
3399 r = change_uid_gid(&home);
3400 if (r < 0)
a2da110b 3401 _exit(EXIT_FAILURE);
d87be9b0 3402
842f3b0f
LP
3403 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3404 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3405 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3406 log_oom();
a2da110b 3407 _exit(EXIT_FAILURE);
144f0fc0 3408 }
687d0825 3409
9444b1f2 3410 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3411 char as_uuid[37];
3412
3413 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3414 log_oom();
a2da110b 3415 _exit(EXIT_FAILURE);
842f3b0f
LP
3416 }
3417 }
3418
3419 if (fdset_size(fds) > 0) {
3420 k = fdset_cloexec(fds, false);
3421 if (k < 0) {
3422 log_error("Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3423 _exit(EXIT_FAILURE);
842f3b0f
LP
3424 }
3425
3426 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3427 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3428 log_oom();
a2da110b 3429 _exit(EXIT_FAILURE);
d87be9b0
LP
3430 }
3431 }
3432
3433 setup_hostname();
3434
6afc95b7
LP
3435 if (arg_personality != 0xffffffffLU) {
3436 if (personality(arg_personality) < 0) {
3437 log_error("personality() failed: %m");
a2da110b 3438 _exit(EXIT_FAILURE);
6afc95b7 3439 }
1b9e5b12
LP
3440 } else if (secondary) {
3441 if (personality(PER_LINUX32) < 0) {
3442 log_error("personality() failed: %m");
a2da110b 3443 _exit(EXIT_FAILURE);
1b9e5b12 3444 }
6afc95b7
LP
3445 }
3446
d96c1ecf
LP
3447#ifdef HAVE_SELINUX
3448 if (arg_selinux_context)
0cb9fbcd 3449 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 3450 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3451 _exit(EXIT_FAILURE);
0cb9fbcd 3452 }
d96c1ecf 3453#endif
354bfd2b 3454
f4889f65
LP
3455 if (!strv_isempty(arg_setenv)) {
3456 char **n;
3457
3458 n = strv_env_merge(2, envp, arg_setenv);
3459 if (!n) {
3460 log_oom();
a2da110b 3461 _exit(EXIT_FAILURE);
f4889f65
LP
3462 }
3463
3464 env_use = n;
3465 } else
3466 env_use = (char**) envp;
3467
d96c1ecf 3468 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3469 if (!barrier_place_and_sync(&barrier))
3470 _exit(EXIT_FAILURE);
d96c1ecf 3471
d87be9b0
LP
3472 if (arg_boot) {
3473 char **a;
3474 size_t l;
88213476 3475
d87be9b0 3476 /* Automatically search for the init system */
0f0dbc46 3477
d87be9b0
LP
3478 l = 1 + argc - optind;
3479 a = newa(char*, l + 1);
3480 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3481
d87be9b0 3482 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3483 execve(a[0], a, env_use);
0f0dbc46 3484
d87be9b0 3485 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3486 execve(a[0], a, env_use);
0f0dbc46 3487
d87be9b0 3488 a[0] = (char*) "/sbin/init";
f4889f65 3489 execve(a[0], a, env_use);
d87be9b0 3490 } else if (argc > optind)
f4889f65 3491 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3492 else {
3493 chdir(home ? home : "/root");
f4889f65 3494 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3495 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3496 }
3497
3498 log_error("execv() failed: %m");
d87be9b0 3499 _exit(EXIT_FAILURE);
da5b3bad 3500 }
88213476 3501
a2da110b 3502 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3503 fdset_free(fds);
3504 fds = NULL;
3505
a2da110b
DH
3506 /* wait for child-setup to be done */
3507 if (barrier_place_and_sync(&barrier)) {
5aa4bb6b 3508 int ifi = 0;
354bfd2b 3509
840295fc
LP
3510 r = move_network_interfaces(pid);
3511 if (r < 0)
3512 goto finish;
aa28aefe 3513
5aa4bb6b 3514 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3515 if (r < 0)
3516 goto finish;
ab046dde 3517
5aa4bb6b 3518 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3519 if (r < 0)
3520 goto finish;
ab046dde 3521
840295fc
LP
3522 r = setup_macvlan(pid);
3523 if (r < 0)
3524 goto finish;
c74e630d 3525
5aa4bb6b
LP
3526 r = register_machine(pid, ifi);
3527 if (r < 0)
3528 goto finish;
3529
840295fc
LP
3530 /* Block SIGCHLD here, before notifying child.
3531 * process_pty() will handle it with the other signals. */
3532 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3533 if (r < 0)
3534 goto finish;
e866af3a 3535
840295fc
LP
3536 /* Reset signal to default */
3537 r = default_signals(SIGCHLD, -1);
3538 if (r < 0)
3539 goto finish;
e866af3a 3540
840295fc
LP
3541 /* Notify the child that the parent is ready with all
3542 * its setup, and that the child can now hand over
3543 * control to the code to run inside the container. */
a2da110b 3544 barrier_place(&barrier);
354bfd2b 3545
840295fc
LP
3546 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3547 if (k < 0) {
3548 r = EXIT_FAILURE;
3549 break;
3550 }
88213476 3551
840295fc
LP
3552 if (!arg_quiet)
3553 putc('\n', stdout);
04d39279 3554
840295fc
LP
3555 /* Kill if it is not dead yet anyway */
3556 terminate_machine(pid);
3557 }
1f0cd86b 3558
840295fc 3559 /* Normally redundant, but better safe than sorry */
04d39279 3560 kill(pid, SIGKILL);
a258bf26 3561
113cea80 3562 r = wait_for_container(pid, &container_status);
04d39279
LP
3563 pid = 0;
3564
ce9f1527
LP
3565 if (r < 0) {
3566 /* We failed to wait for the container, or the
3567 * container exited abnormally */
3568 r = EXIT_FAILURE;
d87be9b0 3569 break;
ce9f1527
LP
3570 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3571 /* The container exited with a non-zero
3572 * status, or with zero status and no reboot
3573 * was requested. */
d87be9b0 3574 break;
88213476 3575
113cea80 3576 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3577
3578 if (arg_keep_unit) {
3579 /* Special handling if we are running as a
3580 * service: instead of simply restarting the
3581 * machine we want to restart the entire
3582 * service, so let's inform systemd about this
3583 * with the special exit code 133. The service
3584 * file uses RestartForceExitStatus=133 so
3585 * that this results in a full nspawn
3586 * restart. This is necessary since we might
3587 * have cgroup parameters set we want to have
3588 * flushed out. */
3589 r = 133;
3590 break;
3591 }
d87be9b0 3592 }
88213476
LP
3593
3594finish:
af4ec430
LP
3595 sd_notify(false,
3596 "STOPPING=1\n"
3597 "STATUS=Terminating...");
3598
1b9e5b12
LP
3599 loop_remove(loop_nr, &image_fd);
3600
9444b1f2
LP
3601 if (pid > 0)
3602 kill(pid, SIGKILL);
88213476 3603
04d391da 3604 free(arg_directory);
7027ff61 3605 free(arg_machine);
c74e630d
LP
3606 free(arg_user);
3607 strv_free(arg_setenv);
3608 strv_free(arg_network_interfaces);
3609 strv_free(arg_network_macvlan);
3610 strv_free(arg_bind);
3611 strv_free(arg_bind_ro);
06c17c39 3612 strv_free(arg_tmpfs);
88213476
LP
3613
3614 return r;
3615}