]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: copy /dev/net/tun from host
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <net/if.h>
69c79d3c 44#include <linux/veth.h>
6afc95b7 45#include <sys/personality.h>
1b9e5b12 46#include <linux/loop.h>
aa28aefe 47
5d63309c 48#ifdef HAVE_SELINUX
a8828ed9
DW
49#include <selinux/selinux.h>
50#endif
88213476 51
24fb1112
LP
52#ifdef HAVE_SECCOMP
53#include <seccomp.h>
54#endif
55
1b9e5b12
LP
56#ifdef HAVE_BLKID
57#include <blkid/blkid.h>
58#endif
59
1f0cd86b
LP
60#include "sd-daemon.h"
61#include "sd-bus.h"
62#include "sd-id128.h"
aa28aefe 63#include "sd-rtnl.h"
88213476
LP
64#include "log.h"
65#include "util.h"
49e942b2 66#include "mkdir.h"
6b2d0e85 67#include "macro.h"
d7832d2c 68#include "audit.h"
94d82985 69#include "missing.h"
04d391da 70#include "cgroup-util.h"
a258bf26 71#include "strv.h"
9eb977db 72#include "path-util.h"
a41fe3a2 73#include "loopback-setup.h"
4fc9982c 74#include "dev-setup.h"
842f3b0f 75#include "fdset.h"
acbeb427 76#include "build.h"
a5c32cff 77#include "fileio.h"
40ca29a1 78#include "bus-util.h"
1f0cd86b 79#include "bus-error.h"
4ba93280 80#include "ptyfwd.h"
9bd37b40 81#include "bus-kernel.h"
f4889f65 82#include "env-util.h"
7f112f50 83#include "def.h"
aa28aefe 84#include "rtnl-util.h"
7e227024 85#include "udev-util.h"
1b9e5b12
LP
86#include "blkid-util.h"
87#include "gpt.h"
01dde061 88#include "siphash24.h"
849958d1 89#include "copy.h"
3577de7a 90#include "base-filesystem.h"
a2da110b 91#include "barrier.h"
f2d88580 92
e9642be2
LP
93#ifdef HAVE_SECCOMP
94#include "seccomp-util.h"
95#endif
96
113cea80
DH
/* How a container run came to an end. The consumers of this enum are
 * outside this part of the file; judging by the names it distinguishes a
 * final shutdown from a reboot request. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
101
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal=no|auto|host|guest
 * (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476 108
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE]:
 *   VOLATILE_NO    - nothing is done
 *   VOLATILE_YES   - tmpfs as root, original /usr bind mounted read-only
 *   VOLATILE_STATE - tree remounted read-only, tmpfs mounted on /var */
typedef enum Volatile {
        VOLATILE_NO = 0,
        VOLATILE_YES = 1,
        VOLATILE_STATE = 2
} Volatile;
114
88213476 115static char *arg_directory = NULL;
687d0825 116static char *arg_user = NULL;
9444b1f2 117static sd_id128_t arg_uuid = {};
7027ff61 118static char *arg_machine = NULL;
c74e630d
LP
119static const char *arg_selinux_context = NULL;
120static const char *arg_selinux_apifs_context = NULL;
9444b1f2 121static const char *arg_slice = NULL;
ff01d048 122static bool arg_private_network = false;
bc2f673e 123static bool arg_read_only = false;
0f0dbc46 124static bool arg_boot = false;
57fb9fb5 125static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
126static uint64_t arg_retain =
127 (1ULL << CAP_CHOWN) |
128 (1ULL << CAP_DAC_OVERRIDE) |
129 (1ULL << CAP_DAC_READ_SEARCH) |
130 (1ULL << CAP_FOWNER) |
131 (1ULL << CAP_FSETID) |
132 (1ULL << CAP_IPC_OWNER) |
133 (1ULL << CAP_KILL) |
134 (1ULL << CAP_LEASE) |
135 (1ULL << CAP_LINUX_IMMUTABLE) |
136 (1ULL << CAP_NET_BIND_SERVICE) |
137 (1ULL << CAP_NET_BROADCAST) |
138 (1ULL << CAP_NET_RAW) |
139 (1ULL << CAP_SETGID) |
140 (1ULL << CAP_SETFCAP) |
141 (1ULL << CAP_SETPCAP) |
142 (1ULL << CAP_SETUID) |
143 (1ULL << CAP_SYS_ADMIN) |
144 (1ULL << CAP_SYS_CHROOT) |
145 (1ULL << CAP_SYS_NICE) |
146 (1ULL << CAP_SYS_PTRACE) |
147 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 148 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
149 (1ULL << CAP_SYS_BOOT) |
150 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
151 (1ULL << CAP_AUDIT_CONTROL) |
152 (1ULL << CAP_MKNOD);
17fe0523
LP
153static char **arg_bind = NULL;
154static char **arg_bind_ro = NULL;
06c17c39 155static char **arg_tmpfs = NULL;
f4889f65 156static char **arg_setenv = NULL;
284c0b91 157static bool arg_quiet = false;
8a96d94e 158static bool arg_share_system = false;
eb91eb18 159static bool arg_register = true;
89f7c846 160static bool arg_keep_unit = false;
aa28aefe 161static char **arg_network_interfaces = NULL;
c74e630d 162static char **arg_network_macvlan = NULL;
69c79d3c 163static bool arg_network_veth = false;
c74e630d 164static const char *arg_network_bridge = NULL;
6afc95b7 165static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 166static const char *arg_image = NULL;
4d9f07b4 167static Volatile arg_volatile = VOLATILE_NO;
88213476 168
601185b4 169static void help(void) {
88213476
LP
170 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
172 " -h --help Show this help\n"
173 " --version Print version string\n"
69c79d3c 174 " -q --quiet Do not show status information\n"
1b9e5b12
LP
175 " -D --directory=PATH Root directory for the container\n"
176 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
177 " -b --boot Boot up full system (i.e. invoke init)\n"
178 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 179 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 180 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 181 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
182 " --private-network Disable network in container\n"
183 " --network-interface=INTERFACE\n"
184 " Assign an existing network interface to the\n"
185 " container\n"
c74e630d
LP
186 " --network-macvlan=INTERFACE\n"
187 " Create a macvlan network interface based on an\n"
188 " existing network interface to the container\n"
32457153 189 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 190 " and container\n"
ab046dde 191 " --network-bridge=INTERFACE\n"
32457153 192 " Add a virtual ethernet connection between host\n"
ab046dde
TG
193 " and container and add it to an existing bridge on\n"
194 " the host\n"
82adf6af
LP
195 " -Z --selinux-context=SECLABEL\n"
196 " Set the SELinux security context to be used by\n"
197 " processes in the container\n"
198 " -L --selinux-apifs-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " API/tmpfs file systems in the container\n"
a8828ed9
DW
201 " --capability=CAP In addition to the default, retain specified\n"
202 " capability\n"
203 " --drop-capability=CAP Drop the specified capability from the default set\n"
204 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
205 " -j Equivalent to --link-journal=host\n"
69c79d3c 206 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
207 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
208 " the container\n"
209 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 210 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 211 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 212 " --share-system Share system namespaces with host\n"
eb91eb18 213 " --register=BOOLEAN Register container as machine\n"
89f7c846 214 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4
LP
215 " the service unit nspawn is running in\n"
216 " --volatile[=MODE] Run the system in volatile mode\n",
88213476 217 program_invocation_short_name);
88213476
LP
218}
219
220static int parse_argv(int argc, char *argv[]) {
221
a41fe3a2 222 enum {
acbeb427
ZJS
223 ARG_VERSION = 0x100,
224 ARG_PRIVATE_NETWORK,
bc2f673e 225 ARG_UUID,
5076f0cc 226 ARG_READ_ONLY,
57fb9fb5 227 ARG_CAPABILITY,
420c7379 228 ARG_DROP_CAPABILITY,
17fe0523
LP
229 ARG_LINK_JOURNAL,
230 ARG_BIND,
f4889f65 231 ARG_BIND_RO,
06c17c39 232 ARG_TMPFS,
f4889f65 233 ARG_SETENV,
eb91eb18 234 ARG_SHARE_SYSTEM,
89f7c846 235 ARG_REGISTER,
aa28aefe 236 ARG_KEEP_UNIT,
69c79d3c 237 ARG_NETWORK_INTERFACE,
c74e630d 238 ARG_NETWORK_MACVLAN,
69c79d3c 239 ARG_NETWORK_VETH,
ab046dde 240 ARG_NETWORK_BRIDGE,
6afc95b7 241 ARG_PERSONALITY,
4d9f07b4 242 ARG_VOLATILE,
a41fe3a2
LP
243 };
244
88213476 245 static const struct option options[] = {
aa28aefe
LP
246 { "help", no_argument, NULL, 'h' },
247 { "version", no_argument, NULL, ARG_VERSION },
248 { "directory", required_argument, NULL, 'D' },
249 { "user", required_argument, NULL, 'u' },
250 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
251 { "boot", no_argument, NULL, 'b' },
252 { "uuid", required_argument, NULL, ARG_UUID },
253 { "read-only", no_argument, NULL, ARG_READ_ONLY },
254 { "capability", required_argument, NULL, ARG_CAPABILITY },
255 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
256 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
257 { "bind", required_argument, NULL, ARG_BIND },
258 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 259 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
260 { "machine", required_argument, NULL, 'M' },
261 { "slice", required_argument, NULL, 'S' },
262 { "setenv", required_argument, NULL, ARG_SETENV },
263 { "selinux-context", required_argument, NULL, 'Z' },
264 { "selinux-apifs-context", required_argument, NULL, 'L' },
265 { "quiet", no_argument, NULL, 'q' },
266 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
267 { "register", required_argument, NULL, ARG_REGISTER },
268 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
269 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 270 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
271 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
272 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 273 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 274 { "image", required_argument, NULL, 'i' },
4d9f07b4 275 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 276 {}
88213476
LP
277 };
278
9444b1f2 279 int c, r;
a42c8b54 280 uint64_t plus = 0, minus = 0;
88213476
LP
281
282 assert(argc >= 0);
283 assert(argv);
284
601185b4 285 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
88213476
LP
286
287 switch (c) {
288
289 case 'h':
601185b4
ZJS
290 help();
291 return 0;
88213476 292
acbeb427
ZJS
293 case ARG_VERSION:
294 puts(PACKAGE_STRING);
295 puts(SYSTEMD_FEATURES);
296 return 0;
297
88213476
LP
298 case 'D':
299 free(arg_directory);
3a74cea5
LP
300 arg_directory = canonicalize_file_name(optarg);
301 if (!arg_directory) {
898d5c91 302 log_error("Invalid root directory: %m");
88213476
LP
303 return -ENOMEM;
304 }
305
306 break;
307
1b9e5b12
LP
308 case 'i':
309 arg_image = optarg;
310 break;
311
687d0825
MV
312 case 'u':
313 free(arg_user);
7027ff61
LP
314 arg_user = strdup(optarg);
315 if (!arg_user)
316 return log_oom();
687d0825
MV
317
318 break;
319
ab046dde 320 case ARG_NETWORK_BRIDGE:
c74e630d 321 arg_network_bridge = optarg;
ab046dde
TG
322
323 /* fall through */
324
69c79d3c
LP
325 case ARG_NETWORK_VETH:
326 arg_network_veth = true;
327 arg_private_network = true;
328 break;
329
aa28aefe 330 case ARG_NETWORK_INTERFACE:
c74e630d
LP
331 if (strv_extend(&arg_network_interfaces, optarg) < 0)
332 return log_oom();
333
334 arg_private_network = true;
335 break;
336
337 case ARG_NETWORK_MACVLAN:
338 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
339 return log_oom();
340
341 /* fall through */
342
ff01d048
LP
343 case ARG_PRIVATE_NETWORK:
344 arg_private_network = true;
a41fe3a2
LP
345 break;
346
0f0dbc46
LP
347 case 'b':
348 arg_boot = true;
349 break;
350
144f0fc0 351 case ARG_UUID:
9444b1f2
LP
352 r = sd_id128_from_string(optarg, &arg_uuid);
353 if (r < 0) {
aa96c6cb 354 log_error("Invalid UUID: %s", optarg);
9444b1f2 355 return r;
aa96c6cb 356 }
9444b1f2 357 break;
aa96c6cb 358
9444b1f2 359 case 'S':
c74e630d 360 arg_slice = optarg;
144f0fc0
LP
361 break;
362
7027ff61 363 case 'M':
eb91eb18
LP
364 if (isempty(optarg)) {
365 free(arg_machine);
366 arg_machine = NULL;
367 } else {
7027ff61 368
eb91eb18
LP
369 if (!hostname_is_valid(optarg)) {
370 log_error("Invalid machine name: %s", optarg);
371 return -EINVAL;
372 }
7027ff61 373
eb91eb18
LP
374 free(arg_machine);
375 arg_machine = strdup(optarg);
376 if (!arg_machine)
377 return log_oom();
378
379 break;
380 }
7027ff61 381
82adf6af
LP
382 case 'Z':
383 arg_selinux_context = optarg;
a8828ed9
DW
384 break;
385
82adf6af
LP
386 case 'L':
387 arg_selinux_apifs_context = optarg;
a8828ed9
DW
388 break;
389
bc2f673e
LP
390 case ARG_READ_ONLY:
391 arg_read_only = true;
392 break;
393
420c7379
LP
394 case ARG_CAPABILITY:
395 case ARG_DROP_CAPABILITY: {
a2a5291b 396 const char *state, *word;
5076f0cc
LP
397 size_t length;
398
399 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 400 _cleanup_free_ char *t;
5076f0cc 401 cap_value_t cap;
5076f0cc
LP
402
403 t = strndup(word, length);
0d0f0c50
SL
404 if (!t)
405 return log_oom();
5076f0cc 406
39ed67d1
LP
407 if (streq(t, "all")) {
408 if (c == ARG_CAPABILITY)
a42c8b54 409 plus = (uint64_t) -1;
39ed67d1 410 else
a42c8b54 411 minus = (uint64_t) -1;
39ed67d1
LP
412 } else {
413 if (cap_from_name(t, &cap) < 0) {
414 log_error("Failed to parse capability %s.", t);
415 return -EINVAL;
416 }
417
418 if (c == ARG_CAPABILITY)
a42c8b54 419 plus |= 1ULL << (uint64_t) cap;
39ed67d1 420 else
a42c8b54 421 minus |= 1ULL << (uint64_t) cap;
5076f0cc 422 }
5076f0cc
LP
423 }
424
425 break;
426 }
427
57fb9fb5
LP
428 case 'j':
429 arg_link_journal = LINK_GUEST;
430 break;
431
432 case ARG_LINK_JOURNAL:
433 if (streq(optarg, "auto"))
434 arg_link_journal = LINK_AUTO;
435 else if (streq(optarg, "no"))
436 arg_link_journal = LINK_NO;
437 else if (streq(optarg, "guest"))
438 arg_link_journal = LINK_GUEST;
439 else if (streq(optarg, "host"))
440 arg_link_journal = LINK_HOST;
441 else {
442 log_error("Failed to parse link journal mode %s", optarg);
443 return -EINVAL;
444 }
445
446 break;
447
17fe0523
LP
448 case ARG_BIND:
449 case ARG_BIND_RO: {
450 _cleanup_free_ char *a = NULL, *b = NULL;
451 char *e;
452 char ***x;
17fe0523
LP
453
454 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456 e = strchr(optarg, ':');
457 if (e) {
458 a = strndup(optarg, e - optarg);
459 b = strdup(e + 1);
460 } else {
461 a = strdup(optarg);
462 b = strdup(optarg);
463 }
464
465 if (!a || !b)
466 return log_oom();
467
468 if (!path_is_absolute(a) || !path_is_absolute(b)) {
469 log_error("Invalid bind mount specification: %s", optarg);
470 return -EINVAL;
471 }
472
473 r = strv_extend(x, a);
474 if (r < 0)
b3451bed 475 return log_oom();
17fe0523
LP
476
477 r = strv_extend(x, b);
478 if (r < 0)
b3451bed 479 return log_oom();
17fe0523
LP
480
481 break;
482 }
483
06c17c39
LP
484 case ARG_TMPFS: {
485 _cleanup_free_ char *a = NULL, *b = NULL;
486 char *e;
487
488 e = strchr(optarg, ':');
489 if (e) {
490 a = strndup(optarg, e - optarg);
491 b = strdup(e + 1);
492 } else {
493 a = strdup(optarg);
494 b = strdup("mode=0755");
495 }
496
497 if (!a || !b)
498 return log_oom();
499
500 if (!path_is_absolute(a)) {
501 log_error("Invalid tmpfs specification: %s", optarg);
502 return -EINVAL;
503 }
504
505 r = strv_push(&arg_tmpfs, a);
506 if (r < 0)
507 return log_oom();
508
509 a = NULL;
510
511 r = strv_push(&arg_tmpfs, b);
512 if (r < 0)
513 return log_oom();
514
515 b = NULL;
516
517 break;
518 }
519
f4889f65
LP
520 case ARG_SETENV: {
521 char **n;
522
523 if (!env_assignment_is_valid(optarg)) {
524 log_error("Environment variable assignment '%s' is not valid.", optarg);
525 return -EINVAL;
526 }
527
528 n = strv_env_set(arg_setenv, optarg);
529 if (!n)
530 return log_oom();
531
532 strv_free(arg_setenv);
533 arg_setenv = n;
534 break;
535 }
536
284c0b91
LP
537 case 'q':
538 arg_quiet = true;
539 break;
540
8a96d94e
LP
541 case ARG_SHARE_SYSTEM:
542 arg_share_system = true;
543 break;
544
eb91eb18
LP
545 case ARG_REGISTER:
546 r = parse_boolean(optarg);
547 if (r < 0) {
548 log_error("Failed to parse --register= argument: %s", optarg);
549 return r;
550 }
551
552 arg_register = r;
553 break;
554
89f7c846
LP
555 case ARG_KEEP_UNIT:
556 arg_keep_unit = true;
557 break;
558
6afc95b7
LP
559 case ARG_PERSONALITY:
560
ac45f971 561 arg_personality = personality_from_string(optarg);
6afc95b7
LP
562 if (arg_personality == 0xffffffffLU) {
563 log_error("Unknown or unsupported personality '%s'.", optarg);
564 return -EINVAL;
565 }
566
567 break;
568
4d9f07b4
LP
569 case ARG_VOLATILE:
570
571 if (!optarg)
572 arg_volatile = VOLATILE_YES;
573 else {
574 r = parse_boolean(optarg);
575 if (r < 0) {
576 if (streq(optarg, "state"))
577 arg_volatile = VOLATILE_STATE;
578 else {
579 log_error("Failed to parse --volatile= argument: %s", optarg);
580 return r;
581 }
582 } else
583 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584 }
585
586 break;
587
88213476
LP
588 case '?':
589 return -EINVAL;
590
591 default:
eb9da376 592 assert_not_reached("Unhandled option");
88213476 593 }
88213476 594
eb91eb18
LP
595 if (arg_share_system)
596 arg_register = false;
597
598 if (arg_boot && arg_share_system) {
599 log_error("--boot and --share-system may not be combined.");
600 return -EINVAL;
601 }
602
89f7c846
LP
603 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604 log_error("--keep-unit may not be used when invoked from a user session.");
605 return -EINVAL;
606 }
607
1b9e5b12
LP
608 if (arg_directory && arg_image) {
609 log_error("--directory= and --image= may not be combined.");
610 return -EINVAL;
611 }
612
4d9f07b4
LP
613 if (arg_volatile != VOLATILE_NO && arg_read_only) {
614 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615 return -EINVAL;
616 }
617
a42c8b54
LP
618 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
88213476
LP
620 return 1;
621}
622
623static int mount_all(const char *dest) {
624
625 typedef struct MountPoint {
626 const char *what;
627 const char *where;
628 const char *type;
629 const char *options;
630 unsigned long flags;
3bd66c05 631 bool fatal;
88213476
LP
632 } MountPoint;
633
634 static const MountPoint mount_table[] = {
06c17c39
LP
635 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
636 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
637 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
638 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
639 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 640 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
641 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
642 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 643#ifdef HAVE_SELINUX
06c17c39
LP
644 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
645 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 646#endif
88213476
LP
647 };
648
649 unsigned k;
650 int r = 0;
651
652 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 653 _cleanup_free_ char *where = NULL;
d002827b 654#ifdef HAVE_SELINUX
a8828ed9 655 _cleanup_free_ char *options = NULL;
d002827b
LP
656#endif
657 const char *o;
88213476
LP
658 int t;
659
17fe0523
LP
660 where = strjoin(dest, "/", mount_table[k].where, NULL);
661 if (!where)
662 return log_oom();
88213476 663
e65aec12 664 t = path_is_mount_point(where, true);
68fb0892 665 if (t < 0) {
88213476 666 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
667
668 if (r == 0)
669 r = t;
670
671 continue;
672 }
673
9c1c7f71
LP
674 /* Skip this entry if it is not a remount. */
675 if (mount_table[k].what && t > 0)
014a9c77
LP
676 continue;
677
79d80fc1
TG
678 t = mkdir_p(where, 0755);
679 if (t < 0) {
680 if (mount_table[k].fatal) {
681 log_error("Failed to create directory %s: %s", where, strerror(-t));
682
683 if (r == 0)
684 r = t;
685 } else
686 log_warning("Failed to create directory %s: %s", where, strerror(-t));
687
688 continue;
689 }
88213476 690
a8828ed9 691#ifdef HAVE_SELINUX
82adf6af
LP
692 if (arg_selinux_apifs_context &&
693 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
694 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
695 if (!options)
696 return log_oom();
697
698 o = options;
699 } else
a8828ed9 700#endif
d002827b 701 o = mount_table[k].options;
a8828ed9 702
a8828ed9 703
88213476
LP
704 if (mount(mount_table[k].what,
705 where,
706 mount_table[k].type,
707 mount_table[k].flags,
79d80fc1 708 o) < 0) {
88213476 709
79d80fc1
TG
710 if (mount_table[k].fatal) {
711 log_error("mount(%s) failed: %m", where);
88213476 712
79d80fc1
TG
713 if (r == 0)
714 r = -errno;
715 } else
716 log_warning("mount(%s) failed: %m", where);
88213476 717 }
88213476
LP
718 }
719
e58a1277
LP
720 return r;
721}
f8440af5 722
d6797c92 723static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
724 char **x, **y;
725
726 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 727 _cleanup_free_ char *where = NULL;
d2421337 728 struct stat source_st, dest_st;
2ed4e5e0 729 int r;
d2421337
DR
730
731 if (stat(*x, &source_st) < 0) {
1b9e5b12 732 log_error("Failed to stat %s: %m", *x);
d2421337
DR
733 return -errno;
734 }
17fe0523 735
06c17c39
LP
736 where = strappend(dest, *y);
737 if (!where)
738 return log_oom();
739
2ed4e5e0
SL
740 r = stat(where, &dest_st);
741 if (r == 0) {
d2421337 742 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 743 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
744 return -EINVAL;
745 }
2ed4e5e0
SL
746 } else if (errno == ENOENT) {
747 r = mkdir_parents_label(where, 0755);
748 if (r < 0) {
749 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
750 return r;
d2421337 751 }
2ed4e5e0 752 } else {
590b6b91 753 log_error("Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
754 return -errno;
755 }
06c17c39 756
2ed4e5e0 757 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 758 * and char devices. */
79d80fc1
TG
759 if (S_ISDIR(source_st.st_mode)) {
760 r = mkdir_label(where, 0755);
761 if (r < 0) {
762 log_error("Failed to create mount point %s: %s", where, strerror(-r));
763
764 return r;
765 }
766 } else if (S_ISFIFO(source_st.st_mode)) {
767 r = mkfifo(where, 0644);
768 if (r < 0 && errno != EEXIST) {
769 log_error("Failed to create mount point %s: %m", where);
770
771 return -errno;
772 }
773 } else if (S_ISSOCK(source_st.st_mode)) {
774 r = mknod(where, 0644 | S_IFSOCK, 0);
775 if (r < 0 && errno != EEXIST) {
776 log_error("Failed to create mount point %s: %m", where);
777
778 return -errno;
779 }
780 } else if (S_ISREG(source_st.st_mode)) {
781 r = touch(where);
782 if (r < 0) {
783 log_error("Failed to create mount point %s: %s", where, strerror(-r));
784
785 return r;
786 }
787 } else {
2ed4e5e0
SL
788 log_error("Refusing to create mountpoint for file: %s", *x);
789 return -ENOTSUP;
d2421337 790 }
17fe0523
LP
791
792 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
793 log_error("mount(%s) failed: %m", where);
794 return -errno;
795 }
796
d6797c92
LP
797 if (ro) {
798 r = bind_remount_recursive(where, true);
799 if (r < 0) {
800 log_error("Read-Only bind mount failed: %s", strerror(-r));
801 return r;
802 }
17fe0523
LP
803 }
804 }
805
806 return 0;
807}
808
06c17c39
LP
809static int mount_tmpfs(const char *dest) {
810 char **i, **o;
811
812 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813 _cleanup_free_ char *where = NULL;
79d80fc1 814 int r;
06c17c39
LP
815
816 where = strappend(dest, *i);
817 if (!where)
818 return log_oom();
819
79d80fc1
TG
820 r = mkdir_label(where, 0755);
821 if (r < 0) {
822 log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
823
824 return r;
825 }
06c17c39
LP
826
827 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
828 log_error("tmpfs mount to %s failed: %m", where);
829 return -errno;
830 }
831 }
832
833 return 0;
834}
835
e58a1277 836static int setup_timezone(const char *dest) {
d4036145
LP
837 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
838 char *z, *y;
839 int r;
f8440af5 840
e58a1277
LP
841 assert(dest);
842
843 /* Fix the timezone, if possible */
d4036145
LP
844 r = readlink_malloc("/etc/localtime", &p);
845 if (r < 0) {
846 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
847 return 0;
848 }
849
850 z = path_startswith(p, "../usr/share/zoneinfo/");
851 if (!z)
852 z = path_startswith(p, "/usr/share/zoneinfo/");
853 if (!z) {
854 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
855 return 0;
856 }
857
04bc4a3f
LP
858 where = strappend(dest, "/etc/localtime");
859 if (!where)
0d0f0c50 860 return log_oom();
715ac17a 861
d4036145
LP
862 r = readlink_malloc(where, &q);
863 if (r >= 0) {
864 y = path_startswith(q, "../usr/share/zoneinfo/");
865 if (!y)
866 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 867
d4036145
LP
868 /* Already pointing to the right place? Then do nothing .. */
869 if (y && streq(y, z))
870 return 0;
871 }
872
873 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
874 if (!check)
0d0f0c50 875 return log_oom();
4d1c38b8 876
d4036145
LP
877 if (access(check, F_OK) < 0) {
878 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
879 return 0;
880 }
68fb0892 881
d4036145
LP
882 what = strappend("../usr/share/zoneinfo/", z);
883 if (!what)
884 return log_oom();
885
79d80fc1
TG
886 r = mkdir_parents(where, 0755);
887 if (r < 0) {
888 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
889
890 return 0;
891 }
892
893 r = unlink(where);
894 if (r < 0 && errno != ENOENT) {
895 log_error("Failed to remove existing timezone info %s in container: %m", where);
896
897 return 0;
898 }
4d9f07b4 899
d4036145
LP
900 if (symlink(what, where) < 0) {
901 log_error("Failed to correct timezone of container: %m");
902 return 0;
903 }
e58a1277
LP
904
905 return 0;
88213476
LP
906}
907
2547bb41 908static int setup_resolv_conf(const char *dest) {
c8b32e11 909 _cleanup_free_ char *where = NULL;
79d80fc1 910 int r;
2547bb41
LP
911
912 assert(dest);
913
914 if (arg_private_network)
915 return 0;
916
917 /* Fix resolv.conf, if possible */
04bc4a3f
LP
918 where = strappend(dest, "/etc/resolv.conf");
919 if (!where)
0d0f0c50 920 return log_oom();
2547bb41 921
77e63faf
LP
922 /* We don't really care for the results of this really. If it
923 * fails, it fails, but meh... */
79d80fc1
TG
924 r = mkdir_parents(where, 0755);
925 if (r < 0) {
926 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
927
928 return 0;
929 }
930
931 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
932 if (r < 0) {
933 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
934
935 return 0;
936 }
2547bb41
LP
937
938 return 0;
939}
940
4d9f07b4
LP
941static int setup_volatile_state(const char *directory) {
942 const char *p;
943 int r;
944
945 assert(directory);
946
947 if (arg_volatile != VOLATILE_STATE)
948 return 0;
949
950 /* --volatile=state means we simply overmount /var
951 with a tmpfs, and the rest read-only. */
952
953 r = bind_remount_recursive(directory, true);
954 if (r < 0) {
955 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
956 return r;
957 }
958
959 p = strappenda(directory, "/var");
79d80fc1
TG
960 r = mkdir(p, 0755);
961 if (r < 0 && errno != EEXIST) {
962 log_error("Failed to create %s: %m", directory);
963 return -errno;
964 }
4d9f07b4
LP
965
966 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
967 log_error("Failed to mount tmpfs to /var: %m");
968 return -errno;
969 }
970
971 return 0;
972}
973
974static int setup_volatile(const char *directory) {
975 bool tmpfs_mounted = false, bind_mounted = false;
976 char template[] = "/tmp/nspawn-volatile-XXXXXX";
977 const char *f, *t;
978 int r;
979
980 assert(directory);
981
982 if (arg_volatile != VOLATILE_YES)
983 return 0;
984
985 /* --volatile=yes means we mount a tmpfs to the root dir, and
986 the original /usr to use inside it, and that read-only. */
987
988 if (!mkdtemp(template)) {
989 log_error("Failed to create temporary directory: %m");
990 return -errno;
991 }
992
993 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
994 log_error("Failed to mount tmpfs for root directory: %m");
995 r = -errno;
996 goto fail;
997 }
998
999 tmpfs_mounted = true;
1000
1001 f = strappenda(directory, "/usr");
1002 t = strappenda(template, "/usr");
1003
79d80fc1
TG
1004 r = mkdir(t, 0755);
1005 if (r < 0 && errno != EEXIST) {
1006 log_error("Failed to create %s: %m", t);
1007 r = -errno;
1008 goto fail;
1009 }
1010
4d9f07b4
LP
1011 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1012 log_error("Failed to create /usr bind mount: %m");
1013 r = -errno;
1014 goto fail;
1015 }
1016
1017 bind_mounted = true;
1018
1019 r = bind_remount_recursive(t, true);
1020 if (r < 0) {
1021 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1022 goto fail;
1023 }
1024
1025 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1026 log_error("Failed to move root mount: %m");
1027 r = -errno;
1028 goto fail;
1029 }
1030
1031 rmdir(template);
1032
1033 return 0;
1034
1035fail:
1036 if (bind_mounted)
1037 umount(t);
1038 if (tmpfs_mounted)
1039 umount(template);
1040 rmdir(template);
1041 return r;
1042}
1043
9f24adc2
LP
1044static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1045
1046 snprintf(s, 37,
1047 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1048 SD_ID128_FORMAT_VAL(id));
1049
1050 return s;
1051}
1052
04bc4a3f 1053static int setup_boot_id(const char *dest) {
7fd1b19b 1054 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1055 sd_id128_t rnd = {};
04bc4a3f
LP
1056 char as_uuid[37];
1057 int r;
1058
1059 assert(dest);
1060
eb91eb18
LP
1061 if (arg_share_system)
1062 return 0;
1063
04bc4a3f
LP
1064 /* Generate a new randomized boot ID, so that each boot-up of
1065 * the container gets a new one */
1066
1067 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1068 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1069 if (!from || !to)
1070 return log_oom();
04bc4a3f
LP
1071
1072 r = sd_id128_randomize(&rnd);
1073 if (r < 0) {
1074 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 1075 return r;
04bc4a3f
LP
1076 }
1077
9f24adc2 1078 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1079
574d5f2d 1080 r = write_string_file(from, as_uuid);
04bc4a3f
LP
1081 if (r < 0) {
1082 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 1083 return r;
04bc4a3f
LP
1084 }
1085
1086 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1087 log_error("Failed to bind mount boot id: %m");
1088 r = -errno;
10d18763
ZJS
1089 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1090 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
1091
1092 unlink(from);
04bc4a3f
LP
1093 return r;
1094}
1095
e58a1277 1096static int copy_devnodes(const char *dest) {
88213476
LP
1097
1098 static const char devnodes[] =
1099 "null\0"
1100 "zero\0"
1101 "full\0"
1102 "random\0"
1103 "urandom\0"
85614d66
TG
1104 "tty\0"
1105 "net/tun\0";
88213476
LP
1106
1107 const char *d;
e58a1277 1108 int r = 0;
7fd1b19b 1109 _cleanup_umask_ mode_t u;
a258bf26
LP
1110
1111 assert(dest);
124640f1
LP
1112
1113 u = umask(0000);
88213476
LP
1114
1115 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1116 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1117 struct stat st;
88213476 1118
7f112f50
LP
1119 from = strappend("/dev/", d);
1120 to = strjoin(dest, "/dev/", d, NULL);
1121 if (!from || !to)
1122 return log_oom();
88213476
LP
1123
1124 if (stat(from, &st) < 0) {
1125
1126 if (errno != ENOENT) {
1127 log_error("Failed to stat %s: %m", from);
7f112f50 1128 return -errno;
88213476
LP
1129 }
1130
a258bf26 1131 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1132
ed8b7a3e 1133 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1134 return -EIO;
a258bf26 1135
85614d66
TG
1136 } else {
1137 r = mkdir_parents(to, 0775);
1138 if (r < 0) {
1139 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1140 return -r;
1141 }
a258bf26 1142
85614d66
TG
1143 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144 log_error("mknod(%s) failed: %m", dest);
1145 return -errno;
1146 }
88213476 1147 }
88213476
LP
1148 }
1149
e58a1277
LP
1150 return r;
1151}
88213476 1152
f2d88580
LP
1153static int setup_ptmx(const char *dest) {
1154 _cleanup_free_ char *p = NULL;
1155
1156 p = strappend(dest, "/dev/ptmx");
1157 if (!p)
1158 return log_oom();
1159
1160 if (symlink("pts/ptmx", p) < 0) {
1161 log_error("Failed to create /dev/ptmx symlink: %m");
1162 return -errno;
1163 }
1164
1165 return 0;
1166}
1167
e58a1277 1168static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1169 _cleanup_umask_ mode_t u;
1170 const char *to;
e58a1277 1171 struct stat st;
e58a1277 1172 int r;
e58a1277
LP
1173
1174 assert(dest);
1175 assert(console);
1176
1177 u = umask(0000);
1178
eb0f0863
LP
1179 if (stat("/dev/null", &st) < 0) {
1180 log_error("Failed to stat /dev/null: %m");
25ea79fe 1181 return -errno;
e58a1277 1182 }
88213476 1183
e58a1277
LP
1184 r = chmod_and_chown(console, 0600, 0, 0);
1185 if (r < 0) {
1186 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 1187 return r;
a258bf26 1188 }
88213476 1189
a258bf26
LP
1190 /* We need to bind mount the right tty to /dev/console since
1191 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1192 * to bind mount things on we create a device node first, and
1193 * use /dev/null for that since we the cgroups device policy
1194 * allows us to create that freely, while we cannot create
1195 * /dev/console. (Note that the major minor doesn't actually
1196 * matter here, since we mount it over anyway). */
a258bf26 1197
eb0f0863 1198 to = strappenda(dest, "/dev/console");
e58a1277
LP
1199 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1200 log_error("mknod() for /dev/console failed: %m");
25ea79fe 1201 return -errno;
e58a1277 1202 }
a258bf26
LP
1203
1204 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 1205 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 1206 return -errno;
a258bf26
LP
1207 }
1208
25ea79fe 1209 return 0;
e58a1277
LP
1210}
1211
1212static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1213 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1214 int r, fd, k;
7fd1b19b 1215 _cleanup_umask_ mode_t u;
e58a1277
LP
1216 union {
1217 struct cmsghdr cmsghdr;
1218 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1219 } control = {};
1220 struct msghdr mh = {
1221 .msg_control = &control,
1222 .msg_controllen = sizeof(control),
1223 };
e58a1277
LP
1224 struct cmsghdr *cmsg;
1225
1226 assert(dest);
1227 assert(kmsg_socket >= 0);
a258bf26 1228
e58a1277 1229 u = umask(0000);
a258bf26 1230
f1e5dfe2
LP
1231 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1232 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1233 * on the reading side behave very similar to /proc/kmsg,
1234 * their writing side behaves differently from /dev/kmsg in
1235 * that writing blocks when nothing is reading. In order to
1236 * avoid any problems with containers deadlocking due to this
1237 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1238 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1239 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1240 return log_oom();
e58a1277
LP
1241
1242 if (mkfifo(from, 0600) < 0) {
1243 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 1244 return -errno;
e58a1277
LP
1245 }
1246
1247 r = chmod_and_chown(from, 0600, 0, 0);
1248 if (r < 0) {
1249 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 1250 return r;
e58a1277
LP
1251 }
1252
1253 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1254 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 1255 return -errno;
e58a1277
LP
1256 }
1257
1258 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1259 if (fd < 0) {
1260 log_error("Failed to open fifo: %m");
25ea79fe 1261 return -errno;
e58a1277
LP
1262 }
1263
e58a1277
LP
1264 cmsg = CMSG_FIRSTHDR(&mh);
1265 cmsg->cmsg_level = SOL_SOCKET;
1266 cmsg->cmsg_type = SCM_RIGHTS;
1267 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1268 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1269
1270 mh.msg_controllen = cmsg->cmsg_len;
1271
1272 /* Store away the fd in the socket, so that it stays open as
1273 * long as we run the child */
1274 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1275 safe_close(fd);
e58a1277
LP
1276
1277 if (k < 0) {
1278 log_error("Failed to send FIFO fd: %m");
25ea79fe 1279 return -errno;
a258bf26
LP
1280 }
1281
f1e5dfe2
LP
1282 /* And now make the FIFO unavailable as /dev/kmsg... */
1283 unlink(from);
25ea79fe 1284 return 0;
88213476
LP
1285}
1286
3a74cea5 1287static int setup_hostname(void) {
3a74cea5 1288
eb91eb18
LP
1289 if (arg_share_system)
1290 return 0;
1291
7027ff61
LP
1292 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1293 return -errno;
3a74cea5 1294
7027ff61 1295 return 0;
3a74cea5
LP
1296}
1297
57fb9fb5 1298static int setup_journal(const char *directory) {
4d680aee 1299 sd_id128_t machine_id, this_id;
7fd1b19b 1300 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1301 char *id;
57fb9fb5
LP
1302 int r;
1303
57fb9fb5 1304 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1305 if (!p)
1306 return log_oom();
57fb9fb5
LP
1307
1308 r = read_one_line_file(p, &b);
27407a01
ZJS
1309 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1310 return 0;
1311 else if (r < 0) {
1312 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1313 return r;
1314 }
1315
27407a01
ZJS
1316 id = strstrip(b);
1317 if (isempty(id) && arg_link_journal == LINK_AUTO)
1318 return 0;
57fb9fb5 1319
27407a01
ZJS
1320 /* Verify validity */
1321 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1322 if (r < 0) {
27407a01
ZJS
1323 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1324 return r;
57fb9fb5
LP
1325 }
1326
4d680aee
ZJS
1327 r = sd_id128_get_machine(&this_id);
1328 if (r < 0) {
1329 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1330 return r;
1331 }
1332
1333 if (sd_id128_equal(machine_id, this_id)) {
1334 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335 "Host and machine ids are equal (%s): refusing to link journals", id);
1336 if (arg_link_journal == LINK_AUTO)
1337 return 0;
1338 return
1339 -EEXIST;
1340 }
1341
1342 if (arg_link_journal == LINK_NO)
1343 return 0;
1344
57fb9fb5 1345 free(p);
27407a01
ZJS
1346 p = strappend("/var/log/journal/", id);
1347 q = strjoin(directory, "/var/log/journal/", id, NULL);
1348 if (!p || !q)
1349 return log_oom();
1350
1351 if (path_is_mount_point(p, false) > 0) {
1352 if (arg_link_journal != LINK_AUTO) {
1353 log_error("%s: already a mount point, refusing to use for journal", p);
1354 return -EEXIST;
1355 }
1356
1357 return 0;
57fb9fb5
LP
1358 }
1359
27407a01 1360 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1361 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1362 log_error("%s: already a mount point, refusing to use for journal", q);
1363 return -EEXIST;
57fb9fb5
LP
1364 }
1365
27407a01 1366 return 0;
57fb9fb5
LP
1367 }
1368
1369 r = readlink_and_make_absolute(p, &d);
1370 if (r >= 0) {
1371 if ((arg_link_journal == LINK_GUEST ||
1372 arg_link_journal == LINK_AUTO) &&
1373 path_equal(d, q)) {
1374
27407a01
ZJS
1375 r = mkdir_p(q, 0755);
1376 if (r < 0)
79d80fc1 1377 log_warning("Failed to create directory %s: %m", q);
27407a01 1378 return 0;
57fb9fb5
LP
1379 }
1380
1381 if (unlink(p) < 0) {
1382 log_error("Failed to remove symlink %s: %m", p);
27407a01 1383 return -errno;
57fb9fb5
LP
1384 }
1385 } else if (r == -EINVAL) {
1386
1387 if (arg_link_journal == LINK_GUEST &&
1388 rmdir(p) < 0) {
1389
27407a01
ZJS
1390 if (errno == ENOTDIR) {
1391 log_error("%s already exists and is neither a symlink nor a directory", p);
1392 return r;
1393 } else {
57fb9fb5 1394 log_error("Failed to remove %s: %m", p);
27407a01 1395 return -errno;
57fb9fb5 1396 }
57fb9fb5
LP
1397 }
1398 } else if (r != -ENOENT) {
1399 log_error("readlink(%s) failed: %m", p);
27407a01 1400 return r;
57fb9fb5
LP
1401 }
1402
1403 if (arg_link_journal == LINK_GUEST) {
1404
1405 if (symlink(q, p) < 0) {
1406 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1407 return -errno;
57fb9fb5
LP
1408 }
1409
27407a01
ZJS
1410 r = mkdir_p(q, 0755);
1411 if (r < 0)
79d80fc1 1412 log_warning("Failed to create directory %s: %m", q);
27407a01 1413 return 0;
57fb9fb5
LP
1414 }
1415
1416 if (arg_link_journal == LINK_HOST) {
1417 r = mkdir_p(p, 0755);
1418 if (r < 0) {
1419 log_error("Failed to create %s: %m", p);
27407a01 1420 return r;
57fb9fb5
LP
1421 }
1422
27407a01
ZJS
1423 } else if (access(p, F_OK) < 0)
1424 return 0;
57fb9fb5 1425
cdb2b9d0
LP
1426 if (dir_is_empty(q) == 0)
1427 log_warning("%s is not empty, proceeding anyway.", q);
1428
57fb9fb5
LP
1429 r = mkdir_p(q, 0755);
1430 if (r < 0) {
1431 log_error("Failed to create %s: %m", q);
27407a01 1432 return r;
57fb9fb5
LP
1433 }
1434
1435 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1436 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1437 return -errno;
57fb9fb5
LP
1438 }
1439
27407a01 1440 return 0;
57fb9fb5
LP
1441}
1442
9bd37b40
LP
/* Bind mounts 'path' onto a newly created <dest>/dev/kdbus directory.
 * A NULL 'path' means there is nothing to set up. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1462
88213476 1463static int drop_capabilities(void) {
5076f0cc 1464 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1465}
1466
5aa4bb6b 1467static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1468 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1469 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1470 int r;
1471
eb91eb18
LP
1472 if (!arg_register)
1473 return 0;
1474
1c03020c 1475 r = sd_bus_default_system(&bus);
9444b1f2
LP
1476 if (r < 0) {
1477 log_error("Failed to open system bus: %s", strerror(-r));
1478 return r;
1479 }
1480
89f7c846
LP
1481 if (arg_keep_unit) {
1482 r = sd_bus_call_method(
1483 bus,
1484 "org.freedesktop.machine1",
1485 "/org/freedesktop/machine1",
1486 "org.freedesktop.machine1.Manager",
5aa4bb6b 1487 "RegisterMachineWithNetwork",
89f7c846
LP
1488 &error,
1489 NULL,
5aa4bb6b 1490 "sayssusai",
89f7c846
LP
1491 arg_machine,
1492 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1493 "nspawn",
1494 "container",
1495 (uint32_t) pid,
5aa4bb6b
LP
1496 strempty(arg_directory),
1497 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1498 } else {
9457ac5b
LP
1499 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1500
1501 r = sd_bus_message_new_method_call(
89f7c846 1502 bus,
9457ac5b 1503 &m,
89f7c846
LP
1504 "org.freedesktop.machine1",
1505 "/org/freedesktop/machine1",
1506 "org.freedesktop.machine1.Manager",
5aa4bb6b 1507 "CreateMachineWithNetwork");
9457ac5b
LP
1508 if (r < 0) {
1509 log_error("Failed to create message: %s", strerror(-r));
1510 return r;
1511 }
1512
1513 r = sd_bus_message_append(
1514 m,
5aa4bb6b 1515 "sayssusai",
89f7c846
LP
1516 arg_machine,
1517 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1518 "nspawn",
1519 "container",
1520 (uint32_t) pid,
5aa4bb6b
LP
1521 strempty(arg_directory),
1522 local_ifindex > 0 ? 1 : 0, local_ifindex);
9457ac5b
LP
1523 if (r < 0) {
1524 log_error("Failed to append message arguments: %s", strerror(-r));
1525 return r;
1526 }
1527
1528 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529 if (r < 0) {
1530 log_error("Failed to open container: %s", strerror(-r));
1531 return r;
1532 }
1533
1534 if (!isempty(arg_slice)) {
1535 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1536 if (r < 0) {
1537 log_error("Failed to append slice: %s", strerror(-r));
1538 return r;
1539 }
1540 }
1541
1542 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1543 if (r < 0) {
1544 log_error("Failed to add device policy: %s", strerror(-r));
1545 return r;
1546 }
1547
a07f961e 1548 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
9457ac5b
LP
1549 /* Allow the container to
1550 * access and create the API
1551 * device nodes, so that
1552 * PrivateDevices= in the
1553 * container can work
1554 * fine */
1555 "/dev/null", "rwm",
1556 "/dev/zero", "rwm",
1557 "/dev/full", "rwm",
1558 "/dev/random", "rwm",
1559 "/dev/urandom", "rwm",
1560 "/dev/tty", "rwm",
1561 /* Allow the container
1562 * access to ptys. However,
1563 * do not permit the
1564 * container to ever create
1565 * these device nodes. */
1566 "/dev/pts/ptmx", "rw",
a07f961e
LP
1567 "char-pts", "rw",
1568 /* Allow the container
1569 * access to all kdbus
1570 * devices. Again, the
1571 * container cannot create
1572 * these nodes, only use
1573 * them. We use a pretty
1574 * open match here, so that
1575 * the kernel API can still
1576 * change. */
1577 "char-kdbus", "rw",
1578 "char-kdbus/*", "rw");
9457ac5b
LP
1579 if (r < 0) {
1580 log_error("Failed to add device whitelist: %s", strerror(-r));
1581 return r;
1582 }
1583
1584 r = sd_bus_message_close_container(m);
1585 if (r < 0) {
1586 log_error("Failed to close container: %s", strerror(-r));
1587 return r;
1588 }
1589
1590 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1591 }
1592
9444b1f2 1593 if (r < 0) {
1f0cd86b
LP
1594 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1595 return r;
1596 }
1597
1598 return 0;
1599}
1600
1601static int terminate_machine(pid_t pid) {
1602 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1603 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1604 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1605 const char *path;
1606 int r;
1607
eb91eb18
LP
1608 if (!arg_register)
1609 return 0;
1610
76b54375 1611 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1612 if (r < 0) {
1613 log_error("Failed to open system bus: %s", strerror(-r));
1614 return r;
1615 }
1616
1617 r = sd_bus_call_method(
1618 bus,
1619 "org.freedesktop.machine1",
1620 "/org/freedesktop/machine1",
1621 "org.freedesktop.machine1.Manager",
1622 "GetMachineByPID",
1623 &error,
1624 &reply,
1625 "u",
1626 (uint32_t) pid);
1627 if (r < 0) {
1628 /* Note that the machine might already have been
1629 * cleaned up automatically, hence don't consider it a
1630 * failure if we cannot get the machine object. */
1631 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1632 return 0;
1633 }
1634
1635 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1636 if (r < 0)
1637 return bus_log_parse_error(r);
9444b1f2 1638
1f0cd86b
LP
1639 r = sd_bus_call_method(
1640 bus,
1641 "org.freedesktop.machine1",
1642 path,
1643 "org.freedesktop.machine1.Machine",
1644 "Terminate",
1645 &error,
1646 NULL,
1647 NULL);
1648 if (r < 0) {
1649 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1650 return 0;
1651 }
1652
9444b1f2
LP
1653 return 0;
1654}
1655
db999e0f
LP
1656static int reset_audit_loginuid(void) {
1657 _cleanup_free_ char *p = NULL;
1658 int r;
1659
1660 if (arg_share_system)
1661 return 0;
1662
1663 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1664 if (r == -ENOENT)
db999e0f
LP
1665 return 0;
1666 if (r < 0) {
1667 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1668 return r;
1669 }
1670
1671 /* Already reset? */
1672 if (streq(p, "4294967295"))
1673 return 0;
1674
1675 r = write_string_file("/proc/self/loginuid", "4294967295");
1676 if (r < 0) {
1677 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1678 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1679 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1680 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1681 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1682
db999e0f 1683 sleep(5);
77b6e194 1684 }
db999e0f
LP
1685
1686 return 0;
77b6e194
LP
1687}
1688
4f758c23
LP
/* Fixed 128-bit keys handed to generate_mac(): one for the host side and
 * one for the container side of the veth link. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
01dde061 1691
4f758c23 1692static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
01dde061
TG
1693 int r;
1694
1695 uint8_t result[8];
1696 size_t l, sz;
1697 uint8_t *v;
1698
1699 l = strlen(arg_machine);
1700 sz = sizeof(sd_id128_t) + l;
1701 v = alloca(sz);
1702
1703 /* fetch some persistent data unique to the host */
1704 r = sd_id128_get_machine((sd_id128_t*) v);
1705 if (r < 0)
1706 return r;
1707
1708 /* combine with some data unique (on this host) to this
1709 * container instance */
1710 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1711
1712 /* Let's hash the host machine ID plus the container name. We
1713 * use a fixed, but originally randomly created hash key here. */
4f758c23 1714 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
1715
1716 assert_cc(ETH_ALEN <= sizeof(result));
1717 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1718
1719 /* see eth_random_addr in the kernel */
1720 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1721 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1722
1723 return 0;
1724}
1725
5aa4bb6b 1726static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1727 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1728 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 1729 struct ether_addr mac_host, mac_container;
5aa4bb6b 1730 int r, i;
69c79d3c
LP
1731
1732 if (!arg_private_network)
1733 return 0;
1734
1735 if (!arg_network_veth)
1736 return 0;
1737
08af0da2
LP
1738 /* Use two different interface name prefixes depending whether
1739 * we are in bridge mode or not. */
c00524c9 1740 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 1741 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1742
4f758c23 1743 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
01dde061 1744 if (r < 0) {
4f758c23
LP
1745 log_error("Failed to generate predictable MAC address for container side");
1746 return r;
1747 }
1748
1749 r = generate_mac(&mac_host, HOST_HASH_KEY);
1750 if (r < 0) {
1751 log_error("Failed to generate predictable MAC address for host side");
01dde061
TG
1752 return r;
1753 }
1754
151b9b96 1755 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1756 if (r < 0) {
1757 log_error("Failed to connect to netlink: %s", strerror(-r));
1758 return r;
1759 }
1760
151b9b96 1761 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1762 if (r < 0) {
1763 log_error("Failed to allocate netlink message: %s", strerror(-r));
1764 return r;
1765 }
1766
ab046dde 1767 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1768 if (r < 0) {
ab046dde 1769 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1770 return r;
1771 }
1772
4f758c23
LP
1773 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1774 if (r < 0) {
1775 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1776 return r;
1777 }
1778
ee3a6a51 1779 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1780 if (r < 0) {
1781 log_error("Failed to open netlink container: %s", strerror(-r));
1782 return r;
1783 }
1784
d8e538ec 1785 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
69c79d3c
LP
1786 if (r < 0) {
1787 log_error("Failed to open netlink container: %s", strerror(-r));
1788 return r;
1789 }
1790
ee3a6a51 1791 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1792 if (r < 0) {
ab046dde 1793 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1794 return r;
1795 }
1796
ab046dde 1797 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1798 if (r < 0) {
ab046dde 1799 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1800 return r;
1801 }
01dde061 1802
4f758c23 1803 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
01dde061
TG
1804 if (r < 0) {
1805 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1806 return r;
1807 }
69c79d3c 1808
ab046dde 1809 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1810 if (r < 0) {
1811 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1812 return r;
1813 }
1814
1815 r = sd_rtnl_message_close_container(m);
1816 if (r < 0) {
1817 log_error("Failed to close netlink container: %s", strerror(-r));
1818 return r;
1819 }
1820
1821 r = sd_rtnl_message_close_container(m);
1822 if (r < 0) {
1823 log_error("Failed to close netlink container: %s", strerror(-r));
1824 return r;
1825 }
1826
1827 r = sd_rtnl_message_close_container(m);
1828 if (r < 0) {
1829 log_error("Failed to close netlink container: %s", strerror(-r));
1830 return r;
1831 }
1832
1833 r = sd_rtnl_call(rtnl, m, 0, NULL);
1834 if (r < 0) {
1835 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1836 return r;
1837 }
1838
5aa4bb6b
LP
1839 i = (int) if_nametoindex(iface_name);
1840 if (i <= 0) {
1841 log_error("Failed to resolve interface %s: %m", iface_name);
1842 return -errno;
1843 }
1844
1845 *ifi = i;
1846
69c79d3c
LP
1847 return 0;
1848}
1849
5aa4bb6b 1850static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1851 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1852 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1853 int r, bridge;
1854
1855 if (!arg_private_network)
1856 return 0;
1857
1858 if (!arg_network_veth)
1859 return 0;
1860
1861 if (!arg_network_bridge)
1862 return 0;
1863
1864 bridge = (int) if_nametoindex(arg_network_bridge);
1865 if (bridge <= 0) {
1866 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1867 return -errno;
1868 }
1869
5aa4bb6b
LP
1870 *ifi = bridge;
1871
151b9b96 1872 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1873 if (r < 0) {
1874 log_error("Failed to connect to netlink: %s", strerror(-r));
1875 return r;
1876 }
1877
151b9b96 1878 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1879 if (r < 0) {
1880 log_error("Failed to allocate netlink message: %s", strerror(-r));
1881 return r;
1882 }
1883
039dd4af
TG
1884 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1885 if (r < 0) {
1886 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1887 return r;
1888 }
1889
ab046dde
TG
1890 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1891 if (r < 0) {
1892 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1893 return r;
1894 }
1895
1896 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1897 if (r < 0) {
1898 log_error("Failed to add netlink master field: %s", strerror(-r));
1899 return r;
1900 }
1901
1902 r = sd_rtnl_call(rtnl, m, 0, NULL);
1903 if (r < 0) {
1904 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1905 return r;
1906 }
1907
1908 return 0;
1909}
1910
c74e630d
LP
1911static int parse_interface(struct udev *udev, const char *name) {
1912 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1913 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1914 int ifi;
1915
1916 ifi = (int) if_nametoindex(name);
1917 if (ifi <= 0) {
1918 log_error("Failed to resolve interface %s: %m", name);
1919 return -errno;
1920 }
1921
1922 sprintf(ifi_str, "n%i", ifi);
1923 d = udev_device_new_from_device_id(udev, ifi_str);
1924 if (!d) {
1925 log_error("Failed to get udev device for interface %s: %m", name);
1926 return -errno;
1927 }
1928
1929 if (udev_device_get_is_initialized(d) <= 0) {
1930 log_error("Network interface %s is not initialized yet.", name);
1931 return -EBUSY;
1932 }
1933
1934 return ifi;
1935}
1936
69c79d3c 1937static int move_network_interfaces(pid_t pid) {
7e227024 1938 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1939 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1940 char **i;
1941 int r;
1942
1943 if (!arg_private_network)
1944 return 0;
1945
1946 if (strv_isempty(arg_network_interfaces))
1947 return 0;
1948
151b9b96 1949 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1950 if (r < 0) {
1951 log_error("Failed to connect to netlink: %s", strerror(-r));
1952 return r;
1953 }
1954
7e227024
LP
1955 udev = udev_new();
1956 if (!udev) {
1957 log_error("Failed to connect to udev.");
1958 return -ENOMEM;
1959 }
1960
aa28aefe 1961 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1962 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1963 int ifi;
aa28aefe 1964
c74e630d
LP
1965 ifi = parse_interface(udev, *i);
1966 if (ifi < 0)
1967 return ifi;
1968
3125b3ef 1969 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
c74e630d
LP
1970 if (r < 0) {
1971 log_error("Failed to allocate netlink message: %s", strerror(-r));
1972 return r;
aa28aefe
LP
1973 }
1974
c74e630d
LP
1975 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1976 if (r < 0) {
1977 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1978 return r;
7e227024
LP
1979 }
1980
c74e630d
LP
1981 r = sd_rtnl_call(rtnl, m, 0, NULL);
1982 if (r < 0) {
1983 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1984 return r;
7e227024 1985 }
c74e630d 1986 }
7e227024 1987
c74e630d
LP
1988 return 0;
1989}
1990
1991static int setup_macvlan(pid_t pid) {
1992 _cleanup_udev_unref_ struct udev *udev = NULL;
1993 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1994 char **i;
1995 int r;
1996
1997 if (!arg_private_network)
1998 return 0;
1999
2000 if (strv_isempty(arg_network_macvlan))
2001 return 0;
2002
2003 r = sd_rtnl_open(&rtnl, 0);
2004 if (r < 0) {
2005 log_error("Failed to connect to netlink: %s", strerror(-r));
2006 return r;
2007 }
2008
2009 udev = udev_new();
2010 if (!udev) {
2011 log_error("Failed to connect to udev.");
2012 return -ENOMEM;
2013 }
2014
2015 STRV_FOREACH(i, arg_network_macvlan) {
2016 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2017 _cleanup_free_ char *n = NULL;
2018 int ifi;
2019
2020 ifi = parse_interface(udev, *i);
2021 if (ifi < 0)
2022 return ifi;
2023
2024 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
2025 if (r < 0) {
2026 log_error("Failed to allocate netlink message: %s", strerror(-r));
2027 return r;
2028 }
2029
c74e630d
LP
2030 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2031 if (r < 0) {
2032 log_error("Failed to add netlink interface index: %s", strerror(-r));
2033 return r;
2034 }
2035
2036 n = strappend("mv-", *i);
2037 if (!n)
2038 return log_oom();
2039
2040 strshorten(n, IFNAMSIZ-1);
2041
2042 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2043 if (r < 0) {
2044 log_error("Failed to add netlink interface name: %s", strerror(-r));
2045 return r;
2046 }
2047
aa28aefe
LP
2048 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2049 if (r < 0) {
c74e630d
LP
2050 log_error("Failed to add netlink namespace field: %s", strerror(-r));
2051 return r;
2052 }
2053
2054 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2055 if (r < 0) {
2056 log_error("Failed to open netlink container: %s", strerror(-r));
2057 return r;
2058 }
2059
d8e538ec 2060 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
c74e630d
LP
2061 if (r < 0) {
2062 log_error("Failed to open netlink container: %s", strerror(-r));
2063 return r;
2064 }
2065
2066 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2067 if (r < 0) {
2068 log_error("Failed to append macvlan mode: %s", strerror(-r));
2069 return r;
2070 }
2071
2072 r = sd_rtnl_message_close_container(m);
2073 if (r < 0) {
2074 log_error("Failed to close netlink container: %s", strerror(-r));
2075 return r;
2076 }
2077
2078 r = sd_rtnl_message_close_container(m);
2079 if (r < 0) {
2080 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
2081 return r;
2082 }
2083
2084 r = sd_rtnl_call(rtnl, m, 0, NULL);
2085 if (r < 0) {
c74e630d 2086 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
2087 return r;
2088 }
2089 }
2090
2091 return 0;
2092}
2093
/* Installs a seccomp filter that allows everything by default, but makes
 * the system calls in blacklist[] fail with EPERM and the creation of
 * NETLINK_AUDIT sockets fail with EAFNOSUPPORT. Compiles to a no-op
 * returning 0 when built without HAVE_SECCOMP. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);

                /* -EFAULT indicates an unknown syscall, which is simply
                 * skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside of one. We don't care, and
         * simply turn off creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2173
1b9e5b12
LP
2174static int setup_image(char **device_path, int *loop_nr) {
2175 struct loop_info64 info = {
2176 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2177 };
2178 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2179 _cleanup_free_ char* loopdev = NULL;
2180 struct stat st;
2181 int r, nr;
2182
2183 assert(device_path);
2184 assert(loop_nr);
2185
2186 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2187 if (fd < 0) {
2188 log_error("Failed to open %s: %m", arg_image);
2189 return -errno;
2190 }
2191
2192 if (fstat(fd, &st) < 0) {
2193 log_error("Failed to stat %s: %m", arg_image);
2194 return -errno;
2195 }
2196
2197 if (S_ISBLK(st.st_mode)) {
2198 char *p;
2199
2200 p = strdup(arg_image);
2201 if (!p)
2202 return log_oom();
2203
2204 *device_path = p;
2205
2206 *loop_nr = -1;
2207
2208 r = fd;
2209 fd = -1;
2210
2211 return r;
2212 }
2213
2214 if (!S_ISREG(st.st_mode)) {
2215 log_error("%s is not a regular file or block device: %m", arg_image);
2216 return -EINVAL;
2217 }
2218
2219 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2220 if (control < 0) {
2221 log_error("Failed to open /dev/loop-control: %m");
2222 return -errno;
2223 }
2224
2225 nr = ioctl(control, LOOP_CTL_GET_FREE);
2226 if (nr < 0) {
2227 log_error("Failed to allocate loop device: %m");
2228 return -errno;
2229 }
2230
2231 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2232 return log_oom();
2233
2234 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2235 if (loop < 0) {
2236 log_error("Failed to open loop device %s: %m", loopdev);
2237 return -errno;
2238 }
2239
2240 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2241 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2242 return -errno;
2243 }
2244
2245 if (arg_read_only)
2246 info.lo_flags |= LO_FLAGS_READ_ONLY;
2247
2248 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2249 log_error("Failed to set loopback settings on %s: %m", loopdev);
2250 return -errno;
2251 }
2252
2253 *device_path = loopdev;
2254 loopdev = NULL;
2255
2256 *loop_nr = nr;
2257
2258 r = loop;
2259 loop = -1;
2260
2261 return r;
2262}
2263
#ifdef HAVE_BLKID
/* Records node as the candidate for one partition role, unless a candidate
 * with a lower or equal partition number has been recorded already.
 * Returns 0 if the candidate was kept or replaced, -ENOMEM if copying the
 * node path failed. */
static int dissect_pick_partition(
                char **candidate, int *candidate_nr, bool *candidate_rw,
                const char *node, int nr, bool rw) {

        if (*candidate && nr >= *candidate_nr)
                return 0;

        *candidate_nr = nr;
        *candidate_rw = rw;

        free(*candidate);
        *candidate = strdup(node);
        if (!*candidate)
                return -ENOMEM;

        return 0;
}
#endif

/* Looks for the root, /home and /srv partitions on the block device fd.
 *
 * The device must carry a GPT partition table. Its partition devices are
 * enumerated through udev and matched against the partition list reported by
 * libblkid. Partitions flagged GPT_FLAG_NO_AUTO are ignored. For each of the
 * types GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the latter
 * two only if defined at build time) the partition with the lowest partition
 * number is selected.
 *
 * On success 0 is returned, *root_device is set to the device node of the
 * native root partition or, failing that, of the secondary one (*secondary
 * tells which), and *home_device / *srv_device are set if such partitions
 * exist. The matching *_rw flags are false when the partition carries
 * GPT_FLAG_READ_ONLY. The returned strings are owned by the caller.
 * On failure a negative errno-style value is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        _cleanup_free_ char *root = NULL, *secondary_root = NULL, *home = NULL, *srv = NULL;
        int root_nr = -1, secondary_root_nr = -1, home_nr = -1, srv_nr = -1;
        bool root_rw = true, secondary_root_rw = true, home_rw = true, srv_rw = true;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Without an errno the failure is treated as memory exhaustion. */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                unsigned long long flags;
                const char *stype, *node;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr, k;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole-disk device itself. */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        k = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        k = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        k = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        k = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        k = 0; /* partition type we do not care about */

                if (k < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* The native root partition is preferred over the secondary one.
         * Ownership of the strings moves to the caller, hence the NULL resets. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2513
/* Mounts the block device "what" on "where", or on "where" + "directory" if
 * directory is non-NULL.
 *
 * The file system type is detected with libblkid. LUKS containers are refused.
 * The mount is made with MS_NODEV, and additionally MS_RDONLY if rw is false
 * or the global arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides whatever the partition flags say. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the
         * result is ambivalent; both mean the type cannot be determined.
         * -1 is a genuine probing error and is handled below, in line with
         * how dissect_image() interprets the same call. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2579
727fd4fd
LP
/* Mounts the dissected image partitions below "where".
 *
 * The root device is mounted on "where" itself, the home device on
 * "where"/home and the srv device on "where"/srv. Devices passed as NULL are
 * skipped. The *_rw flags are forwarded to mount_device().
 *
 * Returns 0 on success, or the negative error of the first failing mount. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than reaching for the
         * global arg_directory, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2615
2616static void loop_remove(int nr, int *image_fd) {
2617 _cleanup_close_ int control = -1;
e8c8ddcc 2618 int r;
1b9e5b12
LP
2619
2620 if (nr < 0)
2621 return;
2622
2623 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2624 r = ioctl(*image_fd, LOOP_CLR_FD);
2625 if (r < 0)
2626 log_warning("Failed to close loop image: %m");
03e334a1 2627 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2628 }
2629
2630 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc
TG
2631 if (control < 0) {
2632 log_warning("Failed to open /dev/loop-control: %m");
1b9e5b12 2633 return;
e8c8ddcc 2634 }
1b9e5b12 2635
e8c8ddcc
TG
2636 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2637 if (r < 0)
2638 log_warning("Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2639}
2640
0cb9fbcd
LP
2641static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2642 int pipe_fds[2];
2643 pid_t pid;
2644
2645 assert(database);
2646 assert(key);
2647 assert(rpid);
2648
2649 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2650 log_error("Failed to allocate pipe: %m");
2651 return -errno;
2652 }
2653
2654 pid = fork();
2655 if (pid < 0) {
2656 log_error("Failed to fork getent child: %m");
2657 return -errno;
2658 } else if (pid == 0) {
2659 int nullfd;
2660 char *empty_env = NULL;
2661
2662 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2663 _exit(EXIT_FAILURE);
2664
2665 if (pipe_fds[0] > 2)
03e334a1 2666 safe_close(pipe_fds[0]);
0cb9fbcd 2667 if (pipe_fds[1] > 2)
03e334a1 2668 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2669
2670 nullfd = open("/dev/null", O_RDWR);
2671 if (nullfd < 0)
2672 _exit(EXIT_FAILURE);
2673
2674 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2675 _exit(EXIT_FAILURE);
2676
2677 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2678 _exit(EXIT_FAILURE);
2679
2680 if (nullfd > 2)
03e334a1 2681 safe_close(nullfd);
0cb9fbcd
LP
2682
2683 reset_all_signal_handlers();
2684 close_all_fds(NULL, 0);
2685
4de82926
MM
2686 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2687 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2688 _exit(EXIT_FAILURE);
2689 }
2690
03e334a1 2691 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2692
2693 *rpid = pid;
2694
2695 return pipe_fds[0];
2696}
2697
2698static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2699 char line[LINE_MAX], *x, *u, *g, *h;
2700 const char *word, *state;
0cb9fbcd
LP
2701 _cleanup_free_ uid_t *uids = NULL;
2702 _cleanup_free_ char *home = NULL;
2703 _cleanup_fclose_ FILE *f = NULL;
2704 _cleanup_close_ int fd = -1;
2705 unsigned n_uids = 0;
70f539ca 2706 size_t sz = 0, l;
0cb9fbcd
LP
2707 uid_t uid;
2708 gid_t gid;
2709 pid_t pid;
2710 int r;
2711
2712 assert(_home);
2713
2714 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2715 /* Reset everything fully to 0, just in case */
2716
2717 if (setgroups(0, NULL) < 0) {
2718 log_error("setgroups() failed: %m");
2719 return -errno;
2720 }
2721
2722 if (setresgid(0, 0, 0) < 0) {
2723 log_error("setregid() failed: %m");
2724 return -errno;
2725 }
2726
2727 if (setresuid(0, 0, 0) < 0) {
2728 log_error("setreuid() failed: %m");
2729 return -errno;
2730 }
2731
2732 *_home = NULL;
2733 return 0;
2734 }
2735
2736 /* First, get user credentials */
2737 fd = spawn_getent("passwd", arg_user, &pid);
2738 if (fd < 0)
2739 return fd;
2740
2741 f = fdopen(fd, "r");
2742 if (!f)
2743 return log_oom();
2744 fd = -1;
2745
2746 if (!fgets(line, sizeof(line), f)) {
2747
2748 if (!ferror(f)) {
2749 log_error("Failed to resolve user %s.", arg_user);
2750 return -ESRCH;
2751 }
2752
2753 log_error("Failed to read from getent: %m");
2754 return -errno;
2755 }
2756
2757 truncate_nl(line);
2758
2759 wait_for_terminate_and_warn("getent passwd", pid);
2760
2761 x = strchr(line, ':');
2762 if (!x) {
2763 log_error("/etc/passwd entry has invalid user field.");
2764 return -EIO;
2765 }
2766
2767 u = strchr(x+1, ':');
2768 if (!u) {
2769 log_error("/etc/passwd entry has invalid password field.");
2770 return -EIO;
2771 }
2772
2773 u++;
2774 g = strchr(u, ':');
2775 if (!g) {
2776 log_error("/etc/passwd entry has invalid UID field.");
2777 return -EIO;
2778 }
2779
2780 *g = 0;
2781 g++;
2782 x = strchr(g, ':');
2783 if (!x) {
2784 log_error("/etc/passwd entry has invalid GID field.");
2785 return -EIO;
2786 }
2787
2788 *x = 0;
2789 h = strchr(x+1, ':');
2790 if (!h) {
2791 log_error("/etc/passwd entry has invalid GECOS field.");
2792 return -EIO;
2793 }
2794
2795 h++;
2796 x = strchr(h, ':');
2797 if (!x) {
2798 log_error("/etc/passwd entry has invalid home directory field.");
2799 return -EIO;
2800 }
2801
2802 *x = 0;
2803
2804 r = parse_uid(u, &uid);
2805 if (r < 0) {
2806 log_error("Failed to parse UID of user.");
2807 return -EIO;
2808 }
2809
2810 r = parse_gid(g, &gid);
2811 if (r < 0) {
2812 log_error("Failed to parse GID of user.");
2813 return -EIO;
2814 }
2815
2816 home = strdup(h);
2817 if (!home)
2818 return log_oom();
2819
2820 /* Second, get group memberships */
2821 fd = spawn_getent("initgroups", arg_user, &pid);
2822 if (fd < 0)
2823 return fd;
2824
2825 fclose(f);
2826 f = fdopen(fd, "r");
2827 if (!f)
2828 return log_oom();
2829 fd = -1;
2830
2831 if (!fgets(line, sizeof(line), f)) {
2832 if (!ferror(f)) {
2833 log_error("Failed to resolve user %s.", arg_user);
2834 return -ESRCH;
2835 }
2836
2837 log_error("Failed to read from getent: %m");
2838 return -errno;
2839 }
2840
2841 truncate_nl(line);
2842
2843 wait_for_terminate_and_warn("getent initgroups", pid);
2844
2845 /* Skip over the username and subsequent separator whitespace */
2846 x = line;
2847 x += strcspn(x, WHITESPACE);
2848 x += strspn(x, WHITESPACE);
2849
a2a5291b 2850 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2851 char c[l+1];
2852
a2a5291b 2853 memcpy(c, word, l);
0cb9fbcd
LP
2854 c[l] = 0;
2855
2856 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2857 return log_oom();
2858
2859 r = parse_uid(c, &uids[n_uids++]);
2860 if (r < 0) {
2861 log_error("Failed to parse group data from getent.");
2862 return -EIO;
2863 }
2864 }
2865
2866 r = mkdir_parents(home, 0775);
2867 if (r < 0) {
2868 log_error("Failed to make home root directory: %s", strerror(-r));
2869 return r;
2870 }
2871
2872 r = mkdir_safe(home, 0755, uid, gid);
f418f31d 2873 if (r < 0 && r != -EEXIST) {
0cb9fbcd
LP
2874 log_error("Failed to make home directory: %s", strerror(-r));
2875 return r;
2876 }
2877
2878 fchown(STDIN_FILENO, uid, gid);
2879 fchown(STDOUT_FILENO, uid, gid);
2880 fchown(STDERR_FILENO, uid, gid);
2881
2882 if (setgroups(n_uids, uids) < 0) {
2883 log_error("Failed to set auxiliary groups: %m");
2884 return -errno;
2885 }
2886
2887 if (setresgid(gid, gid, gid) < 0) {
2888 log_error("setregid() failed: %m");
2889 return -errno;
2890 }
2891
2892 if (setresuid(uid, uid, uid) < 0) {
2893 log_error("setreuid() failed: %m");
2894 return -errno;
2895 }
2896
2897 if (_home) {
2898 *_home = home;
2899 home = NULL;
2900 }
2901
2902 return 0;
2903}
2904
113cea80 2905/*
6d416b9c
LS
2906 * Return values:
2907 * < 0 : wait_for_terminate() failed to get the state of the
2908 * container, the container was terminated by a signal, or
2909 * failed for an unknown reason. No change is made to the
2910 * container argument.
2911 * > 0 : The program executed in the container terminated with an
2912 * error. The exit code of the program executed in the
2913 * container is returned. No change is made to the container
2914 * argument.
2915 * 0 : The container is being rebooted, has been shut down or exited
2916 * successfully. The container argument has been set to either
2917 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2918 *
6d416b9c
LS
2919 * That is, success is indicated by a return value of zero, and an
2920 * error is indicated by a non-zero value.
113cea80
DH
2921 */
2922static int wait_for_container(pid_t pid, ContainerStatus *container) {
2923 int r;
2924 siginfo_t status;
2925
2926 r = wait_for_terminate(pid, &status);
8baaf7a3
LS
2927 if (r < 0) {
2928 log_warning("Failed to wait for container: %s", strerror(-r));
113cea80 2929 return r;
8baaf7a3 2930 }
113cea80
DH
2931
2932 switch (status.si_code) {
2933 case CLD_EXITED:
2934 r = status.si_status;
2935 if (r == 0) {
2936 if (!arg_quiet)
2937 log_debug("Container %s exited successfully.",
2938 arg_machine);
2939
2940 *container = CONTAINER_TERMINATED;
2941 } else {
2942 log_error("Container %s failed with error code %i.",
2943 arg_machine, status.si_status);
113cea80
DH
2944 }
2945 break;
2946
2947 case CLD_KILLED:
2948 if (status.si_status == SIGINT) {
2949 if (!arg_quiet)
2950 log_info("Container %s has been shut down.",
2951 arg_machine);
2952
2953 *container = CONTAINER_TERMINATED;
2954 r = 0;
2955 break;
2956 } else if (status.si_status == SIGHUP) {
2957 if (!arg_quiet)
2958 log_info("Container %s is being rebooted.",
2959 arg_machine);
2960
2961 *container = CONTAINER_REBOOTED;
2962 r = 0;
2963 break;
2964 }
2965 /* CLD_KILLED fallthrough */
2966
2967 case CLD_DUMPED:
2968 log_error("Container %s terminated by signal %s.",
2969 arg_machine, signal_to_string(status.si_status));
2970 r = -1;
2971 break;
2972
2973 default:
2974 log_error("Container %s failed due to unknown reason.",
2975 arg_machine);
2976 r = -1;
2977 break;
2978 }
2979
2980 return r;
2981}
2982
e866af3a
DH
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls instead of being
 * ignored. */
static void nop_handler(int sig) {
}
2984
88213476 2985int main(int argc, char *argv[]) {
69c79d3c 2986
1b9e5b12 2987 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2988 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
d96c1ecf 2989 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
3d94f76c 2990 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2991 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2992 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2993 const char *console = NULL;
1b9e5b12
LP
2994 char veth_name[IFNAMSIZ];
2995 bool secondary = false;
e866af3a 2996 sigset_t mask, mask_chld;
69c79d3c 2997 pid_t pid = 0;
88213476
LP
2998
2999 log_parse_environment();
3000 log_open();
3001
05947bef
LP
3002 k = parse_argv(argc, argv);
3003 if (k < 0)
88213476 3004 goto finish;
05947bef
LP
3005 else if (k == 0) {
3006 r = EXIT_SUCCESS;
3007 goto finish;
3008 }
88213476 3009
1b9e5b12
LP
3010 if (!arg_image) {
3011 if (arg_directory) {
3012 char *p;
88213476 3013
1b9e5b12
LP
3014 p = path_make_absolute_cwd(arg_directory);
3015 free(arg_directory);
3016 arg_directory = p;
3017 } else
3018 arg_directory = get_current_dir_name();
88213476 3019
1b9e5b12
LP
3020 if (!arg_directory) {
3021 log_error("Failed to determine path, please use -D.");
3022 goto finish;
3023 }
3024 path_kill_slashes(arg_directory);
88213476
LP
3025 }
3026
7027ff61 3027 if (!arg_machine) {
1b9e5b12 3028 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
3029 if (!arg_machine) {
3030 log_oom();
3031 goto finish;
3032 }
3033
e724b063 3034 hostname_cleanup(arg_machine, false);
7027ff61
LP
3035 if (isempty(arg_machine)) {
3036 log_error("Failed to determine machine name automatically, please use -M.");
3037 goto finish;
3038 }
3039 }
3040
88213476
LP
3041 if (geteuid() != 0) {
3042 log_error("Need to be root.");
3043 goto finish;
3044 }
3045
04d391da
LP
3046 if (sd_booted() <= 0) {
3047 log_error("Not running on a systemd system.");
3048 goto finish;
3049 }
3050
1b9e5b12
LP
3051 log_close();
3052 n_fd_passed = sd_listen_fds(false);
3053 if (n_fd_passed > 0) {
3054 k = fdset_new_listen_fds(&fds, false);
3055 if (k < 0) {
3056 log_error("Failed to collect file descriptors: %s", strerror(-k));
3057 goto finish;
3058 }
88213476 3059 }
1b9e5b12
LP
3060 fdset_close_others(fds);
3061 log_open();
88213476 3062
1b9e5b12
LP
3063 if (arg_directory) {
3064 if (path_equal(arg_directory, "/")) {
3065 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
3066 goto finish;
3067 }
1b9e5b12
LP
3068
3069 if (arg_boot) {
3070 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3071 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
3072 goto finish;
3073 }
3074 } else {
3075 const char *p;
3076
3077 p = strappenda(arg_directory,
3078 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3079 if (access(p, F_OK) < 0) {
3080 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3081 goto finish;
3082
3083 }
3084 }
6b9132a9 3085 } else {
1b9e5b12 3086 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3087
1b9e5b12
LP
3088 if (!mkdtemp(template)) {
3089 log_error("Failed to create temporary directory: %m");
3090 r = -errno;
6b9132a9 3091 goto finish;
1b9e5b12 3092 }
6b9132a9 3093
1b9e5b12
LP
3094 arg_directory = strdup(template);
3095 if (!arg_directory) {
3096 r = log_oom();
3097 goto finish;
6b9132a9 3098 }
88213476 3099
1b9e5b12
LP
3100 image_fd = setup_image(&device_path, &loop_nr);
3101 if (image_fd < 0) {
3102 r = image_fd;
842f3b0f
LP
3103 goto finish;
3104 }
1b9e5b12 3105
4d9f07b4
LP
3106 r = dissect_image(image_fd,
3107 &root_device, &root_device_rw,
3108 &home_device, &home_device_rw,
3109 &srv_device, &srv_device_rw,
3110 &secondary);
1b9e5b12
LP
3111 if (r < 0)
3112 goto finish;
842f3b0f 3113 }
842f3b0f 3114
db7feb7e
LP
3115 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3116 if (master < 0) {
a258bf26
LP
3117 log_error("Failed to acquire pseudo tty: %m");
3118 goto finish;
3119 }
3120
db7feb7e
LP
3121 console = ptsname(master);
3122 if (!console) {
a258bf26
LP
3123 log_error("Failed to determine tty name: %m");
3124 goto finish;
3125 }
3126
284c0b91 3127 if (!arg_quiet)
45f1386c
ZJS
3128 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3129 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
3130
3131 if (unlockpt(master) < 0) {
3132 log_error("Failed to unlock tty: %m");
3133 goto finish;
3134 }
3135
eb91eb18
LP
3136 if (access("/dev/kdbus/control", F_OK) >= 0) {
3137
3138 if (arg_share_system) {
3139 kdbus_domain = strdup("/dev/kdbus");
3140 if (!kdbus_domain) {
3141 log_oom();
3142 goto finish;
3143 }
3144 } else {
3145 const char *ns;
3146
3147 ns = strappenda("machine-", arg_machine);
3148 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3149 if (r < 0)
3150 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3151 else
3152 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3153 }
3154 }
9bd37b40 3155
e58a1277 3156 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
3157 log_error("Failed to create kmsg socket pair: %m");
3158 goto finish;
3159 }
3160
af4ec430
LP
3161 sd_notify(false,
3162 "READY=1\n"
3163 "STATUS=Container running.");
05947bef 3164
a258bf26 3165 assert_se(sigemptyset(&mask) == 0);
e866af3a
DH
3166 assert_se(sigemptyset(&mask_chld) == 0);
3167 sigaddset(&mask_chld, SIGCHLD);
a258bf26
LP
3168 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3169 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3170
d87be9b0 3171 for (;;) {
113cea80 3172 ContainerStatus container_status;
7566e267 3173 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3174 struct sigaction sa = {
3175 .sa_handler = nop_handler,
3176 .sa_flags = SA_NOCLDSTOP,
3177 };
3178
7566e267 3179 r = barrier_create(&barrier);
a2da110b
DH
3180 if (r < 0) {
3181 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3182 goto finish;
3183 }
3184
e866af3a
DH
3185 /* Child can be killed before execv(), so handle SIGCHLD
3186 * in order to interrupt parent's blocking calls and
3187 * give it a chance to call wait() and terminate. */
3188 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3189 if (r < 0) {
3190 log_error("Failed to change the signal mask: %m");
d96c1ecf
LP
3191 goto finish;
3192 }
3193
e866af3a
DH
3194 r = sigaction(SIGCHLD, &sa, NULL);
3195 if (r < 0) {
3196 log_error("Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3197 goto finish;
3198 }
3199
a2da110b
DH
3200 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3201 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3202 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3203 if (pid < 0) {
3204 if (errno == EINVAL)
3205 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3206 else
3207 log_error("clone() failed: %m");
a258bf26 3208
e866af3a 3209 r = pid;
d87be9b0
LP
3210 goto finish;
3211 }
a258bf26 3212
d87be9b0
LP
3213 if (pid == 0) {
3214 /* child */
0cb9fbcd 3215 _cleanup_free_ char *home = NULL;
5674767e 3216 unsigned n_env = 2;
d87be9b0 3217 const char *envp[] = {
e10a55fd 3218 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3219 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3220 NULL, /* TERM */
3221 NULL, /* HOME */
3222 NULL, /* USER */
3223 NULL, /* LOGNAME */
3224 NULL, /* container_uuid */
842f3b0f
LP
3225 NULL, /* LISTEN_FDS */
3226 NULL, /* LISTEN_PID */
d87be9b0
LP
3227 NULL
3228 };
f4889f65 3229 char **env_use;
a258bf26 3230
a2da110b
DH
3231 barrier_set_role(&barrier, BARRIER_CHILD);
3232
5674767e
ZJS
3233 envp[n_env] = strv_find_prefix(environ, "TERM=");
3234 if (envp[n_env])
3235 n_env ++;
a258bf26 3236
03e334a1 3237 master = safe_close(master);
a258bf26 3238
d87be9b0
LP
3239 close_nointr(STDIN_FILENO);
3240 close_nointr(STDOUT_FILENO);
3241 close_nointr(STDERR_FILENO);
db7feb7e 3242
03e334a1 3243 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3244
d87be9b0 3245 reset_all_signal_handlers();
1b6d7fa7 3246 reset_signal_mask();
f5c1b9ee 3247
842f3b0f
LP
3248 k = open_terminal(console, O_RDWR);
3249 if (k != STDIN_FILENO) {
3250 if (k >= 0) {
03e334a1 3251 safe_close(k);
842f3b0f
LP
3252 k = -EINVAL;
3253 }
3254
3255 log_error("Failed to open console: %s", strerror(-k));
a2da110b 3256 _exit(EXIT_FAILURE);
842f3b0f
LP
3257 }
3258
3259 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3260 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3261 log_error("Failed to duplicate console: %m");
a2da110b 3262 _exit(EXIT_FAILURE);
842f3b0f 3263 }
bc2f673e 3264
d87be9b0
LP
3265 if (setsid() < 0) {
3266 log_error("setsid() failed: %m");
a2da110b 3267 _exit(EXIT_FAILURE);
bc2f673e
LP
3268 }
3269
db999e0f 3270 if (reset_audit_loginuid() < 0)
a2da110b 3271 _exit(EXIT_FAILURE);
db999e0f 3272
d87be9b0
LP
3273 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3274 log_error("PR_SET_PDEATHSIG failed: %m");
a2da110b 3275 _exit(EXIT_FAILURE);
d87be9b0 3276 }
e58a1277 3277
d87be9b0
LP
3278 /* Mark everything as slave, so that we still
3279 * receive mounts from the real root, but don't
3280 * propagate mounts to the real root. */
3281 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3282 log_error("MS_SLAVE|MS_REC failed: %m");
a2da110b 3283 _exit(EXIT_FAILURE);
d87be9b0 3284 }
04bc4a3f 3285
727fd4fd
LP
3286 if (mount_devices(arg_directory,
3287 root_device, root_device_rw,
3288 home_device, home_device_rw,
3289 srv_device, srv_device_rw) < 0)
a2da110b 3290 _exit(EXIT_FAILURE);
1b9e5b12 3291
d87be9b0
LP
3292 /* Turn directory into bind mount */
3293 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
d6797c92 3294 log_error("Failed to make bind mount: %m");
a2da110b 3295 _exit(EXIT_FAILURE);
d87be9b0 3296 }
88213476 3297
4d9f07b4
LP
3298 r = setup_volatile(arg_directory);
3299 if (r < 0)
a2da110b 3300 _exit(EXIT_FAILURE);
4d9f07b4
LP
3301
3302 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3303 _exit(EXIT_FAILURE);
4d9f07b4
LP
3304
3305 r = base_filesystem_create(arg_directory);
3306 if (r < 0)
a2da110b 3307 _exit(EXIT_FAILURE);
4d9f07b4 3308
d6797c92
LP
3309 if (arg_read_only) {
3310 k = bind_remount_recursive(arg_directory, true);
3311 if (k < 0) {
3312 log_error("Failed to make tree read-only: %s", strerror(-k));
a2da110b 3313 _exit(EXIT_FAILURE);
d87be9b0 3314 }
d6797c92 3315 }
2547bb41 3316
d87be9b0 3317 if (mount_all(arg_directory) < 0)
a2da110b 3318 _exit(EXIT_FAILURE);
57fb9fb5 3319
d87be9b0 3320 if (copy_devnodes(arg_directory) < 0)
a2da110b 3321 _exit(EXIT_FAILURE);
a258bf26 3322
f2d88580 3323 if (setup_ptmx(arg_directory) < 0)
a2da110b 3324 _exit(EXIT_FAILURE);
f2d88580 3325
d87be9b0 3326 dev_setup(arg_directory);
88213476 3327
28650077 3328 if (setup_seccomp() < 0)
a2da110b 3329 _exit(EXIT_FAILURE);
24fb1112 3330
d87be9b0 3331 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3332 _exit(EXIT_FAILURE);
88213476 3333
d87be9b0 3334 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3335 _exit(EXIT_FAILURE);
88213476 3336
03e334a1 3337 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3338
d87be9b0 3339 if (setup_boot_id(arg_directory) < 0)
a2da110b 3340 _exit(EXIT_FAILURE);
a41fe3a2 3341
d87be9b0 3342 if (setup_timezone(arg_directory) < 0)
a2da110b 3343 _exit(EXIT_FAILURE);
88213476 3344
d87be9b0 3345 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3346 _exit(EXIT_FAILURE);
687d0825 3347
d87be9b0 3348 if (setup_journal(arg_directory) < 0)
a2da110b 3349 _exit(EXIT_FAILURE);
687d0825 3350
d6797c92 3351 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3352 _exit(EXIT_FAILURE);
17fe0523 3353
d6797c92 3354 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3355 _exit(EXIT_FAILURE);
17fe0523 3356
06c17c39 3357 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3358 _exit(EXIT_FAILURE);
06c17c39 3359
486e99a3 3360 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
a2da110b 3361 _exit(EXIT_FAILURE);
9bd37b40 3362
d96c1ecf
LP
3363 /* Tell the parent that we are ready, and that
3364 * it can cgroupify us so that we lack access
3365 * to certain devices and resources. */
a2da110b 3366 barrier_place(&barrier);
d96c1ecf 3367
d87be9b0
LP
3368 if (chdir(arg_directory) < 0) {
3369 log_error("chdir(%s) failed: %m", arg_directory);
a2da110b 3370 _exit(EXIT_FAILURE);
687d0825
MV
3371 }
3372
d87be9b0
LP
3373 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3374 log_error("mount(MS_MOVE) failed: %m");
a2da110b 3375 _exit(EXIT_FAILURE);
687d0825
MV
3376 }
3377
d87be9b0
LP
3378 if (chroot(".") < 0) {
3379 log_error("chroot() failed: %m");
a2da110b 3380 _exit(EXIT_FAILURE);
687d0825
MV
3381 }
3382
d87be9b0
LP
3383 if (chdir("/") < 0) {
3384 log_error("chdir() failed: %m");
a2da110b 3385 _exit(EXIT_FAILURE);
687d0825
MV
3386 }
3387
d87be9b0
LP
3388 umask(0022);
3389
eb91eb18
LP
3390 if (arg_private_network)
3391 loopback_setup();
d87be9b0
LP
3392
3393 if (drop_capabilities() < 0) {
3394 log_error("drop_capabilities() failed: %m");
a2da110b 3395 _exit(EXIT_FAILURE);
687d0825 3396 }
687d0825 3397
0cb9fbcd
LP
3398 r = change_uid_gid(&home);
3399 if (r < 0)
a2da110b 3400 _exit(EXIT_FAILURE);
d87be9b0 3401
842f3b0f
LP
3402 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3403 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3404 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3405 log_oom();
a2da110b 3406 _exit(EXIT_FAILURE);
144f0fc0 3407 }
687d0825 3408
9444b1f2 3409 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3410 char as_uuid[37];
3411
3412 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3413 log_oom();
a2da110b 3414 _exit(EXIT_FAILURE);
842f3b0f
LP
3415 }
3416 }
3417
3418 if (fdset_size(fds) > 0) {
3419 k = fdset_cloexec(fds, false);
3420 if (k < 0) {
3421 log_error("Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3422 _exit(EXIT_FAILURE);
842f3b0f
LP
3423 }
3424
3425 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3426 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3427 log_oom();
a2da110b 3428 _exit(EXIT_FAILURE);
d87be9b0
LP
3429 }
3430 }
3431
3432 setup_hostname();
3433
6afc95b7
LP
3434 if (arg_personality != 0xffffffffLU) {
3435 if (personality(arg_personality) < 0) {
3436 log_error("personality() failed: %m");
a2da110b 3437 _exit(EXIT_FAILURE);
6afc95b7 3438 }
1b9e5b12
LP
3439 } else if (secondary) {
3440 if (personality(PER_LINUX32) < 0) {
3441 log_error("personality() failed: %m");
a2da110b 3442 _exit(EXIT_FAILURE);
1b9e5b12 3443 }
6afc95b7
LP
3444 }
3445
d96c1ecf
LP
3446#ifdef HAVE_SELINUX
3447 if (arg_selinux_context)
0cb9fbcd 3448 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 3449 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3450 _exit(EXIT_FAILURE);
0cb9fbcd 3451 }
d96c1ecf 3452#endif
354bfd2b 3453
f4889f65
LP
3454 if (!strv_isempty(arg_setenv)) {
3455 char **n;
3456
3457 n = strv_env_merge(2, envp, arg_setenv);
3458 if (!n) {
3459 log_oom();
a2da110b 3460 _exit(EXIT_FAILURE);
f4889f65
LP
3461 }
3462
3463 env_use = n;
3464 } else
3465 env_use = (char**) envp;
3466
d96c1ecf 3467 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3468 if (!barrier_place_and_sync(&barrier))
3469 _exit(EXIT_FAILURE);
d96c1ecf 3470
d87be9b0
LP
3471 if (arg_boot) {
3472 char **a;
3473 size_t l;
88213476 3474
d87be9b0 3475 /* Automatically search for the init system */
0f0dbc46 3476
d87be9b0
LP
3477 l = 1 + argc - optind;
3478 a = newa(char*, l + 1);
3479 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3480
d87be9b0 3481 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3482 execve(a[0], a, env_use);
0f0dbc46 3483
d87be9b0 3484 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3485 execve(a[0], a, env_use);
0f0dbc46 3486
d87be9b0 3487 a[0] = (char*) "/sbin/init";
f4889f65 3488 execve(a[0], a, env_use);
d87be9b0 3489 } else if (argc > optind)
f4889f65 3490 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3491 else {
3492 chdir(home ? home : "/root");
f4889f65 3493 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3494 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3495 }
3496
3497 log_error("execv() failed: %m");
d87be9b0 3498 _exit(EXIT_FAILURE);
da5b3bad 3499 }
88213476 3500
a2da110b 3501 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3502 fdset_free(fds);
3503 fds = NULL;
3504
a2da110b
DH
3505 /* wait for child-setup to be done */
3506 if (barrier_place_and_sync(&barrier)) {
5aa4bb6b 3507 int ifi = 0;
354bfd2b 3508
840295fc
LP
3509 r = move_network_interfaces(pid);
3510 if (r < 0)
3511 goto finish;
aa28aefe 3512
5aa4bb6b 3513 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3514 if (r < 0)
3515 goto finish;
ab046dde 3516
5aa4bb6b 3517 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3518 if (r < 0)
3519 goto finish;
ab046dde 3520
840295fc
LP
3521 r = setup_macvlan(pid);
3522 if (r < 0)
3523 goto finish;
c74e630d 3524
5aa4bb6b
LP
3525 r = register_machine(pid, ifi);
3526 if (r < 0)
3527 goto finish;
3528
840295fc
LP
3529 /* Block SIGCHLD here, before notifying child.
3530 * process_pty() will handle it with the other signals. */
3531 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3532 if (r < 0)
3533 goto finish;
e866af3a 3534
840295fc
LP
3535 /* Reset signal to default */
3536 r = default_signals(SIGCHLD, -1);
3537 if (r < 0)
3538 goto finish;
e866af3a 3539
840295fc
LP
3540 /* Notify the child that the parent is ready with all
3541 * its setup, and that the child can now hand over
3542 * control to the code to run inside the container. */
a2da110b 3543 barrier_place(&barrier);
354bfd2b 3544
840295fc
LP
3545 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3546 if (k < 0) {
3547 r = EXIT_FAILURE;
3548 break;
3549 }
88213476 3550
840295fc
LP
3551 if (!arg_quiet)
3552 putc('\n', stdout);
04d39279 3553
840295fc
LP
3554 /* Kill if it is not dead yet anyway */
3555 terminate_machine(pid);
3556 }
1f0cd86b 3557
840295fc 3558 /* Normally redundant, but better safe than sorry */
04d39279 3559 kill(pid, SIGKILL);
a258bf26 3560
113cea80 3561 r = wait_for_container(pid, &container_status);
04d39279
LP
3562 pid = 0;
3563
ce9f1527
LP
3564 if (r < 0) {
3565 /* We failed to wait for the container, or the
3566 * container exited abnormally */
3567 r = EXIT_FAILURE;
d87be9b0 3568 break;
ce9f1527
LP
3569 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3570 /* The container exited with a non-zero
3571 * status, or with zero status and no reboot
3572 * was requested. */
d87be9b0 3573 break;
88213476 3574
113cea80 3575 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3576
3577 if (arg_keep_unit) {
3578 /* Special handling if we are running as a
3579 * service: instead of simply restarting the
3580 * machine we want to restart the entire
3581 * service, so let's inform systemd about this
3582 * with the special exit code 133. The service
3583 * file uses RestartForceExitStatus=133 so
3584 * that this results in a full nspawn
3585 * restart. This is necessary since we might
3586 * have cgroup parameters set we want to have
3587 * flushed out. */
3588 r = 133;
3589 break;
3590 }
d87be9b0 3591 }
88213476
LP
3592
3593finish:
af4ec430
LP
3594 sd_notify(false,
3595 "STOPPING=1\n"
3596 "STATUS=Terminating...");
3597
1b9e5b12
LP
3598 loop_remove(loop_nr, &image_fd);
3599
9444b1f2
LP
3600 if (pid > 0)
3601 kill(pid, SIGKILL);
88213476 3602
04d391da 3603 free(arg_directory);
7027ff61 3604 free(arg_machine);
c74e630d
LP
3605 free(arg_user);
3606 strv_free(arg_setenv);
3607 strv_free(arg_network_interfaces);
3608 strv_free(arg_network_macvlan);
3609 strv_free(arg_bind);
3610 strv_free(arg_bind_ro);
06c17c39 3611 strv_free(arg_tmpfs);
88213476
LP
3612
3613 return r;
3614}