]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
machined: fix address API signatures
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <net/if.h>
69c79d3c 44#include <linux/veth.h>
6afc95b7 45#include <sys/personality.h>
1b9e5b12 46#include <linux/loop.h>
aa28aefe 47
5d63309c 48#ifdef HAVE_SELINUX
a8828ed9
DW
49#include <selinux/selinux.h>
50#endif
88213476 51
24fb1112
LP
52#ifdef HAVE_SECCOMP
53#include <seccomp.h>
54#endif
55
1b9e5b12
LP
56#ifdef HAVE_BLKID
57#include <blkid/blkid.h>
58#endif
59
1f0cd86b
LP
60#include "sd-daemon.h"
61#include "sd-bus.h"
62#include "sd-id128.h"
aa28aefe 63#include "sd-rtnl.h"
88213476
LP
64#include "log.h"
65#include "util.h"
49e942b2 66#include "mkdir.h"
6b2d0e85 67#include "macro.h"
d7832d2c 68#include "audit.h"
94d82985 69#include "missing.h"
04d391da 70#include "cgroup-util.h"
a258bf26 71#include "strv.h"
9eb977db 72#include "path-util.h"
a41fe3a2 73#include "loopback-setup.h"
4fc9982c 74#include "dev-setup.h"
842f3b0f 75#include "fdset.h"
acbeb427 76#include "build.h"
a5c32cff 77#include "fileio.h"
40ca29a1 78#include "bus-util.h"
1f0cd86b 79#include "bus-error.h"
4ba93280 80#include "ptyfwd.h"
9bd37b40 81#include "bus-kernel.h"
f4889f65 82#include "env-util.h"
7f112f50 83#include "def.h"
aa28aefe 84#include "rtnl-util.h"
7e227024 85#include "udev-util.h"
1b9e5b12
LP
86#include "blkid-util.h"
87#include "gpt.h"
01dde061 88#include "siphash24.h"
849958d1 89#include "copy.h"
3577de7a 90#include "base-filesystem.h"
a2da110b 91#include "barrier.h"
f2d88580 92
e9642be2
LP
93#ifdef HAVE_SECCOMP
94#include "seccomp-util.h"
95#endif
96
113cea80
DH
/* The two ways a container run can end that this file distinguishes. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
101
57fb9fb5
LP
/* Modes selectable with --link-journal= ("no", "auto", "host", "guest"). */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476 108
4d9f07b4
LP
/* Modes selectable with --volatile[=MODE]: a boolean, or "state". */
typedef enum Volatile {
        VOLATILE_NO = 0,
        VOLATILE_YES = 1,
        VOLATILE_STATE = 2,
} Volatile;
114
88213476 115static char *arg_directory = NULL;
687d0825 116static char *arg_user = NULL;
9444b1f2 117static sd_id128_t arg_uuid = {};
7027ff61 118static char *arg_machine = NULL;
c74e630d
LP
119static const char *arg_selinux_context = NULL;
120static const char *arg_selinux_apifs_context = NULL;
9444b1f2 121static const char *arg_slice = NULL;
ff01d048 122static bool arg_private_network = false;
bc2f673e 123static bool arg_read_only = false;
0f0dbc46 124static bool arg_boot = false;
57fb9fb5 125static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
126static uint64_t arg_retain =
127 (1ULL << CAP_CHOWN) |
128 (1ULL << CAP_DAC_OVERRIDE) |
129 (1ULL << CAP_DAC_READ_SEARCH) |
130 (1ULL << CAP_FOWNER) |
131 (1ULL << CAP_FSETID) |
132 (1ULL << CAP_IPC_OWNER) |
133 (1ULL << CAP_KILL) |
134 (1ULL << CAP_LEASE) |
135 (1ULL << CAP_LINUX_IMMUTABLE) |
136 (1ULL << CAP_NET_BIND_SERVICE) |
137 (1ULL << CAP_NET_BROADCAST) |
138 (1ULL << CAP_NET_RAW) |
139 (1ULL << CAP_SETGID) |
140 (1ULL << CAP_SETFCAP) |
141 (1ULL << CAP_SETPCAP) |
142 (1ULL << CAP_SETUID) |
143 (1ULL << CAP_SYS_ADMIN) |
144 (1ULL << CAP_SYS_CHROOT) |
145 (1ULL << CAP_SYS_NICE) |
146 (1ULL << CAP_SYS_PTRACE) |
147 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 148 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
149 (1ULL << CAP_SYS_BOOT) |
150 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
151 (1ULL << CAP_AUDIT_CONTROL) |
152 (1ULL << CAP_MKNOD);
17fe0523
LP
153static char **arg_bind = NULL;
154static char **arg_bind_ro = NULL;
06c17c39 155static char **arg_tmpfs = NULL;
f4889f65 156static char **arg_setenv = NULL;
284c0b91 157static bool arg_quiet = false;
8a96d94e 158static bool arg_share_system = false;
eb91eb18 159static bool arg_register = true;
89f7c846 160static bool arg_keep_unit = false;
aa28aefe 161static char **arg_network_interfaces = NULL;
c74e630d 162static char **arg_network_macvlan = NULL;
69c79d3c 163static bool arg_network_veth = false;
c74e630d 164static const char *arg_network_bridge = NULL;
6afc95b7 165static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 166static const char *arg_image = NULL;
4d9f07b4 167static Volatile arg_volatile = VOLATILE_NO;
88213476 168
601185b4 169static void help(void) {
88213476
LP
170 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
172 " -h --help Show this help\n"
173 " --version Print version string\n"
69c79d3c 174 " -q --quiet Do not show status information\n"
1b9e5b12
LP
175 " -D --directory=PATH Root directory for the container\n"
176 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
177 " -b --boot Boot up full system (i.e. invoke init)\n"
178 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 179 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 180 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 181 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
182 " --private-network Disable network in container\n"
183 " --network-interface=INTERFACE\n"
184 " Assign an existing network interface to the\n"
185 " container\n"
c74e630d
LP
186 " --network-macvlan=INTERFACE\n"
187 " Create a macvlan network interface based on an\n"
188 " existing network interface to the container\n"
32457153 189 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 190 " and container\n"
ab046dde 191 " --network-bridge=INTERFACE\n"
32457153 192 " Add a virtual ethernet connection between host\n"
ab046dde
TG
193 " and container and add it to an existing bridge on\n"
194 " the host\n"
82adf6af
LP
195 " -Z --selinux-context=SECLABEL\n"
196 " Set the SELinux security context to be used by\n"
197 " processes in the container\n"
198 " -L --selinux-apifs-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " API/tmpfs file systems in the container\n"
a8828ed9
DW
201 " --capability=CAP In addition to the default, retain specified\n"
202 " capability\n"
203 " --drop-capability=CAP Drop the specified capability from the default set\n"
204 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
205 " -j Equivalent to --link-journal=host\n"
69c79d3c 206 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
207 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
208 " the container\n"
209 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 210 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 211 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 212 " --share-system Share system namespaces with host\n"
eb91eb18 213 " --register=BOOLEAN Register container as machine\n"
89f7c846 214 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4
LP
215 " the service unit nspawn is running in\n"
216 " --volatile[=MODE] Run the system in volatile mode\n",
88213476 217 program_invocation_short_name);
88213476
LP
218}
219
220static int parse_argv(int argc, char *argv[]) {
221
a41fe3a2 222 enum {
acbeb427
ZJS
223 ARG_VERSION = 0x100,
224 ARG_PRIVATE_NETWORK,
bc2f673e 225 ARG_UUID,
5076f0cc 226 ARG_READ_ONLY,
57fb9fb5 227 ARG_CAPABILITY,
420c7379 228 ARG_DROP_CAPABILITY,
17fe0523
LP
229 ARG_LINK_JOURNAL,
230 ARG_BIND,
f4889f65 231 ARG_BIND_RO,
06c17c39 232 ARG_TMPFS,
f4889f65 233 ARG_SETENV,
eb91eb18 234 ARG_SHARE_SYSTEM,
89f7c846 235 ARG_REGISTER,
aa28aefe 236 ARG_KEEP_UNIT,
69c79d3c 237 ARG_NETWORK_INTERFACE,
c74e630d 238 ARG_NETWORK_MACVLAN,
69c79d3c 239 ARG_NETWORK_VETH,
ab046dde 240 ARG_NETWORK_BRIDGE,
6afc95b7 241 ARG_PERSONALITY,
4d9f07b4 242 ARG_VOLATILE,
a41fe3a2
LP
243 };
244
88213476 245 static const struct option options[] = {
aa28aefe
LP
246 { "help", no_argument, NULL, 'h' },
247 { "version", no_argument, NULL, ARG_VERSION },
248 { "directory", required_argument, NULL, 'D' },
249 { "user", required_argument, NULL, 'u' },
250 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
251 { "boot", no_argument, NULL, 'b' },
252 { "uuid", required_argument, NULL, ARG_UUID },
253 { "read-only", no_argument, NULL, ARG_READ_ONLY },
254 { "capability", required_argument, NULL, ARG_CAPABILITY },
255 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
256 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
257 { "bind", required_argument, NULL, ARG_BIND },
258 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 259 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
260 { "machine", required_argument, NULL, 'M' },
261 { "slice", required_argument, NULL, 'S' },
262 { "setenv", required_argument, NULL, ARG_SETENV },
263 { "selinux-context", required_argument, NULL, 'Z' },
264 { "selinux-apifs-context", required_argument, NULL, 'L' },
265 { "quiet", no_argument, NULL, 'q' },
266 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
267 { "register", required_argument, NULL, ARG_REGISTER },
268 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
269 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 270 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
271 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
272 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 273 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 274 { "image", required_argument, NULL, 'i' },
4d9f07b4 275 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 276 {}
88213476
LP
277 };
278
9444b1f2 279 int c, r;
a42c8b54 280 uint64_t plus = 0, minus = 0;
88213476
LP
281
282 assert(argc >= 0);
283 assert(argv);
284
601185b4 285 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
88213476
LP
286
287 switch (c) {
288
289 case 'h':
601185b4
ZJS
290 help();
291 return 0;
88213476 292
acbeb427
ZJS
293 case ARG_VERSION:
294 puts(PACKAGE_STRING);
295 puts(SYSTEMD_FEATURES);
296 return 0;
297
88213476
LP
298 case 'D':
299 free(arg_directory);
3a74cea5
LP
300 arg_directory = canonicalize_file_name(optarg);
301 if (!arg_directory) {
898d5c91 302 log_error("Invalid root directory: %m");
88213476
LP
303 return -ENOMEM;
304 }
305
306 break;
307
1b9e5b12
LP
308 case 'i':
309 arg_image = optarg;
310 break;
311
687d0825
MV
312 case 'u':
313 free(arg_user);
7027ff61
LP
314 arg_user = strdup(optarg);
315 if (!arg_user)
316 return log_oom();
687d0825
MV
317
318 break;
319
ab046dde 320 case ARG_NETWORK_BRIDGE:
c74e630d 321 arg_network_bridge = optarg;
ab046dde
TG
322
323 /* fall through */
324
69c79d3c
LP
325 case ARG_NETWORK_VETH:
326 arg_network_veth = true;
327 arg_private_network = true;
328 break;
329
aa28aefe 330 case ARG_NETWORK_INTERFACE:
c74e630d
LP
331 if (strv_extend(&arg_network_interfaces, optarg) < 0)
332 return log_oom();
333
334 arg_private_network = true;
335 break;
336
337 case ARG_NETWORK_MACVLAN:
338 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
339 return log_oom();
340
341 /* fall through */
342
ff01d048
LP
343 case ARG_PRIVATE_NETWORK:
344 arg_private_network = true;
a41fe3a2
LP
345 break;
346
0f0dbc46
LP
347 case 'b':
348 arg_boot = true;
349 break;
350
144f0fc0 351 case ARG_UUID:
9444b1f2
LP
352 r = sd_id128_from_string(optarg, &arg_uuid);
353 if (r < 0) {
aa96c6cb 354 log_error("Invalid UUID: %s", optarg);
9444b1f2 355 return r;
aa96c6cb 356 }
9444b1f2 357 break;
aa96c6cb 358
9444b1f2 359 case 'S':
c74e630d 360 arg_slice = optarg;
144f0fc0
LP
361 break;
362
7027ff61 363 case 'M':
eb91eb18
LP
364 if (isempty(optarg)) {
365 free(arg_machine);
366 arg_machine = NULL;
367 } else {
7027ff61 368
eb91eb18
LP
369 if (!hostname_is_valid(optarg)) {
370 log_error("Invalid machine name: %s", optarg);
371 return -EINVAL;
372 }
7027ff61 373
eb91eb18
LP
374 free(arg_machine);
375 arg_machine = strdup(optarg);
376 if (!arg_machine)
377 return log_oom();
378
379 break;
380 }
7027ff61 381
82adf6af
LP
382 case 'Z':
383 arg_selinux_context = optarg;
a8828ed9
DW
384 break;
385
82adf6af
LP
386 case 'L':
387 arg_selinux_apifs_context = optarg;
a8828ed9
DW
388 break;
389
bc2f673e
LP
390 case ARG_READ_ONLY:
391 arg_read_only = true;
392 break;
393
420c7379
LP
394 case ARG_CAPABILITY:
395 case ARG_DROP_CAPABILITY: {
a2a5291b 396 const char *state, *word;
5076f0cc
LP
397 size_t length;
398
399 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 400 _cleanup_free_ char *t;
5076f0cc 401 cap_value_t cap;
5076f0cc
LP
402
403 t = strndup(word, length);
0d0f0c50
SL
404 if (!t)
405 return log_oom();
5076f0cc 406
39ed67d1
LP
407 if (streq(t, "all")) {
408 if (c == ARG_CAPABILITY)
a42c8b54 409 plus = (uint64_t) -1;
39ed67d1 410 else
a42c8b54 411 minus = (uint64_t) -1;
39ed67d1
LP
412 } else {
413 if (cap_from_name(t, &cap) < 0) {
414 log_error("Failed to parse capability %s.", t);
415 return -EINVAL;
416 }
417
418 if (c == ARG_CAPABILITY)
a42c8b54 419 plus |= 1ULL << (uint64_t) cap;
39ed67d1 420 else
a42c8b54 421 minus |= 1ULL << (uint64_t) cap;
5076f0cc 422 }
5076f0cc
LP
423 }
424
425 break;
426 }
427
57fb9fb5
LP
428 case 'j':
429 arg_link_journal = LINK_GUEST;
430 break;
431
432 case ARG_LINK_JOURNAL:
433 if (streq(optarg, "auto"))
434 arg_link_journal = LINK_AUTO;
435 else if (streq(optarg, "no"))
436 arg_link_journal = LINK_NO;
437 else if (streq(optarg, "guest"))
438 arg_link_journal = LINK_GUEST;
439 else if (streq(optarg, "host"))
440 arg_link_journal = LINK_HOST;
441 else {
442 log_error("Failed to parse link journal mode %s", optarg);
443 return -EINVAL;
444 }
445
446 break;
447
17fe0523
LP
448 case ARG_BIND:
449 case ARG_BIND_RO: {
450 _cleanup_free_ char *a = NULL, *b = NULL;
451 char *e;
452 char ***x;
17fe0523
LP
453
454 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456 e = strchr(optarg, ':');
457 if (e) {
458 a = strndup(optarg, e - optarg);
459 b = strdup(e + 1);
460 } else {
461 a = strdup(optarg);
462 b = strdup(optarg);
463 }
464
465 if (!a || !b)
466 return log_oom();
467
468 if (!path_is_absolute(a) || !path_is_absolute(b)) {
469 log_error("Invalid bind mount specification: %s", optarg);
470 return -EINVAL;
471 }
472
473 r = strv_extend(x, a);
474 if (r < 0)
b3451bed 475 return log_oom();
17fe0523
LP
476
477 r = strv_extend(x, b);
478 if (r < 0)
b3451bed 479 return log_oom();
17fe0523
LP
480
481 break;
482 }
483
06c17c39
LP
484 case ARG_TMPFS: {
485 _cleanup_free_ char *a = NULL, *b = NULL;
486 char *e;
487
488 e = strchr(optarg, ':');
489 if (e) {
490 a = strndup(optarg, e - optarg);
491 b = strdup(e + 1);
492 } else {
493 a = strdup(optarg);
494 b = strdup("mode=0755");
495 }
496
497 if (!a || !b)
498 return log_oom();
499
500 if (!path_is_absolute(a)) {
501 log_error("Invalid tmpfs specification: %s", optarg);
502 return -EINVAL;
503 }
504
505 r = strv_push(&arg_tmpfs, a);
506 if (r < 0)
507 return log_oom();
508
509 a = NULL;
510
511 r = strv_push(&arg_tmpfs, b);
512 if (r < 0)
513 return log_oom();
514
515 b = NULL;
516
517 break;
518 }
519
f4889f65
LP
520 case ARG_SETENV: {
521 char **n;
522
523 if (!env_assignment_is_valid(optarg)) {
524 log_error("Environment variable assignment '%s' is not valid.", optarg);
525 return -EINVAL;
526 }
527
528 n = strv_env_set(arg_setenv, optarg);
529 if (!n)
530 return log_oom();
531
532 strv_free(arg_setenv);
533 arg_setenv = n;
534 break;
535 }
536
284c0b91
LP
537 case 'q':
538 arg_quiet = true;
539 break;
540
8a96d94e
LP
541 case ARG_SHARE_SYSTEM:
542 arg_share_system = true;
543 break;
544
eb91eb18
LP
545 case ARG_REGISTER:
546 r = parse_boolean(optarg);
547 if (r < 0) {
548 log_error("Failed to parse --register= argument: %s", optarg);
549 return r;
550 }
551
552 arg_register = r;
553 break;
554
89f7c846
LP
555 case ARG_KEEP_UNIT:
556 arg_keep_unit = true;
557 break;
558
6afc95b7
LP
559 case ARG_PERSONALITY:
560
ac45f971 561 arg_personality = personality_from_string(optarg);
6afc95b7
LP
562 if (arg_personality == 0xffffffffLU) {
563 log_error("Unknown or unsupported personality '%s'.", optarg);
564 return -EINVAL;
565 }
566
567 break;
568
4d9f07b4
LP
569 case ARG_VOLATILE:
570
571 if (!optarg)
572 arg_volatile = VOLATILE_YES;
573 else {
574 r = parse_boolean(optarg);
575 if (r < 0) {
576 if (streq(optarg, "state"))
577 arg_volatile = VOLATILE_STATE;
578 else {
579 log_error("Failed to parse --volatile= argument: %s", optarg);
580 return r;
581 }
582 } else
583 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584 }
585
586 break;
587
88213476
LP
588 case '?':
589 return -EINVAL;
590
591 default:
eb9da376 592 assert_not_reached("Unhandled option");
88213476 593 }
88213476 594
eb91eb18
LP
595 if (arg_share_system)
596 arg_register = false;
597
598 if (arg_boot && arg_share_system) {
599 log_error("--boot and --share-system may not be combined.");
600 return -EINVAL;
601 }
602
89f7c846
LP
603 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604 log_error("--keep-unit may not be used when invoked from a user session.");
605 return -EINVAL;
606 }
607
1b9e5b12
LP
608 if (arg_directory && arg_image) {
609 log_error("--directory= and --image= may not be combined.");
610 return -EINVAL;
611 }
612
4d9f07b4
LP
613 if (arg_volatile != VOLATILE_NO && arg_read_only) {
614 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615 return -EINVAL;
616 }
617
a42c8b54
LP
618 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
88213476
LP
620 return 1;
621}
622
623static int mount_all(const char *dest) {
624
625 typedef struct MountPoint {
626 const char *what;
627 const char *where;
628 const char *type;
629 const char *options;
630 unsigned long flags;
3bd66c05 631 bool fatal;
88213476
LP
632 } MountPoint;
633
634 static const MountPoint mount_table[] = {
06c17c39
LP
635 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
636 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
637 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
638 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
639 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 640 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
641 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
642 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 643#ifdef HAVE_SELINUX
06c17c39
LP
644 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
645 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 646#endif
88213476
LP
647 };
648
649 unsigned k;
650 int r = 0;
651
652 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 653 _cleanup_free_ char *where = NULL;
d002827b 654#ifdef HAVE_SELINUX
a8828ed9 655 _cleanup_free_ char *options = NULL;
d002827b
LP
656#endif
657 const char *o;
88213476
LP
658 int t;
659
17fe0523
LP
660 where = strjoin(dest, "/", mount_table[k].where, NULL);
661 if (!where)
662 return log_oom();
88213476 663
e65aec12 664 t = path_is_mount_point(where, true);
68fb0892 665 if (t < 0) {
88213476 666 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
667
668 if (r == 0)
669 r = t;
670
671 continue;
672 }
673
9c1c7f71
LP
674 /* Skip this entry if it is not a remount. */
675 if (mount_table[k].what && t > 0)
014a9c77
LP
676 continue;
677
17fe0523 678 mkdir_p(where, 0755);
88213476 679
a8828ed9 680#ifdef HAVE_SELINUX
82adf6af
LP
681 if (arg_selinux_apifs_context &&
682 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
683 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
684 if (!options)
685 return log_oom();
686
687 o = options;
688 } else
a8828ed9 689#endif
d002827b 690 o = mount_table[k].options;
a8828ed9 691
a8828ed9 692
88213476
LP
693 if (mount(mount_table[k].what,
694 where,
695 mount_table[k].type,
696 mount_table[k].flags,
d002827b 697 o) < 0 &&
3bd66c05 698 mount_table[k].fatal) {
88213476
LP
699
700 log_error("mount(%s) failed: %m", where);
701
702 if (r == 0)
703 r = -errno;
704 }
88213476
LP
705 }
706
e58a1277
LP
707 return r;
708}
f8440af5 709
d6797c92 710static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
711 char **x, **y;
712
713 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 714 _cleanup_free_ char *where = NULL;
d2421337 715 struct stat source_st, dest_st;
2ed4e5e0 716 int r;
d2421337
DR
717
718 if (stat(*x, &source_st) < 0) {
1b9e5b12 719 log_error("Failed to stat %s: %m", *x);
d2421337
DR
720 return -errno;
721 }
17fe0523 722
06c17c39
LP
723 where = strappend(dest, *y);
724 if (!where)
725 return log_oom();
726
2ed4e5e0
SL
727 r = stat(where, &dest_st);
728 if (r == 0) {
d2421337 729 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 730 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
731 return -EINVAL;
732 }
2ed4e5e0
SL
733 } else if (errno == ENOENT) {
734 r = mkdir_parents_label(where, 0755);
735 if (r < 0) {
736 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
737 return r;
d2421337 738 }
2ed4e5e0 739 } else {
590b6b91 740 log_error("Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
741 return -errno;
742 }
06c17c39 743
2ed4e5e0 744 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 745 * and char devices. */
2ed4e5e0
SL
746 if (S_ISDIR(source_st.st_mode))
747 mkdir_label(where, 0755);
748 else if (S_ISFIFO(source_st.st_mode))
749 mkfifo(where, 0644);
750 else if (S_ISSOCK(source_st.st_mode))
751 mknod(where, 0644 | S_IFSOCK, 0);
752 else if (S_ISREG(source_st.st_mode))
753 touch(where);
754 else {
755 log_error("Refusing to create mountpoint for file: %s", *x);
756 return -ENOTSUP;
d2421337 757 }
17fe0523
LP
758
759 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
760 log_error("mount(%s) failed: %m", where);
761 return -errno;
762 }
763
d6797c92
LP
764 if (ro) {
765 r = bind_remount_recursive(where, true);
766 if (r < 0) {
767 log_error("Read-Only bind mount failed: %s", strerror(-r));
768 return r;
769 }
17fe0523
LP
770 }
771 }
772
773 return 0;
774}
775
06c17c39
LP
776static int mount_tmpfs(const char *dest) {
777 char **i, **o;
778
779 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
780 _cleanup_free_ char *where = NULL;
781
782 where = strappend(dest, *i);
783 if (!where)
784 return log_oom();
785
786 mkdir_label(where, 0755);
787
788 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
789 log_error("tmpfs mount to %s failed: %m", where);
790 return -errno;
791 }
792 }
793
794 return 0;
795}
796
e58a1277 797static int setup_timezone(const char *dest) {
d4036145
LP
798 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
799 char *z, *y;
800 int r;
f8440af5 801
e58a1277
LP
802 assert(dest);
803
804 /* Fix the timezone, if possible */
d4036145
LP
805 r = readlink_malloc("/etc/localtime", &p);
806 if (r < 0) {
807 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
808 return 0;
809 }
810
811 z = path_startswith(p, "../usr/share/zoneinfo/");
812 if (!z)
813 z = path_startswith(p, "/usr/share/zoneinfo/");
814 if (!z) {
815 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
816 return 0;
817 }
818
04bc4a3f
LP
819 where = strappend(dest, "/etc/localtime");
820 if (!where)
0d0f0c50 821 return log_oom();
715ac17a 822
d4036145
LP
823 r = readlink_malloc(where, &q);
824 if (r >= 0) {
825 y = path_startswith(q, "../usr/share/zoneinfo/");
826 if (!y)
827 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 828
d4036145
LP
829 /* Already pointing to the right place? Then do nothing .. */
830 if (y && streq(y, z))
831 return 0;
832 }
833
834 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
835 if (!check)
0d0f0c50 836 return log_oom();
4d1c38b8 837
d4036145
LP
838 if (access(check, F_OK) < 0) {
839 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
840 return 0;
841 }
68fb0892 842
d4036145
LP
843 what = strappend("../usr/share/zoneinfo/", z);
844 if (!what)
845 return log_oom();
846
4d9f07b4 847 mkdir_parents(where, 0755);
d4036145 848 unlink(where);
4d9f07b4 849
d4036145
LP
850 if (symlink(what, where) < 0) {
851 log_error("Failed to correct timezone of container: %m");
852 return 0;
853 }
e58a1277
LP
854
855 return 0;
88213476
LP
856}
857
2547bb41 858static int setup_resolv_conf(const char *dest) {
c8b32e11 859 _cleanup_free_ char *where = NULL;
2547bb41
LP
860
861 assert(dest);
862
863 if (arg_private_network)
864 return 0;
865
866 /* Fix resolv.conf, if possible */
04bc4a3f
LP
867 where = strappend(dest, "/etc/resolv.conf");
868 if (!where)
0d0f0c50 869 return log_oom();
2547bb41 870
77e63faf
LP
871 /* We don't really care for the results of this really. If it
872 * fails, it fails, but meh... */
4d9f07b4 873 mkdir_parents(where, 0755);
849958d1 874 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
2547bb41
LP
875
876 return 0;
877}
878
4d9f07b4
LP
879static int setup_volatile_state(const char *directory) {
880 const char *p;
881 int r;
882
883 assert(directory);
884
885 if (arg_volatile != VOLATILE_STATE)
886 return 0;
887
888 /* --volatile=state means we simply overmount /var
889 with a tmpfs, and the rest read-only. */
890
891 r = bind_remount_recursive(directory, true);
892 if (r < 0) {
893 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
894 return r;
895 }
896
897 p = strappenda(directory, "/var");
898 mkdir(p, 0755);
899
900 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
901 log_error("Failed to mount tmpfs to /var: %m");
902 return -errno;
903 }
904
905 return 0;
906}
907
908static int setup_volatile(const char *directory) {
909 bool tmpfs_mounted = false, bind_mounted = false;
910 char template[] = "/tmp/nspawn-volatile-XXXXXX";
911 const char *f, *t;
912 int r;
913
914 assert(directory);
915
916 if (arg_volatile != VOLATILE_YES)
917 return 0;
918
919 /* --volatile=yes means we mount a tmpfs to the root dir, and
920 the original /usr to use inside it, and that read-only. */
921
922 if (!mkdtemp(template)) {
923 log_error("Failed to create temporary directory: %m");
924 return -errno;
925 }
926
927 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
928 log_error("Failed to mount tmpfs for root directory: %m");
929 r = -errno;
930 goto fail;
931 }
932
933 tmpfs_mounted = true;
934
935 f = strappenda(directory, "/usr");
936 t = strappenda(template, "/usr");
937
938 mkdir(t, 0755);
939 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
940 log_error("Failed to create /usr bind mount: %m");
941 r = -errno;
942 goto fail;
943 }
944
945 bind_mounted = true;
946
947 r = bind_remount_recursive(t, true);
948 if (r < 0) {
949 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
950 goto fail;
951 }
952
953 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
954 log_error("Failed to move root mount: %m");
955 r = -errno;
956 goto fail;
957 }
958
959 rmdir(template);
960
961 return 0;
962
963fail:
964 if (bind_mounted)
965 umount(t);
966 if (tmpfs_mounted)
967 umount(template);
968 rmdir(template);
969 return r;
970}
971
9f24adc2
LP
972static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
973
974 snprintf(s, 37,
975 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
976 SD_ID128_FORMAT_VAL(id));
977
978 return s;
979}
980
04bc4a3f 981static int setup_boot_id(const char *dest) {
7fd1b19b 982 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 983 sd_id128_t rnd = {};
04bc4a3f
LP
984 char as_uuid[37];
985 int r;
986
987 assert(dest);
988
eb91eb18
LP
989 if (arg_share_system)
990 return 0;
991
04bc4a3f
LP
992 /* Generate a new randomized boot ID, so that each boot-up of
993 * the container gets a new one */
994
995 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 996 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
997 if (!from || !to)
998 return log_oom();
04bc4a3f
LP
999
1000 r = sd_id128_randomize(&rnd);
1001 if (r < 0) {
1002 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 1003 return r;
04bc4a3f
LP
1004 }
1005
9f24adc2 1006 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1007
574d5f2d 1008 r = write_string_file(from, as_uuid);
04bc4a3f
LP
1009 if (r < 0) {
1010 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 1011 return r;
04bc4a3f
LP
1012 }
1013
1014 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1015 log_error("Failed to bind mount boot id: %m");
1016 r = -errno;
10d18763
ZJS
1017 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1018 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
1019
1020 unlink(from);
04bc4a3f
LP
1021 return r;
1022}
1023
e58a1277 1024static int copy_devnodes(const char *dest) {
88213476
LP
1025
1026 static const char devnodes[] =
1027 "null\0"
1028 "zero\0"
1029 "full\0"
1030 "random\0"
1031 "urandom\0"
f2d88580 1032 "tty\0";
88213476
LP
1033
1034 const char *d;
e58a1277 1035 int r = 0;
7fd1b19b 1036 _cleanup_umask_ mode_t u;
a258bf26
LP
1037
1038 assert(dest);
124640f1
LP
1039
1040 u = umask(0000);
88213476
LP
1041
1042 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1043 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1044 struct stat st;
88213476 1045
7f112f50
LP
1046 from = strappend("/dev/", d);
1047 to = strjoin(dest, "/dev/", d, NULL);
1048 if (!from || !to)
1049 return log_oom();
88213476
LP
1050
1051 if (stat(from, &st) < 0) {
1052
1053 if (errno != ENOENT) {
1054 log_error("Failed to stat %s: %m", from);
7f112f50 1055 return -errno;
88213476
LP
1056 }
1057
a258bf26 1058 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1059
ed8b7a3e 1060 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1061 return -EIO;
a258bf26
LP
1062
1063 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1064
1065 log_error("mknod(%s) failed: %m", dest);
7f112f50 1066 return -errno;
88213476 1067 }
88213476
LP
1068 }
1069
e58a1277
LP
1070 return r;
1071}
88213476 1072
f2d88580
LP
1073static int setup_ptmx(const char *dest) {
1074 _cleanup_free_ char *p = NULL;
1075
1076 p = strappend(dest, "/dev/ptmx");
1077 if (!p)
1078 return log_oom();
1079
1080 if (symlink("pts/ptmx", p) < 0) {
1081 log_error("Failed to create /dev/ptmx symlink: %m");
1082 return -errno;
1083 }
1084
1085 return 0;
1086}
1087
e58a1277 1088static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1089 _cleanup_umask_ mode_t u;
1090 const char *to;
e58a1277 1091 struct stat st;
e58a1277 1092 int r;
e58a1277
LP
1093
1094 assert(dest);
1095 assert(console);
1096
1097 u = umask(0000);
1098
eb0f0863
LP
1099 if (stat("/dev/null", &st) < 0) {
1100 log_error("Failed to stat /dev/null: %m");
25ea79fe 1101 return -errno;
e58a1277 1102 }
88213476 1103
e58a1277
LP
1104 r = chmod_and_chown(console, 0600, 0, 0);
1105 if (r < 0) {
1106 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 1107 return r;
a258bf26 1108 }
88213476 1109
a258bf26
LP
1110 /* We need to bind mount the right tty to /dev/console since
1111 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1112 * to bind mount things on we create a device node first, and
1113 * use /dev/null for that since we the cgroups device policy
1114 * allows us to create that freely, while we cannot create
1115 * /dev/console. (Note that the major minor doesn't actually
1116 * matter here, since we mount it over anyway). */
a258bf26 1117
eb0f0863 1118 to = strappenda(dest, "/dev/console");
e58a1277
LP
1119 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1120 log_error("mknod() for /dev/console failed: %m");
25ea79fe 1121 return -errno;
e58a1277 1122 }
a258bf26
LP
1123
1124 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 1125 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 1126 return -errno;
a258bf26
LP
1127 }
1128
25ea79fe 1129 return 0;
e58a1277
LP
1130}
1131
1132static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1133 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1134 int r, fd, k;
7fd1b19b 1135 _cleanup_umask_ mode_t u;
e58a1277
LP
1136 union {
1137 struct cmsghdr cmsghdr;
1138 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1139 } control = {};
1140 struct msghdr mh = {
1141 .msg_control = &control,
1142 .msg_controllen = sizeof(control),
1143 };
e58a1277
LP
1144 struct cmsghdr *cmsg;
1145
1146 assert(dest);
1147 assert(kmsg_socket >= 0);
a258bf26 1148
e58a1277 1149 u = umask(0000);
a258bf26 1150
f1e5dfe2
LP
1151 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1152 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1153 * on the reading side behave very similar to /proc/kmsg,
1154 * their writing side behaves differently from /dev/kmsg in
1155 * that writing blocks when nothing is reading. In order to
1156 * avoid any problems with containers deadlocking due to this
1157 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1158 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1159 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1160 return log_oom();
e58a1277
LP
1161
1162 if (mkfifo(from, 0600) < 0) {
1163 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 1164 return -errno;
e58a1277
LP
1165 }
1166
1167 r = chmod_and_chown(from, 0600, 0, 0);
1168 if (r < 0) {
1169 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 1170 return r;
e58a1277
LP
1171 }
1172
1173 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1174 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 1175 return -errno;
e58a1277
LP
1176 }
1177
1178 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1179 if (fd < 0) {
1180 log_error("Failed to open fifo: %m");
25ea79fe 1181 return -errno;
e58a1277
LP
1182 }
1183
e58a1277
LP
1184 cmsg = CMSG_FIRSTHDR(&mh);
1185 cmsg->cmsg_level = SOL_SOCKET;
1186 cmsg->cmsg_type = SCM_RIGHTS;
1187 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1188 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1189
1190 mh.msg_controllen = cmsg->cmsg_len;
1191
1192 /* Store away the fd in the socket, so that it stays open as
1193 * long as we run the child */
1194 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1195 safe_close(fd);
e58a1277
LP
1196
1197 if (k < 0) {
1198 log_error("Failed to send FIFO fd: %m");
25ea79fe 1199 return -errno;
a258bf26
LP
1200 }
1201
f1e5dfe2
LP
1202 /* And now make the FIFO unavailable as /dev/kmsg... */
1203 unlink(from);
25ea79fe 1204 return 0;
88213476
LP
1205}
1206
3a74cea5 1207static int setup_hostname(void) {
3a74cea5 1208
eb91eb18
LP
1209 if (arg_share_system)
1210 return 0;
1211
7027ff61
LP
1212 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1213 return -errno;
3a74cea5 1214
7027ff61 1215 return 0;
3a74cea5
LP
1216}
1217
57fb9fb5 1218static int setup_journal(const char *directory) {
4d680aee 1219 sd_id128_t machine_id, this_id;
7fd1b19b 1220 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1221 char *id;
57fb9fb5
LP
1222 int r;
1223
57fb9fb5 1224 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1225 if (!p)
1226 return log_oom();
57fb9fb5
LP
1227
1228 r = read_one_line_file(p, &b);
27407a01
ZJS
1229 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1230 return 0;
1231 else if (r < 0) {
1232 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1233 return r;
1234 }
1235
27407a01
ZJS
1236 id = strstrip(b);
1237 if (isempty(id) && arg_link_journal == LINK_AUTO)
1238 return 0;
57fb9fb5 1239
27407a01
ZJS
1240 /* Verify validity */
1241 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1242 if (r < 0) {
27407a01
ZJS
1243 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1244 return r;
57fb9fb5
LP
1245 }
1246
4d680aee
ZJS
1247 r = sd_id128_get_machine(&this_id);
1248 if (r < 0) {
1249 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1250 return r;
1251 }
1252
1253 if (sd_id128_equal(machine_id, this_id)) {
1254 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1255 "Host and machine ids are equal (%s): refusing to link journals", id);
1256 if (arg_link_journal == LINK_AUTO)
1257 return 0;
1258 return
1259 -EEXIST;
1260 }
1261
1262 if (arg_link_journal == LINK_NO)
1263 return 0;
1264
57fb9fb5 1265 free(p);
27407a01
ZJS
1266 p = strappend("/var/log/journal/", id);
1267 q = strjoin(directory, "/var/log/journal/", id, NULL);
1268 if (!p || !q)
1269 return log_oom();
1270
1271 if (path_is_mount_point(p, false) > 0) {
1272 if (arg_link_journal != LINK_AUTO) {
1273 log_error("%s: already a mount point, refusing to use for journal", p);
1274 return -EEXIST;
1275 }
1276
1277 return 0;
57fb9fb5
LP
1278 }
1279
27407a01 1280 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1281 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1282 log_error("%s: already a mount point, refusing to use for journal", q);
1283 return -EEXIST;
57fb9fb5
LP
1284 }
1285
27407a01 1286 return 0;
57fb9fb5
LP
1287 }
1288
1289 r = readlink_and_make_absolute(p, &d);
1290 if (r >= 0) {
1291 if ((arg_link_journal == LINK_GUEST ||
1292 arg_link_journal == LINK_AUTO) &&
1293 path_equal(d, q)) {
1294
27407a01
ZJS
1295 r = mkdir_p(q, 0755);
1296 if (r < 0)
1297 log_warning("failed to create directory %s: %m", q);
1298 return 0;
57fb9fb5
LP
1299 }
1300
1301 if (unlink(p) < 0) {
1302 log_error("Failed to remove symlink %s: %m", p);
27407a01 1303 return -errno;
57fb9fb5
LP
1304 }
1305 } else if (r == -EINVAL) {
1306
1307 if (arg_link_journal == LINK_GUEST &&
1308 rmdir(p) < 0) {
1309
27407a01
ZJS
1310 if (errno == ENOTDIR) {
1311 log_error("%s already exists and is neither a symlink nor a directory", p);
1312 return r;
1313 } else {
57fb9fb5 1314 log_error("Failed to remove %s: %m", p);
27407a01 1315 return -errno;
57fb9fb5 1316 }
57fb9fb5
LP
1317 }
1318 } else if (r != -ENOENT) {
1319 log_error("readlink(%s) failed: %m", p);
27407a01 1320 return r;
57fb9fb5
LP
1321 }
1322
1323 if (arg_link_journal == LINK_GUEST) {
1324
1325 if (symlink(q, p) < 0) {
1326 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1327 return -errno;
57fb9fb5
LP
1328 }
1329
27407a01
ZJS
1330 r = mkdir_p(q, 0755);
1331 if (r < 0)
1332 log_warning("failed to create directory %s: %m", q);
1333 return 0;
57fb9fb5
LP
1334 }
1335
1336 if (arg_link_journal == LINK_HOST) {
1337 r = mkdir_p(p, 0755);
1338 if (r < 0) {
1339 log_error("Failed to create %s: %m", p);
27407a01 1340 return r;
57fb9fb5
LP
1341 }
1342
27407a01
ZJS
1343 } else if (access(p, F_OK) < 0)
1344 return 0;
57fb9fb5 1345
cdb2b9d0
LP
1346 if (dir_is_empty(q) == 0)
1347 log_warning("%s is not empty, proceeding anyway.", q);
1348
57fb9fb5
LP
1349 r = mkdir_p(q, 0755);
1350 if (r < 0) {
1351 log_error("Failed to create %s: %m", q);
27407a01 1352 return r;
57fb9fb5
LP
1353 }
1354
1355 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1356 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1357 return -errno;
57fb9fb5
LP
1358 }
1359
27407a01 1360 return 0;
57fb9fb5
LP
1361}
1362
9bd37b40
LP
1363static int setup_kdbus(const char *dest, const char *path) {
1364 const char *p;
1365
1366 if (!path)
1367 return 0;
1368
1369 p = strappenda(dest, "/dev/kdbus");
1370 if (mkdir(p, 0755) < 0) {
1371 log_error("Failed to create kdbus path: %m");
1372 return -errno;
1373 }
1374
1375 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1376 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1377 return -errno;
1378 }
1379
1380 return 0;
1381}
1382
88213476 1383static int drop_capabilities(void) {
5076f0cc 1384 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1385}
1386
5aa4bb6b 1387static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1388 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1389 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1390 int r;
1391
eb91eb18
LP
1392 if (!arg_register)
1393 return 0;
1394
1c03020c 1395 r = sd_bus_default_system(&bus);
9444b1f2
LP
1396 if (r < 0) {
1397 log_error("Failed to open system bus: %s", strerror(-r));
1398 return r;
1399 }
1400
89f7c846
LP
1401 if (arg_keep_unit) {
1402 r = sd_bus_call_method(
1403 bus,
1404 "org.freedesktop.machine1",
1405 "/org/freedesktop/machine1",
1406 "org.freedesktop.machine1.Manager",
5aa4bb6b 1407 "RegisterMachineWithNetwork",
89f7c846
LP
1408 &error,
1409 NULL,
5aa4bb6b 1410 "sayssusai",
89f7c846
LP
1411 arg_machine,
1412 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1413 "nspawn",
1414 "container",
1415 (uint32_t) pid,
5aa4bb6b
LP
1416 strempty(arg_directory),
1417 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1418 } else {
9457ac5b
LP
1419 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1420
1421 r = sd_bus_message_new_method_call(
89f7c846 1422 bus,
9457ac5b 1423 &m,
89f7c846
LP
1424 "org.freedesktop.machine1",
1425 "/org/freedesktop/machine1",
1426 "org.freedesktop.machine1.Manager",
5aa4bb6b 1427 "CreateMachineWithNetwork");
9457ac5b
LP
1428 if (r < 0) {
1429 log_error("Failed to create message: %s", strerror(-r));
1430 return r;
1431 }
1432
1433 r = sd_bus_message_append(
1434 m,
5aa4bb6b 1435 "sayssusai",
89f7c846
LP
1436 arg_machine,
1437 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1438 "nspawn",
1439 "container",
1440 (uint32_t) pid,
5aa4bb6b
LP
1441 strempty(arg_directory),
1442 local_ifindex > 0 ? 1 : 0, local_ifindex);
9457ac5b
LP
1443 if (r < 0) {
1444 log_error("Failed to append message arguments: %s", strerror(-r));
1445 return r;
1446 }
1447
1448 r = sd_bus_message_open_container(m, 'a', "(sv)");
1449 if (r < 0) {
1450 log_error("Failed to open container: %s", strerror(-r));
1451 return r;
1452 }
1453
1454 if (!isempty(arg_slice)) {
1455 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1456 if (r < 0) {
1457 log_error("Failed to append slice: %s", strerror(-r));
1458 return r;
1459 }
1460 }
1461
1462 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1463 if (r < 0) {
1464 log_error("Failed to add device policy: %s", strerror(-r));
1465 return r;
1466 }
1467
a07f961e 1468 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
9457ac5b
LP
1469 /* Allow the container to
1470 * access and create the API
1471 * device nodes, so that
1472 * PrivateDevices= in the
1473 * container can work
1474 * fine */
1475 "/dev/null", "rwm",
1476 "/dev/zero", "rwm",
1477 "/dev/full", "rwm",
1478 "/dev/random", "rwm",
1479 "/dev/urandom", "rwm",
1480 "/dev/tty", "rwm",
1481 /* Allow the container
1482 * access to ptys. However,
1483 * do not permit the
1484 * container to ever create
1485 * these device nodes. */
1486 "/dev/pts/ptmx", "rw",
a07f961e
LP
1487 "char-pts", "rw",
1488 /* Allow the container
1489 * access to all kdbus
1490 * devices. Again, the
1491 * container cannot create
1492 * these nodes, only use
1493 * them. We use a pretty
1494 * open match here, so that
1495 * the kernel API can still
1496 * change. */
1497 "char-kdbus", "rw",
1498 "char-kdbus/*", "rw");
9457ac5b
LP
1499 if (r < 0) {
1500 log_error("Failed to add device whitelist: %s", strerror(-r));
1501 return r;
1502 }
1503
1504 r = sd_bus_message_close_container(m);
1505 if (r < 0) {
1506 log_error("Failed to close container: %s", strerror(-r));
1507 return r;
1508 }
1509
1510 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1511 }
1512
9444b1f2 1513 if (r < 0) {
1f0cd86b
LP
1514 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1515 return r;
1516 }
1517
1518 return 0;
1519}
1520
1521static int terminate_machine(pid_t pid) {
1522 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1523 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1524 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1525 const char *path;
1526 int r;
1527
eb91eb18
LP
1528 if (!arg_register)
1529 return 0;
1530
76b54375 1531 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1532 if (r < 0) {
1533 log_error("Failed to open system bus: %s", strerror(-r));
1534 return r;
1535 }
1536
1537 r = sd_bus_call_method(
1538 bus,
1539 "org.freedesktop.machine1",
1540 "/org/freedesktop/machine1",
1541 "org.freedesktop.machine1.Manager",
1542 "GetMachineByPID",
1543 &error,
1544 &reply,
1545 "u",
1546 (uint32_t) pid);
1547 if (r < 0) {
1548 /* Note that the machine might already have been
1549 * cleaned up automatically, hence don't consider it a
1550 * failure if we cannot get the machine object. */
1551 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1552 return 0;
1553 }
1554
1555 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1556 if (r < 0)
1557 return bus_log_parse_error(r);
9444b1f2 1558
1f0cd86b
LP
1559 r = sd_bus_call_method(
1560 bus,
1561 "org.freedesktop.machine1",
1562 path,
1563 "org.freedesktop.machine1.Machine",
1564 "Terminate",
1565 &error,
1566 NULL,
1567 NULL);
1568 if (r < 0) {
1569 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1570 return 0;
1571 }
1572
9444b1f2
LP
1573 return 0;
1574}
1575
db999e0f
LP
1576static int reset_audit_loginuid(void) {
1577 _cleanup_free_ char *p = NULL;
1578 int r;
1579
1580 if (arg_share_system)
1581 return 0;
1582
1583 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1584 if (r == -ENOENT)
db999e0f
LP
1585 return 0;
1586 if (r < 0) {
1587 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1588 return r;
1589 }
1590
1591 /* Already reset? */
1592 if (streq(p, "4294967295"))
1593 return 0;
1594
1595 r = write_string_file("/proc/self/loginuid", "4294967295");
1596 if (r < 0) {
1597 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1598 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1599 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1600 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1601 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1602
db999e0f 1603 sleep(5);
77b6e194 1604 }
db999e0f
LP
1605
1606 return 0;
77b6e194
LP
1607}
1608
01dde061
TG
1609#define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1610
1611static int get_mac(struct ether_addr *mac) {
1612 int r;
1613
1614 uint8_t result[8];
1615 size_t l, sz;
1616 uint8_t *v;
1617
1618 l = strlen(arg_machine);
1619 sz = sizeof(sd_id128_t) + l;
1620 v = alloca(sz);
1621
1622 /* fetch some persistent data unique to the host */
1623 r = sd_id128_get_machine((sd_id128_t*) v);
1624 if (r < 0)
1625 return r;
1626
1627 /* combine with some data unique (on this host) to this
1628 * container instance */
1629 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1630
1631 /* Let's hash the host machine ID plus the container name. We
1632 * use a fixed, but originally randomly created hash key here. */
1633 siphash24(result, v, sz, HASH_KEY.bytes);
1634
1635 assert_cc(ETH_ALEN <= sizeof(result));
1636 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1637
1638 /* see eth_random_addr in the kernel */
1639 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1640 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1641
1642 return 0;
1643}
1644
5aa4bb6b 1645static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1646 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1647 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
01dde061 1648 struct ether_addr mac;
5aa4bb6b 1649 int r, i;
69c79d3c
LP
1650
1651 if (!arg_private_network)
1652 return 0;
1653
1654 if (!arg_network_veth)
1655 return 0;
1656
08af0da2
LP
1657 /* Use two different interface name prefixes depending whether
1658 * we are in bridge mode or not. */
4212a337
ZJS
1659 snprintf(iface_name, IFNAMSIZ, "%s-%s",
1660 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1661
01dde061
TG
1662 r = get_mac(&mac);
1663 if (r < 0) {
1664 log_error("Failed to generate predictable MAC address for host0");
1665 return r;
1666 }
1667
151b9b96 1668 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1669 if (r < 0) {
1670 log_error("Failed to connect to netlink: %s", strerror(-r));
1671 return r;
1672 }
1673
151b9b96 1674 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1675 if (r < 0) {
1676 log_error("Failed to allocate netlink message: %s", strerror(-r));
1677 return r;
1678 }
1679
ab046dde 1680 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1681 if (r < 0) {
ab046dde 1682 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1683 return r;
1684 }
1685
ee3a6a51 1686 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1687 if (r < 0) {
1688 log_error("Failed to open netlink container: %s", strerror(-r));
1689 return r;
1690 }
1691
d8e538ec 1692 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
69c79d3c
LP
1693 if (r < 0) {
1694 log_error("Failed to open netlink container: %s", strerror(-r));
1695 return r;
1696 }
1697
ee3a6a51 1698 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1699 if (r < 0) {
ab046dde 1700 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1701 return r;
1702 }
1703
ab046dde 1704 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1705 if (r < 0) {
ab046dde 1706 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1707 return r;
1708 }
01dde061
TG
1709
1710 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1711 if (r < 0) {
1712 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1713 return r;
1714 }
69c79d3c 1715
ab046dde 1716 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1717 if (r < 0) {
1718 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1719 return r;
1720 }
1721
1722 r = sd_rtnl_message_close_container(m);
1723 if (r < 0) {
1724 log_error("Failed to close netlink container: %s", strerror(-r));
1725 return r;
1726 }
1727
1728 r = sd_rtnl_message_close_container(m);
1729 if (r < 0) {
1730 log_error("Failed to close netlink container: %s", strerror(-r));
1731 return r;
1732 }
1733
1734 r = sd_rtnl_message_close_container(m);
1735 if (r < 0) {
1736 log_error("Failed to close netlink container: %s", strerror(-r));
1737 return r;
1738 }
1739
1740 r = sd_rtnl_call(rtnl, m, 0, NULL);
1741 if (r < 0) {
1742 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1743 return r;
1744 }
1745
5aa4bb6b
LP
1746 i = (int) if_nametoindex(iface_name);
1747 if (i <= 0) {
1748 log_error("Failed to resolve interface %s: %m", iface_name);
1749 return -errno;
1750 }
1751
1752 *ifi = i;
1753
69c79d3c
LP
1754 return 0;
1755}
1756
5aa4bb6b 1757static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1758 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1759 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1760 int r, bridge;
1761
1762 if (!arg_private_network)
1763 return 0;
1764
1765 if (!arg_network_veth)
1766 return 0;
1767
1768 if (!arg_network_bridge)
1769 return 0;
1770
1771 bridge = (int) if_nametoindex(arg_network_bridge);
1772 if (bridge <= 0) {
1773 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1774 return -errno;
1775 }
1776
5aa4bb6b
LP
1777 *ifi = bridge;
1778
151b9b96 1779 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1780 if (r < 0) {
1781 log_error("Failed to connect to netlink: %s", strerror(-r));
1782 return r;
1783 }
1784
151b9b96 1785 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1786 if (r < 0) {
1787 log_error("Failed to allocate netlink message: %s", strerror(-r));
1788 return r;
1789 }
1790
039dd4af
TG
1791 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1792 if (r < 0) {
1793 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1794 return r;
1795 }
1796
ab046dde
TG
1797 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1798 if (r < 0) {
1799 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1800 return r;
1801 }
1802
1803 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1804 if (r < 0) {
1805 log_error("Failed to add netlink master field: %s", strerror(-r));
1806 return r;
1807 }
1808
1809 r = sd_rtnl_call(rtnl, m, 0, NULL);
1810 if (r < 0) {
1811 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1812 return r;
1813 }
1814
1815 return 0;
1816}
1817
c74e630d
LP
1818static int parse_interface(struct udev *udev, const char *name) {
1819 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1820 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1821 int ifi;
1822
1823 ifi = (int) if_nametoindex(name);
1824 if (ifi <= 0) {
1825 log_error("Failed to resolve interface %s: %m", name);
1826 return -errno;
1827 }
1828
1829 sprintf(ifi_str, "n%i", ifi);
1830 d = udev_device_new_from_device_id(udev, ifi_str);
1831 if (!d) {
1832 log_error("Failed to get udev device for interface %s: %m", name);
1833 return -errno;
1834 }
1835
1836 if (udev_device_get_is_initialized(d) <= 0) {
1837 log_error("Network interface %s is not initialized yet.", name);
1838 return -EBUSY;
1839 }
1840
1841 return ifi;
1842}
1843
69c79d3c 1844static int move_network_interfaces(pid_t pid) {
7e227024 1845 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1846 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1847 char **i;
1848 int r;
1849
1850 if (!arg_private_network)
1851 return 0;
1852
1853 if (strv_isempty(arg_network_interfaces))
1854 return 0;
1855
151b9b96 1856 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1857 if (r < 0) {
1858 log_error("Failed to connect to netlink: %s", strerror(-r));
1859 return r;
1860 }
1861
7e227024
LP
1862 udev = udev_new();
1863 if (!udev) {
1864 log_error("Failed to connect to udev.");
1865 return -ENOMEM;
1866 }
1867
aa28aefe 1868 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1869 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1870 int ifi;
aa28aefe 1871
c74e630d
LP
1872 ifi = parse_interface(udev, *i);
1873 if (ifi < 0)
1874 return ifi;
1875
1876 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1877 if (r < 0) {
1878 log_error("Failed to allocate netlink message: %s", strerror(-r));
1879 return r;
aa28aefe
LP
1880 }
1881
c74e630d
LP
1882 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1883 if (r < 0) {
1884 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1885 return r;
7e227024
LP
1886 }
1887
c74e630d
LP
1888 r = sd_rtnl_call(rtnl, m, 0, NULL);
1889 if (r < 0) {
1890 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1891 return r;
7e227024 1892 }
c74e630d 1893 }
7e227024 1894
c74e630d
LP
1895 return 0;
1896}
1897
1898static int setup_macvlan(pid_t pid) {
1899 _cleanup_udev_unref_ struct udev *udev = NULL;
1900 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1901 char **i;
1902 int r;
1903
1904 if (!arg_private_network)
1905 return 0;
1906
1907 if (strv_isempty(arg_network_macvlan))
1908 return 0;
1909
1910 r = sd_rtnl_open(&rtnl, 0);
1911 if (r < 0) {
1912 log_error("Failed to connect to netlink: %s", strerror(-r));
1913 return r;
1914 }
1915
1916 udev = udev_new();
1917 if (!udev) {
1918 log_error("Failed to connect to udev.");
1919 return -ENOMEM;
1920 }
1921
1922 STRV_FOREACH(i, arg_network_macvlan) {
1923 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1924 _cleanup_free_ char *n = NULL;
1925 int ifi;
1926
1927 ifi = parse_interface(udev, *i);
1928 if (ifi < 0)
1929 return ifi;
1930
1931 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
1932 if (r < 0) {
1933 log_error("Failed to allocate netlink message: %s", strerror(-r));
1934 return r;
1935 }
1936
c74e630d
LP
1937 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1938 if (r < 0) {
1939 log_error("Failed to add netlink interface index: %s", strerror(-r));
1940 return r;
1941 }
1942
1943 n = strappend("mv-", *i);
1944 if (!n)
1945 return log_oom();
1946
1947 strshorten(n, IFNAMSIZ-1);
1948
1949 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1950 if (r < 0) {
1951 log_error("Failed to add netlink interface name: %s", strerror(-r));
1952 return r;
1953 }
1954
aa28aefe
LP
1955 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1956 if (r < 0) {
c74e630d
LP
1957 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1958 return r;
1959 }
1960
1961 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1962 if (r < 0) {
1963 log_error("Failed to open netlink container: %s", strerror(-r));
1964 return r;
1965 }
1966
d8e538ec 1967 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
c74e630d
LP
1968 if (r < 0) {
1969 log_error("Failed to open netlink container: %s", strerror(-r));
1970 return r;
1971 }
1972
1973 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1974 if (r < 0) {
1975 log_error("Failed to append macvlan mode: %s", strerror(-r));
1976 return r;
1977 }
1978
1979 r = sd_rtnl_message_close_container(m);
1980 if (r < 0) {
1981 log_error("Failed to close netlink container: %s", strerror(-r));
1982 return r;
1983 }
1984
1985 r = sd_rtnl_message_close_container(m);
1986 if (r < 0) {
1987 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
1988 return r;
1989 }
1990
1991 r = sd_rtnl_call(rtnl, m, 0, NULL);
1992 if (r < 0) {
c74e630d 1993 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
1994 return r;
1995 }
1996 }
1997
1998 return 0;
1999}
2000
/* Installs a seccomp filter that allows everything by default, except:
 *
 *   - the syscalls listed in 'blacklist', which fail with EPERM;
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT), which fails with
 *     EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op.
 *
 * Returns 0 on success, or a negative errno-style code on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx filter;
        unsigned n;
        int r;

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(filter, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /*
           Audit does not work in containers: much of the userspace
           audit hookup fails when run inside of one. We do not care,
           and simply disable the creation of audit sockets.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace takes as indication that
           audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* Reached on success and on failure alike */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
2080
1b9e5b12
LP
2081static int setup_image(char **device_path, int *loop_nr) {
2082 struct loop_info64 info = {
2083 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2084 };
2085 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2086 _cleanup_free_ char* loopdev = NULL;
2087 struct stat st;
2088 int r, nr;
2089
2090 assert(device_path);
2091 assert(loop_nr);
2092
2093 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2094 if (fd < 0) {
2095 log_error("Failed to open %s: %m", arg_image);
2096 return -errno;
2097 }
2098
2099 if (fstat(fd, &st) < 0) {
2100 log_error("Failed to stat %s: %m", arg_image);
2101 return -errno;
2102 }
2103
2104 if (S_ISBLK(st.st_mode)) {
2105 char *p;
2106
2107 p = strdup(arg_image);
2108 if (!p)
2109 return log_oom();
2110
2111 *device_path = p;
2112
2113 *loop_nr = -1;
2114
2115 r = fd;
2116 fd = -1;
2117
2118 return r;
2119 }
2120
2121 if (!S_ISREG(st.st_mode)) {
2122 log_error("%s is not a regular file or block device: %m", arg_image);
2123 return -EINVAL;
2124 }
2125
2126 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2127 if (control < 0) {
2128 log_error("Failed to open /dev/loop-control: %m");
2129 return -errno;
2130 }
2131
2132 nr = ioctl(control, LOOP_CTL_GET_FREE);
2133 if (nr < 0) {
2134 log_error("Failed to allocate loop device: %m");
2135 return -errno;
2136 }
2137
2138 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2139 return log_oom();
2140
2141 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2142 if (loop < 0) {
2143 log_error("Failed to open loop device %s: %m", loopdev);
2144 return -errno;
2145 }
2146
2147 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2148 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2149 return -errno;
2150 }
2151
2152 if (arg_read_only)
2153 info.lo_flags |= LO_FLAGS_READ_ONLY;
2154
2155 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2156 log_error("Failed to set loopback settings on %s: %m", loopdev);
2157 return -errno;
2158 }
2159
2160 *device_path = loopdev;
2161 loopdev = NULL;
2162
2163 *loop_nr = nr;
2164
2165 r = loop;
2166 loop = -1;
2167
2168 return r;
2169}
2170
/*
 * Inspects the partition table of the block device open on fd and picks the
 * partitions that should be mounted as root, /home and /srv.
 *
 * Only GPT images are accepted. Partitions are matched by their GPT type
 * UUID (GPT_HOME, GPT_SRV and, where defined, GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY). Partitions carrying GPT_FLAG_NO_AUTO are ignored, and
 * if several partitions share a type the one with the lowest partition
 * number wins.
 *
 * On success:
 *   - *root_device receives a newly allocated device node path (the native
 *     root if one was found, otherwise the secondary root), *root_device_rw
 *     tells whether the partition lacks GPT_FLAG_READ_ONLY, and *secondary
 *     tells whether the secondary root was chosen.
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     filled in the same way, but only if such a partition exists; otherwise
 *     they are left untouched.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * HAVE_BLKID the function always fails with -ENOTSUP.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves it at zero
         * can be told apart and reported as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        /* Ask the prober for partition table information, including
         * per-entry details. */
        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* -2 and 1 are reported as "no usable partition table"; any other
         * non-zero result is a probing error (EIO if errno was not set).
         * NOTE(review): libblkid documents 1 as "nothing detected" and -2 as
         * an ambivalent result — confirm against the libblkid reference. */
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails; streq_ptr() is used for the
         * comparison below. */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        /* st.st_rdev identifies the whole-disk device; it is used both to
         * look the device up in udev and to skip it in the loop below. */
        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that have the disk as parent. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, the disk itself, and
                 * entries without a device node. */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its partition table entry. */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                /* Partitions flagged "no auto" are never picked up. */
                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* In each branch below an already recorded partition is only
                 * replaced by one with a strictly lower partition number. */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);


                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* A root partition (native or secondary) is mandatory. */
        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the strings over to the caller; the locals are reset to NULL
         * so the _cleanup_free_ handlers do not free them. The native root
         * takes precedence over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2420
/*
 * Probes the file system type of the block device "what" and mounts it on
 * "where" (with "directory" appended, if non-NULL).
 *
 * The mount is read-only if rw is false or if the global arg_read_only is
 * set; MS_NODEV is always applied. LUKS containers are rejected with
 * -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * HAVE_BLKID the function always fails with -ENOTSUP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition setting. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared first so that a failure which leaves it at zero
         * can be reported as out-of-memory. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambivalent: in both cases the type simply cannot be
         * determined. -1 is a genuine probing error and is reported with
         * errno below. This matches the classification in dissect_image(). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2486
727fd4fd
LP
/*
 * Mounts the partitions found in a disk image below "where": the root
 * partition on "where" itself, the home partition on "<where>/home" and the
 * server data partition on "<where>/srv". A NULL device is skipped; the
 * *_rw flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed (later mounts are then not attempted).
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* The target is taken from the "where" parameter rather than from
         * the global arg_directory, so the function honours its argument. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2522
2523static void loop_remove(int nr, int *image_fd) {
2524 _cleanup_close_ int control = -1;
2525
2526 if (nr < 0)
2527 return;
2528
2529 if (image_fd && *image_fd >= 0) {
2530 ioctl(*image_fd, LOOP_CLR_FD);
03e334a1 2531 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2532 }
2533
2534 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2535 if (control < 0)
2536 return;
2537
2538 ioctl(control, LOOP_CTL_REMOVE, nr);
2539}
2540
0cb9fbcd
LP
2541static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2542 int pipe_fds[2];
2543 pid_t pid;
2544
2545 assert(database);
2546 assert(key);
2547 assert(rpid);
2548
2549 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2550 log_error("Failed to allocate pipe: %m");
2551 return -errno;
2552 }
2553
2554 pid = fork();
2555 if (pid < 0) {
2556 log_error("Failed to fork getent child: %m");
2557 return -errno;
2558 } else if (pid == 0) {
2559 int nullfd;
2560 char *empty_env = NULL;
2561
2562 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2563 _exit(EXIT_FAILURE);
2564
2565 if (pipe_fds[0] > 2)
03e334a1 2566 safe_close(pipe_fds[0]);
0cb9fbcd 2567 if (pipe_fds[1] > 2)
03e334a1 2568 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2569
2570 nullfd = open("/dev/null", O_RDWR);
2571 if (nullfd < 0)
2572 _exit(EXIT_FAILURE);
2573
2574 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2575 _exit(EXIT_FAILURE);
2576
2577 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2578 _exit(EXIT_FAILURE);
2579
2580 if (nullfd > 2)
03e334a1 2581 safe_close(nullfd);
0cb9fbcd
LP
2582
2583 reset_all_signal_handlers();
2584 close_all_fds(NULL, 0);
2585
4de82926
MM
2586 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2587 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2588 _exit(EXIT_FAILURE);
2589 }
2590
03e334a1 2591 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2592
2593 *rpid = pid;
2594
2595 return pipe_fds[0];
2596}
2597
2598static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2599 char line[LINE_MAX], *x, *u, *g, *h;
2600 const char *word, *state;
0cb9fbcd
LP
2601 _cleanup_free_ uid_t *uids = NULL;
2602 _cleanup_free_ char *home = NULL;
2603 _cleanup_fclose_ FILE *f = NULL;
2604 _cleanup_close_ int fd = -1;
2605 unsigned n_uids = 0;
70f539ca 2606 size_t sz = 0, l;
0cb9fbcd
LP
2607 uid_t uid;
2608 gid_t gid;
2609 pid_t pid;
2610 int r;
2611
2612 assert(_home);
2613
2614 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2615 /* Reset everything fully to 0, just in case */
2616
2617 if (setgroups(0, NULL) < 0) {
2618 log_error("setgroups() failed: %m");
2619 return -errno;
2620 }
2621
2622 if (setresgid(0, 0, 0) < 0) {
2623 log_error("setregid() failed: %m");
2624 return -errno;
2625 }
2626
2627 if (setresuid(0, 0, 0) < 0) {
2628 log_error("setreuid() failed: %m");
2629 return -errno;
2630 }
2631
2632 *_home = NULL;
2633 return 0;
2634 }
2635
2636 /* First, get user credentials */
2637 fd = spawn_getent("passwd", arg_user, &pid);
2638 if (fd < 0)
2639 return fd;
2640
2641 f = fdopen(fd, "r");
2642 if (!f)
2643 return log_oom();
2644 fd = -1;
2645
2646 if (!fgets(line, sizeof(line), f)) {
2647
2648 if (!ferror(f)) {
2649 log_error("Failed to resolve user %s.", arg_user);
2650 return -ESRCH;
2651 }
2652
2653 log_error("Failed to read from getent: %m");
2654 return -errno;
2655 }
2656
2657 truncate_nl(line);
2658
2659 wait_for_terminate_and_warn("getent passwd", pid);
2660
2661 x = strchr(line, ':');
2662 if (!x) {
2663 log_error("/etc/passwd entry has invalid user field.");
2664 return -EIO;
2665 }
2666
2667 u = strchr(x+1, ':');
2668 if (!u) {
2669 log_error("/etc/passwd entry has invalid password field.");
2670 return -EIO;
2671 }
2672
2673 u++;
2674 g = strchr(u, ':');
2675 if (!g) {
2676 log_error("/etc/passwd entry has invalid UID field.");
2677 return -EIO;
2678 }
2679
2680 *g = 0;
2681 g++;
2682 x = strchr(g, ':');
2683 if (!x) {
2684 log_error("/etc/passwd entry has invalid GID field.");
2685 return -EIO;
2686 }
2687
2688 *x = 0;
2689 h = strchr(x+1, ':');
2690 if (!h) {
2691 log_error("/etc/passwd entry has invalid GECOS field.");
2692 return -EIO;
2693 }
2694
2695 h++;
2696 x = strchr(h, ':');
2697 if (!x) {
2698 log_error("/etc/passwd entry has invalid home directory field.");
2699 return -EIO;
2700 }
2701
2702 *x = 0;
2703
2704 r = parse_uid(u, &uid);
2705 if (r < 0) {
2706 log_error("Failed to parse UID of user.");
2707 return -EIO;
2708 }
2709
2710 r = parse_gid(g, &gid);
2711 if (r < 0) {
2712 log_error("Failed to parse GID of user.");
2713 return -EIO;
2714 }
2715
2716 home = strdup(h);
2717 if (!home)
2718 return log_oom();
2719
2720 /* Second, get group memberships */
2721 fd = spawn_getent("initgroups", arg_user, &pid);
2722 if (fd < 0)
2723 return fd;
2724
2725 fclose(f);
2726 f = fdopen(fd, "r");
2727 if (!f)
2728 return log_oom();
2729 fd = -1;
2730
2731 if (!fgets(line, sizeof(line), f)) {
2732 if (!ferror(f)) {
2733 log_error("Failed to resolve user %s.", arg_user);
2734 return -ESRCH;
2735 }
2736
2737 log_error("Failed to read from getent: %m");
2738 return -errno;
2739 }
2740
2741 truncate_nl(line);
2742
2743 wait_for_terminate_and_warn("getent initgroups", pid);
2744
2745 /* Skip over the username and subsequent separator whitespace */
2746 x = line;
2747 x += strcspn(x, WHITESPACE);
2748 x += strspn(x, WHITESPACE);
2749
a2a5291b 2750 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2751 char c[l+1];
2752
a2a5291b 2753 memcpy(c, word, l);
0cb9fbcd
LP
2754 c[l] = 0;
2755
2756 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2757 return log_oom();
2758
2759 r = parse_uid(c, &uids[n_uids++]);
2760 if (r < 0) {
2761 log_error("Failed to parse group data from getent.");
2762 return -EIO;
2763 }
2764 }
2765
2766 r = mkdir_parents(home, 0775);
2767 if (r < 0) {
2768 log_error("Failed to make home root directory: %s", strerror(-r));
2769 return r;
2770 }
2771
2772 r = mkdir_safe(home, 0755, uid, gid);
f418f31d 2773 if (r < 0 && r != -EEXIST) {
0cb9fbcd
LP
2774 log_error("Failed to make home directory: %s", strerror(-r));
2775 return r;
2776 }
2777
2778 fchown(STDIN_FILENO, uid, gid);
2779 fchown(STDOUT_FILENO, uid, gid);
2780 fchown(STDERR_FILENO, uid, gid);
2781
2782 if (setgroups(n_uids, uids) < 0) {
2783 log_error("Failed to set auxiliary groups: %m");
2784 return -errno;
2785 }
2786
2787 if (setresgid(gid, gid, gid) < 0) {
2788 log_error("setregid() failed: %m");
2789 return -errno;
2790 }
2791
2792 if (setresuid(uid, uid, uid) < 0) {
2793 log_error("setreuid() failed: %m");
2794 return -errno;
2795 }
2796
2797 if (_home) {
2798 *_home = home;
2799 home = NULL;
2800 }
2801
2802 return 0;
2803}
2804
113cea80 2805/*
6d416b9c
LS
2806 * Return values:
2807 * < 0 : wait_for_terminate() failed to get the state of the
2808 * container, the container was terminated by a signal, or
2809 * failed for an unknown reason. No change is made to the
2810 * container argument.
2811 * > 0 : The program executed in the container terminated with an
2812 * error. The exit code of the program executed in the
2813 * container is returned. No change is made to the container
2814 * argument.
2815 * 0 : The container is being rebooted, has been shut down or exited
2816 * successfully. The container argument has been set to either
2817 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2818 *
6d416b9c
LS
2819 * That is, success is indicated by a return value of zero, and an
2820 * error is indicated by a non-zero value.
113cea80
DH
2821 */
2822static int wait_for_container(pid_t pid, ContainerStatus *container) {
2823 int r;
2824 siginfo_t status;
2825
2826 r = wait_for_terminate(pid, &status);
8baaf7a3
LS
2827 if (r < 0) {
2828 log_warning("Failed to wait for container: %s", strerror(-r));
113cea80 2829 return r;
8baaf7a3 2830 }
113cea80
DH
2831
2832 switch (status.si_code) {
2833 case CLD_EXITED:
2834 r = status.si_status;
2835 if (r == 0) {
2836 if (!arg_quiet)
2837 log_debug("Container %s exited successfully.",
2838 arg_machine);
2839
2840 *container = CONTAINER_TERMINATED;
2841 } else {
2842 log_error("Container %s failed with error code %i.",
2843 arg_machine, status.si_status);
113cea80
DH
2844 }
2845 break;
2846
2847 case CLD_KILLED:
2848 if (status.si_status == SIGINT) {
2849 if (!arg_quiet)
2850 log_info("Container %s has been shut down.",
2851 arg_machine);
2852
2853 *container = CONTAINER_TERMINATED;
2854 r = 0;
2855 break;
2856 } else if (status.si_status == SIGHUP) {
2857 if (!arg_quiet)
2858 log_info("Container %s is being rebooted.",
2859 arg_machine);
2860
2861 *container = CONTAINER_REBOOTED;
2862 r = 0;
2863 break;
2864 }
2865 /* CLD_KILLED fallthrough */
2866
2867 case CLD_DUMPED:
2868 log_error("Container %s terminated by signal %s.",
2869 arg_machine, signal_to_string(status.si_status));
2870 r = -1;
2871 break;
2872
2873 default:
2874 log_error("Container %s failed due to unknown reason.",
2875 arg_machine);
2876 r = -1;
2877 break;
2878 }
2879
2880 return r;
2881}
2882
e866af3a
DH
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
2884
88213476 2885int main(int argc, char *argv[]) {
69c79d3c 2886
1b9e5b12 2887 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2888 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
d96c1ecf 2889 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
3d94f76c 2890 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2891 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2892 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2893 const char *console = NULL;
1b9e5b12
LP
2894 char veth_name[IFNAMSIZ];
2895 bool secondary = false;
e866af3a 2896 sigset_t mask, mask_chld;
69c79d3c 2897 pid_t pid = 0;
88213476
LP
2898
2899 log_parse_environment();
2900 log_open();
2901
05947bef
LP
2902 k = parse_argv(argc, argv);
2903 if (k < 0)
88213476 2904 goto finish;
05947bef
LP
2905 else if (k == 0) {
2906 r = EXIT_SUCCESS;
2907 goto finish;
2908 }
88213476 2909
1b9e5b12
LP
2910 if (!arg_image) {
2911 if (arg_directory) {
2912 char *p;
88213476 2913
1b9e5b12
LP
2914 p = path_make_absolute_cwd(arg_directory);
2915 free(arg_directory);
2916 arg_directory = p;
2917 } else
2918 arg_directory = get_current_dir_name();
88213476 2919
1b9e5b12
LP
2920 if (!arg_directory) {
2921 log_error("Failed to determine path, please use -D.");
2922 goto finish;
2923 }
2924 path_kill_slashes(arg_directory);
88213476
LP
2925 }
2926
7027ff61 2927 if (!arg_machine) {
1b9e5b12 2928 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2929 if (!arg_machine) {
2930 log_oom();
2931 goto finish;
2932 }
2933
e724b063 2934 hostname_cleanup(arg_machine, false);
7027ff61
LP
2935 if (isempty(arg_machine)) {
2936 log_error("Failed to determine machine name automatically, please use -M.");
2937 goto finish;
2938 }
2939 }
2940
88213476
LP
2941 if (geteuid() != 0) {
2942 log_error("Need to be root.");
2943 goto finish;
2944 }
2945
04d391da
LP
2946 if (sd_booted() <= 0) {
2947 log_error("Not running on a systemd system.");
2948 goto finish;
2949 }
2950
1b9e5b12
LP
2951 log_close();
2952 n_fd_passed = sd_listen_fds(false);
2953 if (n_fd_passed > 0) {
2954 k = fdset_new_listen_fds(&fds, false);
2955 if (k < 0) {
2956 log_error("Failed to collect file descriptors: %s", strerror(-k));
2957 goto finish;
2958 }
88213476 2959 }
1b9e5b12
LP
2960 fdset_close_others(fds);
2961 log_open();
88213476 2962
1b9e5b12
LP
2963 if (arg_directory) {
2964 if (path_equal(arg_directory, "/")) {
2965 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2966 goto finish;
2967 }
1b9e5b12
LP
2968
2969 if (arg_boot) {
2970 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 2971 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
2972 goto finish;
2973 }
2974 } else {
2975 const char *p;
2976
2977 p = strappenda(arg_directory,
2978 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2979 if (access(p, F_OK) < 0) {
2980 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2981 goto finish;
2982
2983 }
2984 }
6b9132a9 2985 } else {
1b9e5b12 2986 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2987
1b9e5b12
LP
2988 if (!mkdtemp(template)) {
2989 log_error("Failed to create temporary directory: %m");
2990 r = -errno;
6b9132a9 2991 goto finish;
1b9e5b12 2992 }
6b9132a9 2993
1b9e5b12
LP
2994 arg_directory = strdup(template);
2995 if (!arg_directory) {
2996 r = log_oom();
2997 goto finish;
6b9132a9 2998 }
88213476 2999
1b9e5b12
LP
3000 image_fd = setup_image(&device_path, &loop_nr);
3001 if (image_fd < 0) {
3002 r = image_fd;
842f3b0f
LP
3003 goto finish;
3004 }
1b9e5b12 3005
4d9f07b4
LP
3006 r = dissect_image(image_fd,
3007 &root_device, &root_device_rw,
3008 &home_device, &home_device_rw,
3009 &srv_device, &srv_device_rw,
3010 &secondary);
1b9e5b12
LP
3011 if (r < 0)
3012 goto finish;
842f3b0f 3013 }
842f3b0f 3014
db7feb7e
LP
3015 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3016 if (master < 0) {
a258bf26
LP
3017 log_error("Failed to acquire pseudo tty: %m");
3018 goto finish;
3019 }
3020
db7feb7e
LP
3021 console = ptsname(master);
3022 if (!console) {
a258bf26
LP
3023 log_error("Failed to determine tty name: %m");
3024 goto finish;
3025 }
3026
284c0b91 3027 if (!arg_quiet)
45f1386c
ZJS
3028 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3029 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
3030
3031 if (unlockpt(master) < 0) {
3032 log_error("Failed to unlock tty: %m");
3033 goto finish;
3034 }
3035
eb91eb18
LP
3036 if (access("/dev/kdbus/control", F_OK) >= 0) {
3037
3038 if (arg_share_system) {
3039 kdbus_domain = strdup("/dev/kdbus");
3040 if (!kdbus_domain) {
3041 log_oom();
3042 goto finish;
3043 }
3044 } else {
3045 const char *ns;
3046
3047 ns = strappenda("machine-", arg_machine);
3048 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3049 if (r < 0)
3050 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3051 else
3052 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3053 }
3054 }
9bd37b40 3055
e58a1277 3056 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
3057 log_error("Failed to create kmsg socket pair: %m");
3058 goto finish;
3059 }
3060
05947bef
LP
3061 sd_notify(0, "READY=1");
3062
a258bf26 3063 assert_se(sigemptyset(&mask) == 0);
e866af3a
DH
3064 assert_se(sigemptyset(&mask_chld) == 0);
3065 sigaddset(&mask_chld, SIGCHLD);
a258bf26
LP
3066 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3067 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3068
d87be9b0 3069 for (;;) {
113cea80 3070 ContainerStatus container_status;
7566e267 3071 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3072 struct sigaction sa = {
3073 .sa_handler = nop_handler,
3074 .sa_flags = SA_NOCLDSTOP,
3075 };
3076
7566e267 3077 r = barrier_create(&barrier);
a2da110b
DH
3078 if (r < 0) {
3079 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3080 goto finish;
3081 }
3082
e866af3a
DH
3083 /* Child can be killed before execv(), so handle SIGCHLD
3084 * in order to interrupt parent's blocking calls and
3085 * give it a chance to call wait() and terminate. */
3086 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3087 if (r < 0) {
3088 log_error("Failed to change the signal mask: %m");
d96c1ecf
LP
3089 goto finish;
3090 }
3091
e866af3a
DH
3092 r = sigaction(SIGCHLD, &sa, NULL);
3093 if (r < 0) {
3094 log_error("Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3095 goto finish;
3096 }
3097
a2da110b
DH
3098 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3099 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3100 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3101 if (pid < 0) {
3102 if (errno == EINVAL)
3103 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3104 else
3105 log_error("clone() failed: %m");
a258bf26 3106
e866af3a 3107 r = pid;
d87be9b0
LP
3108 goto finish;
3109 }
a258bf26 3110
d87be9b0
LP
3111 if (pid == 0) {
3112 /* child */
0cb9fbcd 3113 _cleanup_free_ char *home = NULL;
5674767e 3114 unsigned n_env = 2;
d87be9b0 3115 const char *envp[] = {
e10a55fd 3116 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3117 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3118 NULL, /* TERM */
3119 NULL, /* HOME */
3120 NULL, /* USER */
3121 NULL, /* LOGNAME */
3122 NULL, /* container_uuid */
842f3b0f
LP
3123 NULL, /* LISTEN_FDS */
3124 NULL, /* LISTEN_PID */
d87be9b0
LP
3125 NULL
3126 };
f4889f65 3127 char **env_use;
a258bf26 3128
a2da110b
DH
3129 barrier_set_role(&barrier, BARRIER_CHILD);
3130
5674767e
ZJS
3131 envp[n_env] = strv_find_prefix(environ, "TERM=");
3132 if (envp[n_env])
3133 n_env ++;
a258bf26 3134
03e334a1 3135 master = safe_close(master);
a258bf26 3136
d87be9b0
LP
3137 close_nointr(STDIN_FILENO);
3138 close_nointr(STDOUT_FILENO);
3139 close_nointr(STDERR_FILENO);
db7feb7e 3140
03e334a1 3141 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3142
d87be9b0 3143 reset_all_signal_handlers();
88213476 3144
d87be9b0
LP
3145 assert_se(sigemptyset(&mask) == 0);
3146 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 3147
842f3b0f
LP
3148 k = open_terminal(console, O_RDWR);
3149 if (k != STDIN_FILENO) {
3150 if (k >= 0) {
03e334a1 3151 safe_close(k);
842f3b0f
LP
3152 k = -EINVAL;
3153 }
3154
3155 log_error("Failed to open console: %s", strerror(-k));
a2da110b 3156 _exit(EXIT_FAILURE);
842f3b0f
LP
3157 }
3158
3159 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3160 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3161 log_error("Failed to duplicate console: %m");
a2da110b 3162 _exit(EXIT_FAILURE);
842f3b0f 3163 }
bc2f673e 3164
d87be9b0
LP
3165 if (setsid() < 0) {
3166 log_error("setsid() failed: %m");
a2da110b 3167 _exit(EXIT_FAILURE);
bc2f673e
LP
3168 }
3169
db999e0f 3170 if (reset_audit_loginuid() < 0)
a2da110b 3171 _exit(EXIT_FAILURE);
db999e0f 3172
d87be9b0
LP
3173 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3174 log_error("PR_SET_PDEATHSIG failed: %m");
a2da110b 3175 _exit(EXIT_FAILURE);
d87be9b0 3176 }
e58a1277 3177
d87be9b0
LP
3178 /* Mark everything as slave, so that we still
3179 * receive mounts from the real root, but don't
3180 * propagate mounts to the real root. */
3181 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3182 log_error("MS_SLAVE|MS_REC failed: %m");
a2da110b 3183 _exit(EXIT_FAILURE);
d87be9b0 3184 }
04bc4a3f 3185
727fd4fd
LP
3186 if (mount_devices(arg_directory,
3187 root_device, root_device_rw,
3188 home_device, home_device_rw,
3189 srv_device, srv_device_rw) < 0)
a2da110b 3190 _exit(EXIT_FAILURE);
1b9e5b12 3191
d87be9b0
LP
3192 /* Turn directory into bind mount */
3193 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
d6797c92 3194 log_error("Failed to make bind mount: %m");
a2da110b 3195 _exit(EXIT_FAILURE);
d87be9b0 3196 }
88213476 3197
4d9f07b4
LP
3198 r = setup_volatile(arg_directory);
3199 if (r < 0)
a2da110b 3200 _exit(EXIT_FAILURE);
4d9f07b4
LP
3201
3202 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3203 _exit(EXIT_FAILURE);
4d9f07b4
LP
3204
3205 r = base_filesystem_create(arg_directory);
3206 if (r < 0)
a2da110b 3207 _exit(EXIT_FAILURE);
4d9f07b4 3208
d6797c92
LP
3209 if (arg_read_only) {
3210 k = bind_remount_recursive(arg_directory, true);
3211 if (k < 0) {
3212 log_error("Failed to make tree read-only: %s", strerror(-k));
a2da110b 3213 _exit(EXIT_FAILURE);
d87be9b0 3214 }
d6797c92 3215 }
2547bb41 3216
d87be9b0 3217 if (mount_all(arg_directory) < 0)
a2da110b 3218 _exit(EXIT_FAILURE);
57fb9fb5 3219
d87be9b0 3220 if (copy_devnodes(arg_directory) < 0)
a2da110b 3221 _exit(EXIT_FAILURE);
a258bf26 3222
f2d88580 3223 if (setup_ptmx(arg_directory) < 0)
a2da110b 3224 _exit(EXIT_FAILURE);
f2d88580 3225
d87be9b0 3226 dev_setup(arg_directory);
88213476 3227
28650077 3228 if (setup_seccomp() < 0)
a2da110b 3229 _exit(EXIT_FAILURE);
24fb1112 3230
d87be9b0 3231 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3232 _exit(EXIT_FAILURE);
88213476 3233
d87be9b0 3234 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3235 _exit(EXIT_FAILURE);
88213476 3236
03e334a1 3237 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3238
d87be9b0 3239 if (setup_boot_id(arg_directory) < 0)
a2da110b 3240 _exit(EXIT_FAILURE);
a41fe3a2 3241
d87be9b0 3242 if (setup_timezone(arg_directory) < 0)
a2da110b 3243 _exit(EXIT_FAILURE);
88213476 3244
d87be9b0 3245 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3246 _exit(EXIT_FAILURE);
687d0825 3247
d87be9b0 3248 if (setup_journal(arg_directory) < 0)
a2da110b 3249 _exit(EXIT_FAILURE);
687d0825 3250
d6797c92 3251 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3252 _exit(EXIT_FAILURE);
17fe0523 3253
d6797c92 3254 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3255 _exit(EXIT_FAILURE);
17fe0523 3256
06c17c39 3257 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3258 _exit(EXIT_FAILURE);
06c17c39 3259
486e99a3 3260 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
a2da110b 3261 _exit(EXIT_FAILURE);
9bd37b40 3262
d96c1ecf
LP
3263 /* Tell the parent that we are ready, and that
3264 * it can cgroupify us to that we lack access
3265 * to certain devices and resources. */
a2da110b 3266 barrier_place(&barrier);
d96c1ecf 3267
d87be9b0
LP
3268 if (chdir(arg_directory) < 0) {
3269 log_error("chdir(%s) failed: %m", arg_directory);
a2da110b 3270 _exit(EXIT_FAILURE);
687d0825
MV
3271 }
3272
d87be9b0
LP
3273 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3274 log_error("mount(MS_MOVE) failed: %m");
a2da110b 3275 _exit(EXIT_FAILURE);
687d0825
MV
3276 }
3277
d87be9b0
LP
3278 if (chroot(".") < 0) {
3279 log_error("chroot() failed: %m");
a2da110b 3280 _exit(EXIT_FAILURE);
687d0825
MV
3281 }
3282
d87be9b0
LP
3283 if (chdir("/") < 0) {
3284 log_error("chdir() failed: %m");
a2da110b 3285 _exit(EXIT_FAILURE);
687d0825
MV
3286 }
3287
d87be9b0
LP
3288 umask(0022);
3289
eb91eb18
LP
3290 if (arg_private_network)
3291 loopback_setup();
d87be9b0
LP
3292
3293 if (drop_capabilities() < 0) {
3294 log_error("drop_capabilities() failed: %m");
a2da110b 3295 _exit(EXIT_FAILURE);
687d0825 3296 }
687d0825 3297
0cb9fbcd
LP
3298 r = change_uid_gid(&home);
3299 if (r < 0)
a2da110b 3300 _exit(EXIT_FAILURE);
d87be9b0 3301
842f3b0f
LP
3302 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3303 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3304 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3305 log_oom();
a2da110b 3306 _exit(EXIT_FAILURE);
144f0fc0 3307 }
687d0825 3308
9444b1f2 3309 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3310 char as_uuid[37];
3311
3312 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3313 log_oom();
a2da110b 3314 _exit(EXIT_FAILURE);
842f3b0f
LP
3315 }
3316 }
3317
3318 if (fdset_size(fds) > 0) {
3319 k = fdset_cloexec(fds, false);
3320 if (k < 0) {
3321 log_error("Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3322 _exit(EXIT_FAILURE);
842f3b0f
LP
3323 }
3324
3325 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3326 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3327 log_oom();
a2da110b 3328 _exit(EXIT_FAILURE);
d87be9b0
LP
3329 }
3330 }
3331
3332 setup_hostname();
3333
6afc95b7
LP
3334 if (arg_personality != 0xffffffffLU) {
3335 if (personality(arg_personality) < 0) {
3336 log_error("personality() failed: %m");
a2da110b 3337 _exit(EXIT_FAILURE);
6afc95b7 3338 }
1b9e5b12
LP
3339 } else if (secondary) {
3340 if (personality(PER_LINUX32) < 0) {
3341 log_error("personality() failed: %m");
a2da110b 3342 _exit(EXIT_FAILURE);
1b9e5b12 3343 }
6afc95b7
LP
3344 }
3345
d96c1ecf
LP
3346#ifdef HAVE_SELINUX
3347 if (arg_selinux_context)
0cb9fbcd 3348 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 3349 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3350 _exit(EXIT_FAILURE);
0cb9fbcd 3351 }
d96c1ecf 3352#endif
354bfd2b 3353
f4889f65
LP
3354 if (!strv_isempty(arg_setenv)) {
3355 char **n;
3356
3357 n = strv_env_merge(2, envp, arg_setenv);
3358 if (!n) {
3359 log_oom();
a2da110b 3360 _exit(EXIT_FAILURE);
f4889f65
LP
3361 }
3362
3363 env_use = n;
3364 } else
3365 env_use = (char**) envp;
3366
d96c1ecf 3367 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3368 if (!barrier_place_and_sync(&barrier))
3369 _exit(EXIT_FAILURE);
d96c1ecf 3370
d87be9b0
LP
3371 if (arg_boot) {
3372 char **a;
3373 size_t l;
88213476 3374
d87be9b0 3375 /* Automatically search for the init system */
0f0dbc46 3376
d87be9b0
LP
3377 l = 1 + argc - optind;
3378 a = newa(char*, l + 1);
3379 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3380
d87be9b0 3381 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3382 execve(a[0], a, env_use);
0f0dbc46 3383
d87be9b0 3384 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3385 execve(a[0], a, env_use);
0f0dbc46 3386
d87be9b0 3387 a[0] = (char*) "/sbin/init";
f4889f65 3388 execve(a[0], a, env_use);
d87be9b0 3389 } else if (argc > optind)
f4889f65 3390 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3391 else {
3392 chdir(home ? home : "/root");
f4889f65 3393 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3394 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3395 }
3396
3397 log_error("execv() failed: %m");
d87be9b0 3398 _exit(EXIT_FAILURE);
da5b3bad 3399 }
88213476 3400
a2da110b 3401 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3402 fdset_free(fds);
3403 fds = NULL;
3404
a2da110b
DH
3405 /* wait for child-setup to be done */
3406 if (barrier_place_and_sync(&barrier)) {
5aa4bb6b 3407 int ifi = 0;
354bfd2b 3408
840295fc
LP
3409 r = move_network_interfaces(pid);
3410 if (r < 0)
3411 goto finish;
aa28aefe 3412
5aa4bb6b 3413 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3414 if (r < 0)
3415 goto finish;
ab046dde 3416
5aa4bb6b 3417 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3418 if (r < 0)
3419 goto finish;
ab046dde 3420
840295fc
LP
3421 r = setup_macvlan(pid);
3422 if (r < 0)
3423 goto finish;
c74e630d 3424
5aa4bb6b
LP
3425 r = register_machine(pid, ifi);
3426 if (r < 0)
3427 goto finish;
3428
840295fc
LP
3429 /* Block SIGCHLD here, before notifying child.
3430 * process_pty() will handle it with the other signals. */
3431 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3432 if (r < 0)
3433 goto finish;
e866af3a 3434
840295fc
LP
3435 /* Reset signal to default */
3436 r = default_signals(SIGCHLD, -1);
3437 if (r < 0)
3438 goto finish;
e866af3a 3439
840295fc
LP
3440 /* Notify the child that the parent is ready with all
3441 * its setup, and that the child can now hand over
3442 * control to the code to run inside the container. */
a2da110b 3443 barrier_place(&barrier);
354bfd2b 3444
840295fc
LP
3445 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3446 if (k < 0) {
3447 r = EXIT_FAILURE;
3448 break;
3449 }
88213476 3450
840295fc
LP
3451 if (!arg_quiet)
3452 putc('\n', stdout);
04d39279 3453
840295fc
LP
3454 /* Kill if it is not dead yet anyway */
3455 terminate_machine(pid);
3456 }
1f0cd86b 3457
840295fc 3458 /* Normally redundant, but better safe than sorry */
04d39279 3459 kill(pid, SIGKILL);
a258bf26 3460
113cea80 3461 r = wait_for_container(pid, &container_status);
04d39279
LP
3462 pid = 0;
3463
ce9f1527
LP
3464 if (r < 0) {
3465 /* We failed to wait for the container, or the
3466 * container exited abnormally */
3467 r = EXIT_FAILURE;
d87be9b0 3468 break;
ce9f1527
LP
3469 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3470 /* The container exited with a non-zero
3471 * status, or with zero status and no reboot
3472 * was requested. */
d87be9b0 3473 break;
88213476 3474
113cea80 3475 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3476
3477 if (arg_keep_unit) {
3478 /* Special handling if we are running as a
3479 * service: instead of simply restarting the
3480 * machine we want to restart the entire
3481 * service, so let's inform systemd about this
3482 * with the special exit code 133. The service
3483 * file uses RestartForceExitStatus=133 so
3484 * that this results in a full nspawn
3485 * restart. This is necessary since we might
3486 * have cgroup parameters set we want to have
3487 * flushed out. */
3488 r = 133;
3489 break;
3490 }
d87be9b0 3491 }
88213476
LP
3492
3493finish:
1b9e5b12
LP
3494 loop_remove(loop_nr, &image_fd);
3495
9444b1f2
LP
3496 if (pid > 0)
3497 kill(pid, SIGKILL);
88213476 3498
04d391da 3499 free(arg_directory);
7027ff61 3500 free(arg_machine);
c74e630d
LP
3501 free(arg_user);
3502 strv_free(arg_setenv);
3503 strv_free(arg_network_interfaces);
3504 strv_free(arg_network_macvlan);
3505 strv_free(arg_bind);
3506 strv_free(arg_bind_ro);
06c17c39 3507 strv_free(arg_tmpfs);
88213476
LP
3508
3509 return r;
3510}