]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
os-release: define /usr/lib/os-release as fallback for /etc/os-release
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
354bfd2b 43#include <sys/eventfd.h>
aa28aefe 44#include <net/if.h>
69c79d3c 45#include <linux/veth.h>
6afc95b7 46#include <sys/personality.h>
1b9e5b12 47#include <linux/loop.h>
aa28aefe 48
5d63309c 49#ifdef HAVE_SELINUX
a8828ed9
DW
50#include <selinux/selinux.h>
51#endif
88213476 52
24fb1112
LP
53#ifdef HAVE_SECCOMP
54#include <seccomp.h>
55#endif
56
1b9e5b12
LP
57#ifdef HAVE_BLKID
58#include <blkid/blkid.h>
59#endif
60
1f0cd86b
LP
61#include "sd-daemon.h"
62#include "sd-bus.h"
63#include "sd-id128.h"
aa28aefe 64#include "sd-rtnl.h"
88213476
LP
65#include "log.h"
66#include "util.h"
49e942b2 67#include "mkdir.h"
6b2d0e85 68#include "macro.h"
d7832d2c 69#include "audit.h"
94d82985 70#include "missing.h"
04d391da 71#include "cgroup-util.h"
a258bf26 72#include "strv.h"
9eb977db 73#include "path-util.h"
a41fe3a2 74#include "loopback-setup.h"
4fc9982c 75#include "dev-setup.h"
842f3b0f 76#include "fdset.h"
acbeb427 77#include "build.h"
a5c32cff 78#include "fileio.h"
40ca29a1 79#include "bus-util.h"
1f0cd86b 80#include "bus-error.h"
4ba93280 81#include "ptyfwd.h"
9bd37b40 82#include "bus-kernel.h"
f4889f65 83#include "env-util.h"
7f112f50 84#include "def.h"
aa28aefe 85#include "rtnl-util.h"
7e227024 86#include "udev-util.h"
e866af3a 87#include "eventfd-util.h"
1b9e5b12
LP
88#include "blkid-util.h"
89#include "gpt.h"
01dde061 90#include "siphash24.h"
849958d1 91#include "copy.h"
f2d88580 92
e9642be2
LP
93#ifdef HAVE_SECCOMP
94#include "seccomp-util.h"
95#endif
96
113cea80
DH
/* Possible end states of a container. The code that produces and consumes
 * these values lies outside this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
101
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= ("no", "auto",
 * "host", "guest") or -j. */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476
LP
108
109static char *arg_directory = NULL;
687d0825 110static char *arg_user = NULL;
9444b1f2 111static sd_id128_t arg_uuid = {};
7027ff61 112static char *arg_machine = NULL;
c74e630d
LP
113static const char *arg_selinux_context = NULL;
114static const char *arg_selinux_apifs_context = NULL;
9444b1f2 115static const char *arg_slice = NULL;
ff01d048 116static bool arg_private_network = false;
bc2f673e 117static bool arg_read_only = false;
0f0dbc46 118static bool arg_boot = false;
57fb9fb5 119static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
120static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 142 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
17fe0523
LP
147static char **arg_bind = NULL;
148static char **arg_bind_ro = NULL;
06c17c39 149static char **arg_tmpfs = NULL;
f4889f65 150static char **arg_setenv = NULL;
284c0b91 151static bool arg_quiet = false;
8a96d94e 152static bool arg_share_system = false;
eb91eb18 153static bool arg_register = true;
89f7c846 154static bool arg_keep_unit = false;
aa28aefe 155static char **arg_network_interfaces = NULL;
c74e630d 156static char **arg_network_macvlan = NULL;
69c79d3c 157static bool arg_network_veth = false;
c74e630d 158static const char *arg_network_bridge = NULL;
6afc95b7 159static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 160static const char *arg_image = NULL;
88213476
LP
161
162static int help(void) {
163
164 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
165 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
166 " -h --help Show this help\n"
167 " --version Print version string\n"
69c79d3c 168 " -q --quiet Do not show status information\n"
1b9e5b12
LP
169 " -D --directory=PATH Root directory for the container\n"
170 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
171 " -b --boot Boot up full system (i.e. invoke init)\n"
172 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 173 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 174 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 175 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
176 " --private-network Disable network in container\n"
177 " --network-interface=INTERFACE\n"
178 " Assign an existing network interface to the\n"
179 " container\n"
c74e630d
LP
180 " --network-macvlan=INTERFACE\n"
181 " Create a macvlan network interface based on an\n"
182 " existing network interface to the container\n"
32457153 183 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 184 " and container\n"
ab046dde 185 " --network-bridge=INTERFACE\n"
32457153 186 " Add a virtual ethernet connection between host\n"
ab046dde
TG
187 " and container and add it to an existing bridge on\n"
188 " the host\n"
82adf6af
LP
189 " -Z --selinux-context=SECLABEL\n"
190 " Set the SELinux security context to be used by\n"
191 " processes in the container\n"
192 " -L --selinux-apifs-context=SECLABEL\n"
193 " Set the SELinux security context to be used by\n"
194 " API/tmpfs file systems in the container\n"
a8828ed9
DW
195 " --capability=CAP In addition to the default, retain specified\n"
196 " capability\n"
197 " --drop-capability=CAP Drop the specified capability from the default set\n"
198 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
199 " -j Equivalent to --link-journal=host\n"
69c79d3c 200 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
201 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
202 " the container\n"
203 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 204 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 205 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 206 " --share-system Share system namespaces with host\n"
eb91eb18 207 " --register=BOOLEAN Register container as machine\n"
89f7c846 208 " --keep-unit Do not register a scope for the machine, reuse\n"
69c79d3c 209 " the service unit nspawn is running in\n",
88213476
LP
210 program_invocation_short_name);
211
212 return 0;
213}
214
215static int parse_argv(int argc, char *argv[]) {
216
a41fe3a2 217 enum {
acbeb427
ZJS
218 ARG_VERSION = 0x100,
219 ARG_PRIVATE_NETWORK,
bc2f673e 220 ARG_UUID,
5076f0cc 221 ARG_READ_ONLY,
57fb9fb5 222 ARG_CAPABILITY,
420c7379 223 ARG_DROP_CAPABILITY,
17fe0523
LP
224 ARG_LINK_JOURNAL,
225 ARG_BIND,
f4889f65 226 ARG_BIND_RO,
06c17c39 227 ARG_TMPFS,
f4889f65 228 ARG_SETENV,
eb91eb18 229 ARG_SHARE_SYSTEM,
89f7c846 230 ARG_REGISTER,
aa28aefe 231 ARG_KEEP_UNIT,
69c79d3c 232 ARG_NETWORK_INTERFACE,
c74e630d 233 ARG_NETWORK_MACVLAN,
69c79d3c 234 ARG_NETWORK_VETH,
ab046dde 235 ARG_NETWORK_BRIDGE,
6afc95b7 236 ARG_PERSONALITY,
a41fe3a2
LP
237 };
238
88213476 239 static const struct option options[] = {
aa28aefe
LP
240 { "help", no_argument, NULL, 'h' },
241 { "version", no_argument, NULL, ARG_VERSION },
242 { "directory", required_argument, NULL, 'D' },
243 { "user", required_argument, NULL, 'u' },
244 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
245 { "boot", no_argument, NULL, 'b' },
246 { "uuid", required_argument, NULL, ARG_UUID },
247 { "read-only", no_argument, NULL, ARG_READ_ONLY },
248 { "capability", required_argument, NULL, ARG_CAPABILITY },
249 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
250 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
251 { "bind", required_argument, NULL, ARG_BIND },
252 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 253 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
254 { "machine", required_argument, NULL, 'M' },
255 { "slice", required_argument, NULL, 'S' },
256 { "setenv", required_argument, NULL, ARG_SETENV },
257 { "selinux-context", required_argument, NULL, 'Z' },
258 { "selinux-apifs-context", required_argument, NULL, 'L' },
259 { "quiet", no_argument, NULL, 'q' },
260 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
261 { "register", required_argument, NULL, ARG_REGISTER },
262 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
263 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 264 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
265 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
266 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 267 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 268 { "image", required_argument, NULL, 'i' },
eb9da376 269 {}
88213476
LP
270 };
271
9444b1f2 272 int c, r;
a42c8b54 273 uint64_t plus = 0, minus = 0;
88213476
LP
274
275 assert(argc >= 0);
276 assert(argv);
277
1b9e5b12 278 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
88213476
LP
279
280 switch (c) {
281
282 case 'h':
eb9da376 283 return help();
88213476 284
acbeb427
ZJS
285 case ARG_VERSION:
286 puts(PACKAGE_STRING);
287 puts(SYSTEMD_FEATURES);
288 return 0;
289
88213476
LP
290 case 'D':
291 free(arg_directory);
3a74cea5
LP
292 arg_directory = canonicalize_file_name(optarg);
293 if (!arg_directory) {
898d5c91 294 log_error("Invalid root directory: %m");
88213476
LP
295 return -ENOMEM;
296 }
297
298 break;
299
1b9e5b12
LP
300 case 'i':
301 arg_image = optarg;
302 break;
303
687d0825
MV
304 case 'u':
305 free(arg_user);
7027ff61
LP
306 arg_user = strdup(optarg);
307 if (!arg_user)
308 return log_oom();
687d0825
MV
309
310 break;
311
ab046dde 312 case ARG_NETWORK_BRIDGE:
c74e630d 313 arg_network_bridge = optarg;
ab046dde
TG
314
315 /* fall through */
316
69c79d3c
LP
317 case ARG_NETWORK_VETH:
318 arg_network_veth = true;
319 arg_private_network = true;
320 break;
321
aa28aefe 322 case ARG_NETWORK_INTERFACE:
c74e630d
LP
323 if (strv_extend(&arg_network_interfaces, optarg) < 0)
324 return log_oom();
325
326 arg_private_network = true;
327 break;
328
329 case ARG_NETWORK_MACVLAN:
330 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
331 return log_oom();
332
333 /* fall through */
334
ff01d048
LP
335 case ARG_PRIVATE_NETWORK:
336 arg_private_network = true;
a41fe3a2
LP
337 break;
338
0f0dbc46
LP
339 case 'b':
340 arg_boot = true;
341 break;
342
144f0fc0 343 case ARG_UUID:
9444b1f2
LP
344 r = sd_id128_from_string(optarg, &arg_uuid);
345 if (r < 0) {
aa96c6cb 346 log_error("Invalid UUID: %s", optarg);
9444b1f2 347 return r;
aa96c6cb 348 }
9444b1f2 349 break;
aa96c6cb 350
9444b1f2 351 case 'S':
c74e630d 352 arg_slice = optarg;
144f0fc0
LP
353 break;
354
7027ff61 355 case 'M':
eb91eb18
LP
356 if (isempty(optarg)) {
357 free(arg_machine);
358 arg_machine = NULL;
359 } else {
7027ff61 360
eb91eb18
LP
361 if (!hostname_is_valid(optarg)) {
362 log_error("Invalid machine name: %s", optarg);
363 return -EINVAL;
364 }
7027ff61 365
eb91eb18
LP
366 free(arg_machine);
367 arg_machine = strdup(optarg);
368 if (!arg_machine)
369 return log_oom();
370
371 break;
372 }
7027ff61 373
82adf6af
LP
374 case 'Z':
375 arg_selinux_context = optarg;
a8828ed9
DW
376 break;
377
82adf6af
LP
378 case 'L':
379 arg_selinux_apifs_context = optarg;
a8828ed9
DW
380 break;
381
bc2f673e
LP
382 case ARG_READ_ONLY:
383 arg_read_only = true;
384 break;
385
420c7379
LP
386 case ARG_CAPABILITY:
387 case ARG_DROP_CAPABILITY: {
5076f0cc
LP
388 char *state, *word;
389 size_t length;
390
391 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 392 _cleanup_free_ char *t;
5076f0cc 393 cap_value_t cap;
5076f0cc
LP
394
395 t = strndup(word, length);
0d0f0c50
SL
396 if (!t)
397 return log_oom();
5076f0cc 398
39ed67d1
LP
399 if (streq(t, "all")) {
400 if (c == ARG_CAPABILITY)
a42c8b54 401 plus = (uint64_t) -1;
39ed67d1 402 else
a42c8b54 403 minus = (uint64_t) -1;
39ed67d1
LP
404 } else {
405 if (cap_from_name(t, &cap) < 0) {
406 log_error("Failed to parse capability %s.", t);
407 return -EINVAL;
408 }
409
410 if (c == ARG_CAPABILITY)
a42c8b54 411 plus |= 1ULL << (uint64_t) cap;
39ed67d1 412 else
a42c8b54 413 minus |= 1ULL << (uint64_t) cap;
5076f0cc 414 }
5076f0cc
LP
415 }
416
417 break;
418 }
419
57fb9fb5
LP
420 case 'j':
421 arg_link_journal = LINK_GUEST;
422 break;
423
424 case ARG_LINK_JOURNAL:
425 if (streq(optarg, "auto"))
426 arg_link_journal = LINK_AUTO;
427 else if (streq(optarg, "no"))
428 arg_link_journal = LINK_NO;
429 else if (streq(optarg, "guest"))
430 arg_link_journal = LINK_GUEST;
431 else if (streq(optarg, "host"))
432 arg_link_journal = LINK_HOST;
433 else {
434 log_error("Failed to parse link journal mode %s", optarg);
435 return -EINVAL;
436 }
437
438 break;
439
17fe0523
LP
440 case ARG_BIND:
441 case ARG_BIND_RO: {
442 _cleanup_free_ char *a = NULL, *b = NULL;
443 char *e;
444 char ***x;
17fe0523
LP
445
446 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
447
448 e = strchr(optarg, ':');
449 if (e) {
450 a = strndup(optarg, e - optarg);
451 b = strdup(e + 1);
452 } else {
453 a = strdup(optarg);
454 b = strdup(optarg);
455 }
456
457 if (!a || !b)
458 return log_oom();
459
460 if (!path_is_absolute(a) || !path_is_absolute(b)) {
461 log_error("Invalid bind mount specification: %s", optarg);
462 return -EINVAL;
463 }
464
465 r = strv_extend(x, a);
466 if (r < 0)
b3451bed 467 return log_oom();
17fe0523
LP
468
469 r = strv_extend(x, b);
470 if (r < 0)
b3451bed 471 return log_oom();
17fe0523
LP
472
473 break;
474 }
475
06c17c39
LP
476 case ARG_TMPFS: {
477 _cleanup_free_ char *a = NULL, *b = NULL;
478 char *e;
479
480 e = strchr(optarg, ':');
481 if (e) {
482 a = strndup(optarg, e - optarg);
483 b = strdup(e + 1);
484 } else {
485 a = strdup(optarg);
486 b = strdup("mode=0755");
487 }
488
489 if (!a || !b)
490 return log_oom();
491
492 if (!path_is_absolute(a)) {
493 log_error("Invalid tmpfs specification: %s", optarg);
494 return -EINVAL;
495 }
496
497 r = strv_push(&arg_tmpfs, a);
498 if (r < 0)
499 return log_oom();
500
501 a = NULL;
502
503 r = strv_push(&arg_tmpfs, b);
504 if (r < 0)
505 return log_oom();
506
507 b = NULL;
508
509 break;
510 }
511
f4889f65
LP
512 case ARG_SETENV: {
513 char **n;
514
515 if (!env_assignment_is_valid(optarg)) {
516 log_error("Environment variable assignment '%s' is not valid.", optarg);
517 return -EINVAL;
518 }
519
520 n = strv_env_set(arg_setenv, optarg);
521 if (!n)
522 return log_oom();
523
524 strv_free(arg_setenv);
525 arg_setenv = n;
526 break;
527 }
528
284c0b91
LP
529 case 'q':
530 arg_quiet = true;
531 break;
532
8a96d94e
LP
533 case ARG_SHARE_SYSTEM:
534 arg_share_system = true;
535 break;
536
eb91eb18
LP
537 case ARG_REGISTER:
538 r = parse_boolean(optarg);
539 if (r < 0) {
540 log_error("Failed to parse --register= argument: %s", optarg);
541 return r;
542 }
543
544 arg_register = r;
545 break;
546
89f7c846
LP
547 case ARG_KEEP_UNIT:
548 arg_keep_unit = true;
549 break;
550
6afc95b7
LP
551 case ARG_PERSONALITY:
552
ac45f971 553 arg_personality = personality_from_string(optarg);
6afc95b7
LP
554 if (arg_personality == 0xffffffffLU) {
555 log_error("Unknown or unsupported personality '%s'.", optarg);
556 return -EINVAL;
557 }
558
559 break;
560
88213476
LP
561 case '?':
562 return -EINVAL;
563
564 default:
eb9da376 565 assert_not_reached("Unhandled option");
88213476
LP
566 }
567 }
568
eb91eb18
LP
569 if (arg_share_system)
570 arg_register = false;
571
572 if (arg_boot && arg_share_system) {
573 log_error("--boot and --share-system may not be combined.");
574 return -EINVAL;
575 }
576
89f7c846
LP
577 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
578 log_error("--keep-unit may not be used when invoked from a user session.");
579 return -EINVAL;
580 }
581
1b9e5b12
LP
582 if (arg_directory && arg_image) {
583 log_error("--directory= and --image= may not be combined.");
584 return -EINVAL;
585 }
586
a42c8b54
LP
587 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
588
88213476
LP
589 return 1;
590}
591
592static int mount_all(const char *dest) {
593
594 typedef struct MountPoint {
595 const char *what;
596 const char *where;
597 const char *type;
598 const char *options;
599 unsigned long flags;
3bd66c05 600 bool fatal;
88213476
LP
601 } MountPoint;
602
603 static const MountPoint mount_table[] = {
06c17c39
LP
604 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
605 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
606 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
607 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
608 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 609 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
610 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
611 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 612#ifdef HAVE_SELINUX
06c17c39
LP
613 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
614 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 615#endif
88213476
LP
616 };
617
618 unsigned k;
619 int r = 0;
620
621 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 622 _cleanup_free_ char *where = NULL;
d002827b 623#ifdef HAVE_SELINUX
a8828ed9 624 _cleanup_free_ char *options = NULL;
d002827b
LP
625#endif
626 const char *o;
88213476
LP
627 int t;
628
17fe0523
LP
629 where = strjoin(dest, "/", mount_table[k].where, NULL);
630 if (!where)
631 return log_oom();
88213476 632
e65aec12 633 t = path_is_mount_point(where, true);
68fb0892 634 if (t < 0) {
88213476 635 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
636
637 if (r == 0)
638 r = t;
639
640 continue;
641 }
642
9c1c7f71
LP
643 /* Skip this entry if it is not a remount. */
644 if (mount_table[k].what && t > 0)
014a9c77
LP
645 continue;
646
17fe0523 647 mkdir_p(where, 0755);
88213476 648
a8828ed9 649#ifdef HAVE_SELINUX
82adf6af
LP
650 if (arg_selinux_apifs_context &&
651 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
652 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
653 if (!options)
654 return log_oom();
655
656 o = options;
657 } else
a8828ed9 658#endif
d002827b 659 o = mount_table[k].options;
a8828ed9 660
a8828ed9 661
88213476
LP
662 if (mount(mount_table[k].what,
663 where,
664 mount_table[k].type,
665 mount_table[k].flags,
d002827b 666 o) < 0 &&
3bd66c05 667 mount_table[k].fatal) {
88213476
LP
668
669 log_error("mount(%s) failed: %m", where);
670
671 if (r == 0)
672 r = -errno;
673 }
88213476
LP
674 }
675
e58a1277
LP
676 return r;
677}
f8440af5 678
d6797c92 679static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
680 char **x, **y;
681
682 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 683 _cleanup_free_ char *where = NULL;
d2421337 684 struct stat source_st, dest_st;
2ed4e5e0 685 int r;
d2421337
DR
686
687 if (stat(*x, &source_st) < 0) {
1b9e5b12 688 log_error("Failed to stat %s: %m", *x);
d2421337
DR
689 return -errno;
690 }
17fe0523 691
06c17c39
LP
692 where = strappend(dest, *y);
693 if (!where)
694 return log_oom();
695
2ed4e5e0
SL
696 r = stat(where, &dest_st);
697 if (r == 0) {
d2421337 698 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 699 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
700 return -EINVAL;
701 }
2ed4e5e0
SL
702 } else if (errno == ENOENT) {
703 r = mkdir_parents_label(where, 0755);
704 if (r < 0) {
705 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
706 return r;
d2421337 707 }
2ed4e5e0 708 } else {
590b6b91 709 log_error("Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
710 return -errno;
711 }
06c17c39 712
2ed4e5e0
SL
713 /* Create the mount point, but be conservative -- refuse to create block
714 * and char devices. */
715 if (S_ISDIR(source_st.st_mode))
716 mkdir_label(where, 0755);
717 else if (S_ISFIFO(source_st.st_mode))
718 mkfifo(where, 0644);
719 else if (S_ISSOCK(source_st.st_mode))
720 mknod(where, 0644 | S_IFSOCK, 0);
721 else if (S_ISREG(source_st.st_mode))
722 touch(where);
723 else {
724 log_error("Refusing to create mountpoint for file: %s", *x);
725 return -ENOTSUP;
d2421337 726 }
17fe0523
LP
727
728 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
729 log_error("mount(%s) failed: %m", where);
730 return -errno;
731 }
732
d6797c92
LP
733 if (ro) {
734 r = bind_remount_recursive(where, true);
735 if (r < 0) {
736 log_error("Read-Only bind mount failed: %s", strerror(-r));
737 return r;
738 }
17fe0523
LP
739 }
740 }
741
742 return 0;
743}
744
06c17c39
LP
745static int mount_tmpfs(const char *dest) {
746 char **i, **o;
747
748 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
749 _cleanup_free_ char *where = NULL;
750
751 where = strappend(dest, *i);
752 if (!where)
753 return log_oom();
754
755 mkdir_label(where, 0755);
756
757 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
758 log_error("tmpfs mount to %s failed: %m", where);
759 return -errno;
760 }
761 }
762
763 return 0;
764}
765
e58a1277 766static int setup_timezone(const char *dest) {
d4036145
LP
767 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
768 char *z, *y;
769 int r;
f8440af5 770
e58a1277
LP
771 assert(dest);
772
773 /* Fix the timezone, if possible */
d4036145
LP
774 r = readlink_malloc("/etc/localtime", &p);
775 if (r < 0) {
776 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
777 return 0;
778 }
779
780 z = path_startswith(p, "../usr/share/zoneinfo/");
781 if (!z)
782 z = path_startswith(p, "/usr/share/zoneinfo/");
783 if (!z) {
784 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
785 return 0;
786 }
787
04bc4a3f
LP
788 where = strappend(dest, "/etc/localtime");
789 if (!where)
0d0f0c50 790 return log_oom();
715ac17a 791
d4036145
LP
792 r = readlink_malloc(where, &q);
793 if (r >= 0) {
794 y = path_startswith(q, "../usr/share/zoneinfo/");
795 if (!y)
796 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 797
4d1c38b8 798
d4036145
LP
799 /* Already pointing to the right place? Then do nothing .. */
800 if (y && streq(y, z))
801 return 0;
802 }
803
804 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
805 if (!check)
0d0f0c50 806 return log_oom();
4d1c38b8 807
d4036145
LP
808 if (access(check, F_OK) < 0) {
809 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
810 return 0;
811 }
68fb0892 812
d4036145
LP
813 what = strappend("../usr/share/zoneinfo/", z);
814 if (!what)
815 return log_oom();
816
817 unlink(where);
818 if (symlink(what, where) < 0) {
819 log_error("Failed to correct timezone of container: %m");
820 return 0;
821 }
e58a1277
LP
822
823 return 0;
88213476
LP
824}
825
2547bb41 826static int setup_resolv_conf(const char *dest) {
f333fbb1 827 char _cleanup_free_ *where = NULL;
2547bb41
LP
828
829 assert(dest);
830
831 if (arg_private_network)
832 return 0;
833
834 /* Fix resolv.conf, if possible */
04bc4a3f
LP
835 where = strappend(dest, "/etc/resolv.conf");
836 if (!where)
0d0f0c50 837 return log_oom();
2547bb41 838
77e63faf
LP
839 /* We don't really care for the results of this really. If it
840 * fails, it fails, but meh... */
849958d1 841 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
2547bb41
LP
842
843 return 0;
844}
845
9f24adc2
LP
846static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
847
848 snprintf(s, 37,
849 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
850 SD_ID128_FORMAT_VAL(id));
851
852 return s;
853}
854
04bc4a3f 855static int setup_boot_id(const char *dest) {
7fd1b19b 856 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 857 sd_id128_t rnd = {};
04bc4a3f
LP
858 char as_uuid[37];
859 int r;
860
861 assert(dest);
862
eb91eb18
LP
863 if (arg_share_system)
864 return 0;
865
04bc4a3f
LP
866 /* Generate a new randomized boot ID, so that each boot-up of
867 * the container gets a new one */
868
869 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 870 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
871 if (!from || !to)
872 return log_oom();
04bc4a3f
LP
873
874 r = sd_id128_randomize(&rnd);
875 if (r < 0) {
876 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 877 return r;
04bc4a3f
LP
878 }
879
9f24adc2 880 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 881
574d5f2d 882 r = write_string_file(from, as_uuid);
04bc4a3f
LP
883 if (r < 0) {
884 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 885 return r;
04bc4a3f
LP
886 }
887
888 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
889 log_error("Failed to bind mount boot id: %m");
890 r = -errno;
10d18763
ZJS
891 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
892 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
893
894 unlink(from);
04bc4a3f
LP
895 return r;
896}
897
e58a1277 898static int copy_devnodes(const char *dest) {
88213476
LP
899
900 static const char devnodes[] =
901 "null\0"
902 "zero\0"
903 "full\0"
904 "random\0"
905 "urandom\0"
f2d88580 906 "tty\0";
88213476
LP
907
908 const char *d;
e58a1277 909 int r = 0;
7fd1b19b 910 _cleanup_umask_ mode_t u;
a258bf26
LP
911
912 assert(dest);
124640f1
LP
913
914 u = umask(0000);
88213476
LP
915
916 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 917 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 918 struct stat st;
88213476 919
7f112f50
LP
920 from = strappend("/dev/", d);
921 to = strjoin(dest, "/dev/", d, NULL);
922 if (!from || !to)
923 return log_oom();
88213476
LP
924
925 if (stat(from, &st) < 0) {
926
927 if (errno != ENOENT) {
928 log_error("Failed to stat %s: %m", from);
7f112f50 929 return -errno;
88213476
LP
930 }
931
a258bf26 932 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 933
ed8b7a3e 934 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 935 return -EIO;
a258bf26
LP
936
937 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
938
939 log_error("mknod(%s) failed: %m", dest);
7f112f50 940 return -errno;
88213476 941 }
88213476
LP
942 }
943
e58a1277
LP
944 return r;
945}
88213476 946
f2d88580
LP
947static int setup_ptmx(const char *dest) {
948 _cleanup_free_ char *p = NULL;
949
950 p = strappend(dest, "/dev/ptmx");
951 if (!p)
952 return log_oom();
953
954 if (symlink("pts/ptmx", p) < 0) {
955 log_error("Failed to create /dev/ptmx symlink: %m");
956 return -errno;
957 }
958
959 return 0;
960}
961
e58a1277 962static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
963 _cleanup_umask_ mode_t u;
964 const char *to;
e58a1277 965 struct stat st;
e58a1277 966 int r;
e58a1277
LP
967
968 assert(dest);
969 assert(console);
970
971 u = umask(0000);
972
eb0f0863
LP
973 if (stat("/dev/null", &st) < 0) {
974 log_error("Failed to stat /dev/null: %m");
25ea79fe 975 return -errno;
e58a1277 976 }
88213476 977
e58a1277
LP
978 r = chmod_and_chown(console, 0600, 0, 0);
979 if (r < 0) {
980 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 981 return r;
a258bf26 982 }
88213476 983
a258bf26
LP
984 /* We need to bind mount the right tty to /dev/console since
985 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
986 * to bind mount things on we create a device node first, and
987 * use /dev/null for that since we the cgroups device policy
988 * allows us to create that freely, while we cannot create
989 * /dev/console. (Note that the major minor doesn't actually
990 * matter here, since we mount it over anyway). */
a258bf26 991
eb0f0863 992 to = strappenda(dest, "/dev/console");
e58a1277
LP
993 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
994 log_error("mknod() for /dev/console failed: %m");
25ea79fe 995 return -errno;
e58a1277 996 }
a258bf26
LP
997
998 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 999 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 1000 return -errno;
a258bf26
LP
1001 }
1002
25ea79fe 1003 return 0;
e58a1277
LP
1004}
1005
1006static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1007 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1008 int r, fd, k;
7fd1b19b 1009 _cleanup_umask_ mode_t u;
e58a1277
LP
1010 union {
1011 struct cmsghdr cmsghdr;
1012 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1013 } control = {};
1014 struct msghdr mh = {
1015 .msg_control = &control,
1016 .msg_controllen = sizeof(control),
1017 };
e58a1277
LP
1018 struct cmsghdr *cmsg;
1019
1020 assert(dest);
1021 assert(kmsg_socket >= 0);
a258bf26 1022
e58a1277 1023 u = umask(0000);
a258bf26 1024
f1e5dfe2
LP
1025 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1026 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1027 * on the reading side behave very similar to /proc/kmsg,
1028 * their writing side behaves differently from /dev/kmsg in
1029 * that writing blocks when nothing is reading. In order to
1030 * avoid any problems with containers deadlocking due to this
1031 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1032 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1033 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1034 return log_oom();
e58a1277
LP
1035
1036 if (mkfifo(from, 0600) < 0) {
1037 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 1038 return -errno;
e58a1277
LP
1039 }
1040
1041 r = chmod_and_chown(from, 0600, 0, 0);
1042 if (r < 0) {
1043 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 1044 return r;
e58a1277
LP
1045 }
1046
1047 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1048 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 1049 return -errno;
e58a1277
LP
1050 }
1051
1052 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1053 if (fd < 0) {
1054 log_error("Failed to open fifo: %m");
25ea79fe 1055 return -errno;
e58a1277
LP
1056 }
1057
e58a1277
LP
1058 cmsg = CMSG_FIRSTHDR(&mh);
1059 cmsg->cmsg_level = SOL_SOCKET;
1060 cmsg->cmsg_type = SCM_RIGHTS;
1061 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1062 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1063
1064 mh.msg_controllen = cmsg->cmsg_len;
1065
1066 /* Store away the fd in the socket, so that it stays open as
1067 * long as we run the child */
1068 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1069 safe_close(fd);
e58a1277
LP
1070
1071 if (k < 0) {
1072 log_error("Failed to send FIFO fd: %m");
25ea79fe 1073 return -errno;
a258bf26
LP
1074 }
1075
f1e5dfe2
LP
1076 /* And now make the FIFO unavailable as /dev/kmsg... */
1077 unlink(from);
25ea79fe 1078 return 0;
88213476
LP
1079}
1080
3a74cea5 1081static int setup_hostname(void) {
3a74cea5 1082
eb91eb18
LP
1083 if (arg_share_system)
1084 return 0;
1085
7027ff61
LP
1086 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1087 return -errno;
3a74cea5 1088
7027ff61 1089 return 0;
3a74cea5
LP
1090}
1091
57fb9fb5 1092static int setup_journal(const char *directory) {
4d680aee 1093 sd_id128_t machine_id, this_id;
7fd1b19b 1094 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1095 char *id;
57fb9fb5
LP
1096 int r;
1097
57fb9fb5 1098 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1099 if (!p)
1100 return log_oom();
57fb9fb5
LP
1101
1102 r = read_one_line_file(p, &b);
27407a01
ZJS
1103 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1104 return 0;
1105 else if (r < 0) {
1106 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1107 return r;
1108 }
1109
27407a01
ZJS
1110 id = strstrip(b);
1111 if (isempty(id) && arg_link_journal == LINK_AUTO)
1112 return 0;
57fb9fb5 1113
27407a01
ZJS
1114 /* Verify validity */
1115 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1116 if (r < 0) {
27407a01
ZJS
1117 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1118 return r;
57fb9fb5
LP
1119 }
1120
4d680aee
ZJS
1121 r = sd_id128_get_machine(&this_id);
1122 if (r < 0) {
1123 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1124 return r;
1125 }
1126
1127 if (sd_id128_equal(machine_id, this_id)) {
1128 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1129 "Host and machine ids are equal (%s): refusing to link journals", id);
1130 if (arg_link_journal == LINK_AUTO)
1131 return 0;
1132 return
1133 -EEXIST;
1134 }
1135
1136 if (arg_link_journal == LINK_NO)
1137 return 0;
1138
57fb9fb5 1139 free(p);
27407a01
ZJS
1140 p = strappend("/var/log/journal/", id);
1141 q = strjoin(directory, "/var/log/journal/", id, NULL);
1142 if (!p || !q)
1143 return log_oom();
1144
1145 if (path_is_mount_point(p, false) > 0) {
1146 if (arg_link_journal != LINK_AUTO) {
1147 log_error("%s: already a mount point, refusing to use for journal", p);
1148 return -EEXIST;
1149 }
1150
1151 return 0;
57fb9fb5
LP
1152 }
1153
27407a01 1154 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1155 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1156 log_error("%s: already a mount point, refusing to use for journal", q);
1157 return -EEXIST;
57fb9fb5
LP
1158 }
1159
27407a01 1160 return 0;
57fb9fb5
LP
1161 }
1162
1163 r = readlink_and_make_absolute(p, &d);
1164 if (r >= 0) {
1165 if ((arg_link_journal == LINK_GUEST ||
1166 arg_link_journal == LINK_AUTO) &&
1167 path_equal(d, q)) {
1168
27407a01
ZJS
1169 r = mkdir_p(q, 0755);
1170 if (r < 0)
1171 log_warning("failed to create directory %s: %m", q);
1172 return 0;
57fb9fb5
LP
1173 }
1174
1175 if (unlink(p) < 0) {
1176 log_error("Failed to remove symlink %s: %m", p);
27407a01 1177 return -errno;
57fb9fb5
LP
1178 }
1179 } else if (r == -EINVAL) {
1180
1181 if (arg_link_journal == LINK_GUEST &&
1182 rmdir(p) < 0) {
1183
27407a01
ZJS
1184 if (errno == ENOTDIR) {
1185 log_error("%s already exists and is neither a symlink nor a directory", p);
1186 return r;
1187 } else {
57fb9fb5 1188 log_error("Failed to remove %s: %m", p);
27407a01 1189 return -errno;
57fb9fb5 1190 }
57fb9fb5
LP
1191 }
1192 } else if (r != -ENOENT) {
1193 log_error("readlink(%s) failed: %m", p);
27407a01 1194 return r;
57fb9fb5
LP
1195 }
1196
1197 if (arg_link_journal == LINK_GUEST) {
1198
1199 if (symlink(q, p) < 0) {
1200 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1201 return -errno;
57fb9fb5
LP
1202 }
1203
27407a01
ZJS
1204 r = mkdir_p(q, 0755);
1205 if (r < 0)
1206 log_warning("failed to create directory %s: %m", q);
1207 return 0;
57fb9fb5
LP
1208 }
1209
1210 if (arg_link_journal == LINK_HOST) {
1211 r = mkdir_p(p, 0755);
1212 if (r < 0) {
1213 log_error("Failed to create %s: %m", p);
27407a01 1214 return r;
57fb9fb5
LP
1215 }
1216
27407a01
ZJS
1217 } else if (access(p, F_OK) < 0)
1218 return 0;
57fb9fb5 1219
cdb2b9d0
LP
1220 if (dir_is_empty(q) == 0)
1221 log_warning("%s is not empty, proceeding anyway.", q);
1222
57fb9fb5
LP
1223 r = mkdir_p(q, 0755);
1224 if (r < 0) {
1225 log_error("Failed to create %s: %m", q);
27407a01 1226 return r;
57fb9fb5
LP
1227 }
1228
1229 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1230 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1231 return -errno;
57fb9fb5
LP
1232 }
1233
27407a01 1234 return 0;
57fb9fb5
LP
1235}
1236
9bd37b40
LP
1237static int setup_kdbus(const char *dest, const char *path) {
1238 const char *p;
1239
1240 if (!path)
1241 return 0;
1242
1243 p = strappenda(dest, "/dev/kdbus");
1244 if (mkdir(p, 0755) < 0) {
1245 log_error("Failed to create kdbus path: %m");
1246 return -errno;
1247 }
1248
1249 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1250 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1251 return -errno;
1252 }
1253
1254 return 0;
1255}
1256
88213476 1257static int drop_capabilities(void) {
5076f0cc 1258 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1259}
1260
354bfd2b 1261static int register_machine(pid_t pid) {
9444b1f2
LP
1262 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1263 _cleanup_bus_unref_ sd_bus *bus = NULL;
1264 int r;
1265
eb91eb18
LP
1266 if (!arg_register)
1267 return 0;
1268
1c03020c 1269 r = sd_bus_default_system(&bus);
9444b1f2
LP
1270 if (r < 0) {
1271 log_error("Failed to open system bus: %s", strerror(-r));
1272 return r;
1273 }
1274
89f7c846
LP
1275 if (arg_keep_unit) {
1276 r = sd_bus_call_method(
1277 bus,
1278 "org.freedesktop.machine1",
1279 "/org/freedesktop/machine1",
1280 "org.freedesktop.machine1.Manager",
1281 "RegisterMachine",
1282 &error,
1283 NULL,
1284 "sayssus",
1285 arg_machine,
1286 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1287 "nspawn",
1288 "container",
1289 (uint32_t) pid,
1290 strempty(arg_directory));
1291 } else {
9457ac5b
LP
1292 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1293
1294 r = sd_bus_message_new_method_call(
89f7c846 1295 bus,
9457ac5b 1296 &m,
89f7c846
LP
1297 "org.freedesktop.machine1",
1298 "/org/freedesktop/machine1",
1299 "org.freedesktop.machine1.Manager",
9457ac5b
LP
1300 "CreateMachine");
1301 if (r < 0) {
1302 log_error("Failed to create message: %s", strerror(-r));
1303 return r;
1304 }
1305
1306 r = sd_bus_message_append(
1307 m,
1308 "sayssus",
89f7c846
LP
1309 arg_machine,
1310 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1311 "nspawn",
1312 "container",
1313 (uint32_t) pid,
9457ac5b
LP
1314 strempty(arg_directory));
1315 if (r < 0) {
1316 log_error("Failed to append message arguments: %s", strerror(-r));
1317 return r;
1318 }
1319
1320 r = sd_bus_message_open_container(m, 'a', "(sv)");
1321 if (r < 0) {
1322 log_error("Failed to open container: %s", strerror(-r));
1323 return r;
1324 }
1325
1326 if (!isempty(arg_slice)) {
1327 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1328 if (r < 0) {
1329 log_error("Failed to append slice: %s", strerror(-r));
1330 return r;
1331 }
1332 }
1333
1334 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1335 if (r < 0) {
1336 log_error("Failed to add device policy: %s", strerror(-r));
1337 return r;
1338 }
1339
a07f961e 1340 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
9457ac5b
LP
1341 /* Allow the container to
1342 * access and create the API
1343 * device nodes, so that
1344 * PrivateDevices= in the
1345 * container can work
1346 * fine */
1347 "/dev/null", "rwm",
1348 "/dev/zero", "rwm",
1349 "/dev/full", "rwm",
1350 "/dev/random", "rwm",
1351 "/dev/urandom", "rwm",
1352 "/dev/tty", "rwm",
1353 /* Allow the container
1354 * access to ptys. However,
1355 * do not permit the
1356 * container to ever create
1357 * these device nodes. */
1358 "/dev/pts/ptmx", "rw",
a07f961e
LP
1359 "char-pts", "rw",
1360 /* Allow the container
1361 * access to all kdbus
1362 * devices. Again, the
1363 * container cannot create
1364 * these nodes, only use
1365 * them. We use a pretty
1366 * open match here, so that
1367 * the kernel API can still
1368 * change. */
1369 "char-kdbus", "rw",
1370 "char-kdbus/*", "rw");
9457ac5b
LP
1371 if (r < 0) {
1372 log_error("Failed to add device whitelist: %s", strerror(-r));
1373 return r;
1374 }
1375
1376 r = sd_bus_message_close_container(m);
1377 if (r < 0) {
1378 log_error("Failed to close container: %s", strerror(-r));
1379 return r;
1380 }
1381
1382 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1383 }
1384
9444b1f2 1385 if (r < 0) {
1f0cd86b
LP
1386 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1387 return r;
1388 }
1389
1390 return 0;
1391}
1392
1393static int terminate_machine(pid_t pid) {
1394 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1395 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1396 _cleanup_bus_unref_ sd_bus *bus = NULL;
1397 const char *path;
1398 int r;
1399
eb91eb18
LP
1400 if (!arg_register)
1401 return 0;
1402
76b54375 1403 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1404 if (r < 0) {
1405 log_error("Failed to open system bus: %s", strerror(-r));
1406 return r;
1407 }
1408
1409 r = sd_bus_call_method(
1410 bus,
1411 "org.freedesktop.machine1",
1412 "/org/freedesktop/machine1",
1413 "org.freedesktop.machine1.Manager",
1414 "GetMachineByPID",
1415 &error,
1416 &reply,
1417 "u",
1418 (uint32_t) pid);
1419 if (r < 0) {
1420 /* Note that the machine might already have been
1421 * cleaned up automatically, hence don't consider it a
1422 * failure if we cannot get the machine object. */
1423 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1424 return 0;
1425 }
1426
1427 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1428 if (r < 0)
1429 return bus_log_parse_error(r);
9444b1f2 1430
1f0cd86b
LP
1431 r = sd_bus_call_method(
1432 bus,
1433 "org.freedesktop.machine1",
1434 path,
1435 "org.freedesktop.machine1.Machine",
1436 "Terminate",
1437 &error,
1438 NULL,
1439 NULL);
1440 if (r < 0) {
1441 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1442 return 0;
1443 }
1444
9444b1f2
LP
1445 return 0;
1446}
1447
db999e0f
LP
1448static int reset_audit_loginuid(void) {
1449 _cleanup_free_ char *p = NULL;
1450 int r;
1451
1452 if (arg_share_system)
1453 return 0;
1454
1455 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1456 if (r == -ENOENT)
db999e0f
LP
1457 return 0;
1458 if (r < 0) {
1459 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1460 return r;
1461 }
1462
1463 /* Already reset? */
1464 if (streq(p, "4294967295"))
1465 return 0;
1466
1467 r = write_string_file("/proc/self/loginuid", "4294967295");
1468 if (r < 0) {
1469 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1470 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1471 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1472 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1473 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1474
db999e0f 1475 sleep(5);
77b6e194 1476 }
db999e0f
LP
1477
1478 return 0;
77b6e194
LP
1479}
1480
01dde061
TG
1481#define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1482
1483static int get_mac(struct ether_addr *mac) {
1484 int r;
1485
1486 uint8_t result[8];
1487 size_t l, sz;
1488 uint8_t *v;
1489
1490 l = strlen(arg_machine);
1491 sz = sizeof(sd_id128_t) + l;
1492 v = alloca(sz);
1493
1494 /* fetch some persistent data unique to the host */
1495 r = sd_id128_get_machine((sd_id128_t*) v);
1496 if (r < 0)
1497 return r;
1498
1499 /* combine with some data unique (on this host) to this
1500 * container instance */
1501 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1502
1503 /* Let's hash the host machine ID plus the container name. We
1504 * use a fixed, but originally randomly created hash key here. */
1505 siphash24(result, v, sz, HASH_KEY.bytes);
1506
1507 assert_cc(ETH_ALEN <= sizeof(result));
1508 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1509
1510 /* see eth_random_addr in the kernel */
1511 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1512 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1513
1514 return 0;
1515}
1516
08af0da2 1517static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
69c79d3c 1518 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1519 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
01dde061 1520 struct ether_addr mac;
69c79d3c
LP
1521 int r;
1522
1523 if (!arg_private_network)
1524 return 0;
1525
1526 if (!arg_network_veth)
1527 return 0;
1528
08af0da2
LP
1529 /* Use two different interface name prefixes depending whether
1530 * we are in bridge mode or not. */
1531 if (arg_network_bridge)
1532 memcpy(iface_name, "vb-", 3);
1533 else
1534 memcpy(iface_name, "ve-", 3);
ab046dde 1535 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
69c79d3c 1536
01dde061
TG
1537 r = get_mac(&mac);
1538 if (r < 0) {
1539 log_error("Failed to generate predictable MAC address for host0");
1540 return r;
1541 }
1542
151b9b96 1543 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1544 if (r < 0) {
1545 log_error("Failed to connect to netlink: %s", strerror(-r));
1546 return r;
1547 }
1548
151b9b96 1549 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1550 if (r < 0) {
1551 log_error("Failed to allocate netlink message: %s", strerror(-r));
1552 return r;
1553 }
1554
ab046dde 1555 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1556 if (r < 0) {
ab046dde 1557 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1558 return r;
1559 }
1560
ee3a6a51 1561 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1562 if (r < 0) {
1563 log_error("Failed to open netlink container: %s", strerror(-r));
1564 return r;
1565 }
1566
d8e538ec 1567 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
69c79d3c
LP
1568 if (r < 0) {
1569 log_error("Failed to open netlink container: %s", strerror(-r));
1570 return r;
1571 }
1572
ee3a6a51 1573 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1574 if (r < 0) {
ab046dde 1575 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1576 return r;
1577 }
1578
ab046dde 1579 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1580 if (r < 0) {
ab046dde 1581 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1582 return r;
1583 }
01dde061
TG
1584
1585 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1586 if (r < 0) {
1587 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1588 return r;
1589 }
69c79d3c 1590
ab046dde 1591 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1592 if (r < 0) {
1593 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1594 return r;
1595 }
1596
1597 r = sd_rtnl_message_close_container(m);
1598 if (r < 0) {
1599 log_error("Failed to close netlink container: %s", strerror(-r));
1600 return r;
1601 }
1602
1603 r = sd_rtnl_message_close_container(m);
1604 if (r < 0) {
1605 log_error("Failed to close netlink container: %s", strerror(-r));
1606 return r;
1607 }
1608
1609 r = sd_rtnl_message_close_container(m);
1610 if (r < 0) {
1611 log_error("Failed to close netlink container: %s", strerror(-r));
1612 return r;
1613 }
1614
1615 r = sd_rtnl_call(rtnl, m, 0, NULL);
1616 if (r < 0) {
1617 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1618 return r;
1619 }
1620
1621 return 0;
1622}
1623
ab046dde
TG
1624static int setup_bridge(const char veth_name[]) {
1625 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1626 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1627 int r, bridge;
1628
1629 if (!arg_private_network)
1630 return 0;
1631
1632 if (!arg_network_veth)
1633 return 0;
1634
1635 if (!arg_network_bridge)
1636 return 0;
1637
1638 bridge = (int) if_nametoindex(arg_network_bridge);
1639 if (bridge <= 0) {
1640 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1641 return -errno;
1642 }
1643
151b9b96 1644 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1645 if (r < 0) {
1646 log_error("Failed to connect to netlink: %s", strerror(-r));
1647 return r;
1648 }
1649
151b9b96 1650 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1651 if (r < 0) {
1652 log_error("Failed to allocate netlink message: %s", strerror(-r));
1653 return r;
1654 }
1655
039dd4af
TG
1656 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1657 if (r < 0) {
1658 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1659 return r;
1660 }
1661
ab046dde
TG
1662 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1663 if (r < 0) {
1664 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1665 return r;
1666 }
1667
1668 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1669 if (r < 0) {
1670 log_error("Failed to add netlink master field: %s", strerror(-r));
1671 return r;
1672 }
1673
1674 r = sd_rtnl_call(rtnl, m, 0, NULL);
1675 if (r < 0) {
1676 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1677 return r;
1678 }
1679
1680 return 0;
1681}
1682
c74e630d
LP
1683static int parse_interface(struct udev *udev, const char *name) {
1684 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1685 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1686 int ifi;
1687
1688 ifi = (int) if_nametoindex(name);
1689 if (ifi <= 0) {
1690 log_error("Failed to resolve interface %s: %m", name);
1691 return -errno;
1692 }
1693
1694 sprintf(ifi_str, "n%i", ifi);
1695 d = udev_device_new_from_device_id(udev, ifi_str);
1696 if (!d) {
1697 log_error("Failed to get udev device for interface %s: %m", name);
1698 return -errno;
1699 }
1700
1701 if (udev_device_get_is_initialized(d) <= 0) {
1702 log_error("Network interface %s is not initialized yet.", name);
1703 return -EBUSY;
1704 }
1705
1706 return ifi;
1707}
1708
69c79d3c 1709static int move_network_interfaces(pid_t pid) {
7e227024 1710 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1711 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1712 char **i;
1713 int r;
1714
1715 if (!arg_private_network)
1716 return 0;
1717
1718 if (strv_isempty(arg_network_interfaces))
1719 return 0;
1720
151b9b96 1721 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1722 if (r < 0) {
1723 log_error("Failed to connect to netlink: %s", strerror(-r));
1724 return r;
1725 }
1726
7e227024
LP
1727 udev = udev_new();
1728 if (!udev) {
1729 log_error("Failed to connect to udev.");
1730 return -ENOMEM;
1731 }
1732
aa28aefe 1733 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1734 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1735 int ifi;
aa28aefe 1736
c74e630d
LP
1737 ifi = parse_interface(udev, *i);
1738 if (ifi < 0)
1739 return ifi;
1740
1741 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1742 if (r < 0) {
1743 log_error("Failed to allocate netlink message: %s", strerror(-r));
1744 return r;
aa28aefe
LP
1745 }
1746
c74e630d
LP
1747 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1748 if (r < 0) {
1749 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1750 return r;
7e227024
LP
1751 }
1752
c74e630d
LP
1753 r = sd_rtnl_call(rtnl, m, 0, NULL);
1754 if (r < 0) {
1755 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1756 return r;
7e227024 1757 }
c74e630d 1758 }
7e227024 1759
c74e630d
LP
1760 return 0;
1761}
1762
1763static int setup_macvlan(pid_t pid) {
1764 _cleanup_udev_unref_ struct udev *udev = NULL;
1765 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1766 char **i;
1767 int r;
1768
1769 if (!arg_private_network)
1770 return 0;
1771
1772 if (strv_isempty(arg_network_macvlan))
1773 return 0;
1774
1775 r = sd_rtnl_open(&rtnl, 0);
1776 if (r < 0) {
1777 log_error("Failed to connect to netlink: %s", strerror(-r));
1778 return r;
1779 }
1780
1781 udev = udev_new();
1782 if (!udev) {
1783 log_error("Failed to connect to udev.");
1784 return -ENOMEM;
1785 }
1786
1787 STRV_FOREACH(i, arg_network_macvlan) {
1788 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1789 _cleanup_free_ char *n = NULL;
1790 int ifi;
1791
1792 ifi = parse_interface(udev, *i);
1793 if (ifi < 0)
1794 return ifi;
1795
1796 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
1797 if (r < 0) {
1798 log_error("Failed to allocate netlink message: %s", strerror(-r));
1799 return r;
1800 }
1801
c74e630d
LP
1802 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1803 if (r < 0) {
1804 log_error("Failed to add netlink interface index: %s", strerror(-r));
1805 return r;
1806 }
1807
1808 n = strappend("mv-", *i);
1809 if (!n)
1810 return log_oom();
1811
1812 strshorten(n, IFNAMSIZ-1);
1813
1814 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1815 if (r < 0) {
1816 log_error("Failed to add netlink interface name: %s", strerror(-r));
1817 return r;
1818 }
1819
aa28aefe
LP
1820 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1821 if (r < 0) {
c74e630d
LP
1822 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1823 return r;
1824 }
1825
1826 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1827 if (r < 0) {
1828 log_error("Failed to open netlink container: %s", strerror(-r));
1829 return r;
1830 }
1831
d8e538ec 1832 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
c74e630d
LP
1833 if (r < 0) {
1834 log_error("Failed to open netlink container: %s", strerror(-r));
1835 return r;
1836 }
1837
1838 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1839 if (r < 0) {
1840 log_error("Failed to append macvlan mode: %s", strerror(-r));
1841 return r;
1842 }
1843
1844 r = sd_rtnl_message_close_container(m);
1845 if (r < 0) {
1846 log_error("Failed to close netlink container: %s", strerror(-r));
1847 return r;
1848 }
1849
1850 r = sd_rtnl_message_close_container(m);
1851 if (r < 0) {
1852 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
1853 return r;
1854 }
1855
1856 r = sd_rtnl_call(rtnl, m, 0, NULL);
1857 if (r < 0) {
c74e630d 1858 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
1859 return r;
1860 }
1861 }
1862
1863 return 0;
1864}
1865
24fb1112
LP
/* Installs a seccomp filter that makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT and allows everything else.
 * If built without HAVE_SECCOMP this is a NOP that returns 0.
 * Otherwise returns the result of the last libseccomp call made. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int r;

        /*
           Audit does not work properly in containers, and a good
           part of the audit userspace hookup fails when run in one.
           We do not bother and simply turn off the creation of audit
           sockets.

           With this socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which is what audit userspace takes as
           indication that audit is disabled in the kernel.
        */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Clear the filter attribute SCMP_FLTATR_CTL_NNP */
        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1922
1b9e5b12
LP
1923static int setup_image(char **device_path, int *loop_nr) {
1924 struct loop_info64 info = {
1925 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1926 };
1927 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1928 _cleanup_free_ char* loopdev = NULL;
1929 struct stat st;
1930 int r, nr;
1931
1932 assert(device_path);
1933 assert(loop_nr);
1934
1935 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1936 if (fd < 0) {
1937 log_error("Failed to open %s: %m", arg_image);
1938 return -errno;
1939 }
1940
1941 if (fstat(fd, &st) < 0) {
1942 log_error("Failed to stat %s: %m", arg_image);
1943 return -errno;
1944 }
1945
1946 if (S_ISBLK(st.st_mode)) {
1947 char *p;
1948
1949 p = strdup(arg_image);
1950 if (!p)
1951 return log_oom();
1952
1953 *device_path = p;
1954
1955 *loop_nr = -1;
1956
1957 r = fd;
1958 fd = -1;
1959
1960 return r;
1961 }
1962
1963 if (!S_ISREG(st.st_mode)) {
1964 log_error("%s is not a regular file or block device: %m", arg_image);
1965 return -EINVAL;
1966 }
1967
1968 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1969 if (control < 0) {
1970 log_error("Failed to open /dev/loop-control: %m");
1971 return -errno;
1972 }
1973
1974 nr = ioctl(control, LOOP_CTL_GET_FREE);
1975 if (nr < 0) {
1976 log_error("Failed to allocate loop device: %m");
1977 return -errno;
1978 }
1979
1980 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1981 return log_oom();
1982
1983 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1984 if (loop < 0) {
1985 log_error("Failed to open loop device %s: %m", loopdev);
1986 return -errno;
1987 }
1988
1989 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1990 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1991 return -errno;
1992 }
1993
1994 if (arg_read_only)
1995 info.lo_flags |= LO_FLAGS_READ_ONLY;
1996
1997 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1998 log_error("Failed to set loopback settings on %s: %m", loopdev);
1999 return -errno;
2000 }
2001
2002 *device_path = loopdev;
2003 loopdev = NULL;
2004
2005 *loop_nr = nr;
2006
2007 r = loop;
2008 loop = -1;
2009
2010 return r;
2011}
2012
2013static int dissect_image(
2014 int fd,
727fd4fd
LP
2015 char **root_device, bool *root_device_rw,
2016 char **home_device, bool *home_device_rw,
2017 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2018 bool *secondary) {
2019
2020#ifdef HAVE_BLKID
2021 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2022 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2023 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2024 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2025 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2026 _cleanup_udev_unref_ struct udev *udev = NULL;
2027 struct udev_list_entry *first, *item;
727fd4fd 2028 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1b9e5b12
LP
2029 const char *pttype = NULL;
2030 blkid_partlist pl;
2031 struct stat st;
2032 int r;
2033
2034 assert(fd >= 0);
2035 assert(root_device);
2036 assert(home_device);
2037 assert(srv_device);
2038 assert(secondary);
2039
2040 b = blkid_new_probe();
2041 if (!b)
2042 return log_oom();
2043
2044 errno = 0;
2045 r = blkid_probe_set_device(b, fd, 0, 0);
2046 if (r != 0) {
2047 if (errno == 0)
2048 return log_oom();
2049
2050 log_error("Failed to set device on blkid probe: %m");
2051 return -errno;
2052 }
2053
2054 blkid_probe_enable_partitions(b, 1);
2055 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2056
2057 errno = 0;
2058 r = blkid_do_safeprobe(b);
2059 if (r == -2 || r == 1) {
2060 log_error("Failed to identify any partition table on %s.\n"
2061 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2062 return -EINVAL;
2063 } else if (r != 0) {
2064 if (errno == 0)
2065 errno = EIO;
2066 log_error("Failed to probe: %m");
2067 return -errno;
2068 }
2069
2070 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2071 if (!streq_ptr(pttype, "gpt")) {
2072 log_error("Image %s does not carry a GUID Partition Table.\n"
2073 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2074 return -EINVAL;
2075 }
2076
2077 errno = 0;
2078 pl = blkid_probe_get_partitions(b);
2079 if (!pl) {
2080 if (errno == 0)
2081 return log_oom();
2082
2083 log_error("Failed to list partitions of %s", arg_image);
2084 return -errno;
2085 }
2086
2087 udev = udev_new();
2088 if (!udev)
2089 return log_oom();
2090
2091 if (fstat(fd, &st) < 0) {
2092 log_error("Failed to stat block device: %m");
2093 return -errno;
2094 }
2095
2096 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2097 if (!d)
2098 return log_oom();
2099
2100 e = udev_enumerate_new(udev);
2101 if (!e)
2102 return log_oom();
2103
2104 r = udev_enumerate_add_match_parent(e, d);
2105 if (r < 0)
2106 return log_oom();
2107
2108 r = udev_enumerate_scan_devices(e);
2109 if (r < 0) {
2110 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2111 return r;
2112 }
2113
2114 first = udev_enumerate_get_list_entry(e);
2115 udev_list_entry_foreach(item, first) {
2116 _cleanup_udev_device_unref_ struct udev_device *q;
2117 const char *stype, *node;
727fd4fd 2118 unsigned long long flags;
1b9e5b12
LP
2119 sd_id128_t type_id;
2120 blkid_partition pp;
2121 dev_t qn;
2122 int nr;
2123
2124 errno = 0;
2125 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2126 if (!q) {
2127 if (!errno)
2128 errno = ENOMEM;
2129
2130 log_error("Failed to get partition device of %s: %m", arg_image);
2131 return -errno;
2132 }
2133
2134 qn = udev_device_get_devnum(q);
2135 if (major(qn) == 0)
2136 continue;
2137
2138 if (st.st_rdev == qn)
2139 continue;
2140
2141 node = udev_device_get_devnode(q);
2142 if (!node)
2143 continue;
2144
2145 pp = blkid_partlist_devno_to_partition(pl, qn);
2146 if (!pp)
2147 continue;
2148
727fd4fd
LP
2149 flags = blkid_partition_get_flags(pp);
2150 if (flags & GPT_FLAG_NO_AUTO)
2151 continue;
2152
1b9e5b12
LP
2153 nr = blkid_partition_get_partno(pp);
2154 if (nr < 0)
2155 continue;
2156
2157 stype = blkid_partition_get_type_string(pp);
2158 if (!stype)
2159 continue;
2160
2161 if (sd_id128_from_string(stype, &type_id) < 0)
2162 continue;
2163
2164 if (sd_id128_equal(type_id, GPT_HOME)) {
2165
2166 if (home && nr >= home_nr)
2167 continue;
2168
2169 home_nr = nr;
727fd4fd
LP
2170 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2171
1b9e5b12
LP
2172 free(home);
2173 home = strdup(node);
2174 if (!home)
2175 return log_oom();
2176 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2177
2178 if (srv && nr >= srv_nr)
2179 continue;
2180
2181 srv_nr = nr;
727fd4fd
LP
2182 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2183
1b9e5b12
LP
2184 free(srv);
2185 srv = strdup(node);
2186 if (!srv)
2187 return log_oom();
2188 }
2189#ifdef GPT_ROOT_NATIVE
2190 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2191
2192 if (root && nr >= root_nr)
2193 continue;
2194
2195 root_nr = nr;
727fd4fd
LP
2196 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2197
1b9e5b12
LP
2198 free(root);
2199 root = strdup(node);
2200 if (!root)
2201 return log_oom();
2202 }
2203#endif
2204#ifdef GPT_ROOT_SECONDARY
2205 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2206
2207 if (secondary_root && nr >= secondary_root_nr)
2208 continue;
2209
2210 secondary_root_nr = nr;
727fd4fd
LP
2211 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2212
2213
1b9e5b12
LP
2214 free(secondary_root);
2215 secondary_root = strdup(node);
2216 if (!secondary_root)
2217 return log_oom();
2218 }
2219#endif
2220 }
2221
2222 if (!root && !secondary_root) {
2223 log_error("Failed to identify root partition in disk image %s.\n"
2224 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2225 return -EINVAL;
2226 }
2227
2228 if (root) {
2229 *root_device = root;
2230 root = NULL;
727fd4fd
LP
2231
2232 *root_device_rw = root_rw;
1b9e5b12
LP
2233 *secondary = false;
2234 } else if (secondary_root) {
2235 *root_device = secondary_root;
2236 secondary_root = NULL;
727fd4fd
LP
2237
2238 *root_device_rw = secondary_root_rw;
1b9e5b12
LP
2239 *secondary = true;
2240 }
2241
2242 if (home) {
2243 *home_device = home;
2244 home = NULL;
727fd4fd
LP
2245
2246 *home_device_rw = home_rw;
1b9e5b12
LP
2247 }
2248
2249 if (srv) {
2250 *srv_device = srv;
2251 srv = NULL;
727fd4fd
LP
2252
2253 *srv_device_rw = srv_rw;
1b9e5b12
LP
2254 }
2255
2256 return 0;
2257#else
2258 log_error("--image= is not supported, compiled without blkid support.");
2259 return -ENOTSUP;
2260#endif
2261}
2262
/*
 * Probe the file system type of the block device "what" with libblkid and
 * mount it on "where", or on "where" + "directory" if "directory" is non-NULL.
 *
 * The mount is read-only if "rw" is false or if the global arg_read_only is
 * set; MS_NODEV is always applied. LUKS containers are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. When built
 * without HAVE_BLKID this always fails with -ENOTSUP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* blkid does not reliably set errno, so clear it first and treat
         * "still zero" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Neither is
                 * an I/O error, the type simply cannot be established. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything unexpected): genuine probing error */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2328
727fd4fd
LP
/*
 * Mount the partitions found in a disk image below "where":
 * the root partition on "where" itself, the home partition on
 * "where"/home and the server data partition on "where"/srv.
 * Any of the device arguments may be NULL, in which case that
 * partition is skipped. The *_rw flags select a writable mount
 * for the matching device.
 *
 * Returns 0 on success or the negative error of the first mount
 * that failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Root must come first, the other partitions are mounted inside it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2364
2365static void loop_remove(int nr, int *image_fd) {
2366 _cleanup_close_ int control = -1;
2367
2368 if (nr < 0)
2369 return;
2370
2371 if (image_fd && *image_fd >= 0) {
2372 ioctl(*image_fd, LOOP_CLR_FD);
03e334a1 2373 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2374 }
2375
2376 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2377 if (control < 0)
2378 return;
2379
2380 ioctl(control, LOOP_CTL_REMOVE, nr);
2381}
2382
0cb9fbcd
LP
2383static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2384 int pipe_fds[2];
2385 pid_t pid;
2386
2387 assert(database);
2388 assert(key);
2389 assert(rpid);
2390
2391 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2392 log_error("Failed to allocate pipe: %m");
2393 return -errno;
2394 }
2395
2396 pid = fork();
2397 if (pid < 0) {
2398 log_error("Failed to fork getent child: %m");
2399 return -errno;
2400 } else if (pid == 0) {
2401 int nullfd;
2402 char *empty_env = NULL;
2403
2404 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2405 _exit(EXIT_FAILURE);
2406
2407 if (pipe_fds[0] > 2)
03e334a1 2408 safe_close(pipe_fds[0]);
0cb9fbcd 2409 if (pipe_fds[1] > 2)
03e334a1 2410 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2411
2412 nullfd = open("/dev/null", O_RDWR);
2413 if (nullfd < 0)
2414 _exit(EXIT_FAILURE);
2415
2416 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2417 _exit(EXIT_FAILURE);
2418
2419 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2420 _exit(EXIT_FAILURE);
2421
2422 if (nullfd > 2)
03e334a1 2423 safe_close(nullfd);
0cb9fbcd
LP
2424
2425 reset_all_signal_handlers();
2426 close_all_fds(NULL, 0);
2427
4de82926
MM
2428 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2429 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2430 _exit(EXIT_FAILURE);
2431 }
2432
03e334a1 2433 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2434
2435 *rpid = pid;
2436
2437 return pipe_fds[0];
2438}
2439
2440static int change_uid_gid(char **_home) {
0cb9fbcd
LP
2441 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2442 _cleanup_free_ uid_t *uids = NULL;
2443 _cleanup_free_ char *home = NULL;
2444 _cleanup_fclose_ FILE *f = NULL;
2445 _cleanup_close_ int fd = -1;
2446 unsigned n_uids = 0;
70f539ca 2447 size_t sz = 0, l;
0cb9fbcd
LP
2448 uid_t uid;
2449 gid_t gid;
2450 pid_t pid;
2451 int r;
2452
2453 assert(_home);
2454
2455 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2456 /* Reset everything fully to 0, just in case */
2457
2458 if (setgroups(0, NULL) < 0) {
2459 log_error("setgroups() failed: %m");
2460 return -errno;
2461 }
2462
2463 if (setresgid(0, 0, 0) < 0) {
2464 log_error("setregid() failed: %m");
2465 return -errno;
2466 }
2467
2468 if (setresuid(0, 0, 0) < 0) {
2469 log_error("setreuid() failed: %m");
2470 return -errno;
2471 }
2472
2473 *_home = NULL;
2474 return 0;
2475 }
2476
2477 /* First, get user credentials */
2478 fd = spawn_getent("passwd", arg_user, &pid);
2479 if (fd < 0)
2480 return fd;
2481
2482 f = fdopen(fd, "r");
2483 if (!f)
2484 return log_oom();
2485 fd = -1;
2486
2487 if (!fgets(line, sizeof(line), f)) {
2488
2489 if (!ferror(f)) {
2490 log_error("Failed to resolve user %s.", arg_user);
2491 return -ESRCH;
2492 }
2493
2494 log_error("Failed to read from getent: %m");
2495 return -errno;
2496 }
2497
2498 truncate_nl(line);
2499
2500 wait_for_terminate_and_warn("getent passwd", pid);
2501
2502 x = strchr(line, ':');
2503 if (!x) {
2504 log_error("/etc/passwd entry has invalid user field.");
2505 return -EIO;
2506 }
2507
2508 u = strchr(x+1, ':');
2509 if (!u) {
2510 log_error("/etc/passwd entry has invalid password field.");
2511 return -EIO;
2512 }
2513
2514 u++;
2515 g = strchr(u, ':');
2516 if (!g) {
2517 log_error("/etc/passwd entry has invalid UID field.");
2518 return -EIO;
2519 }
2520
2521 *g = 0;
2522 g++;
2523 x = strchr(g, ':');
2524 if (!x) {
2525 log_error("/etc/passwd entry has invalid GID field.");
2526 return -EIO;
2527 }
2528
2529 *x = 0;
2530 h = strchr(x+1, ':');
2531 if (!h) {
2532 log_error("/etc/passwd entry has invalid GECOS field.");
2533 return -EIO;
2534 }
2535
2536 h++;
2537 x = strchr(h, ':');
2538 if (!x) {
2539 log_error("/etc/passwd entry has invalid home directory field.");
2540 return -EIO;
2541 }
2542
2543 *x = 0;
2544
2545 r = parse_uid(u, &uid);
2546 if (r < 0) {
2547 log_error("Failed to parse UID of user.");
2548 return -EIO;
2549 }
2550
2551 r = parse_gid(g, &gid);
2552 if (r < 0) {
2553 log_error("Failed to parse GID of user.");
2554 return -EIO;
2555 }
2556
2557 home = strdup(h);
2558 if (!home)
2559 return log_oom();
2560
2561 /* Second, get group memberships */
2562 fd = spawn_getent("initgroups", arg_user, &pid);
2563 if (fd < 0)
2564 return fd;
2565
2566 fclose(f);
2567 f = fdopen(fd, "r");
2568 if (!f)
2569 return log_oom();
2570 fd = -1;
2571
2572 if (!fgets(line, sizeof(line), f)) {
2573 if (!ferror(f)) {
2574 log_error("Failed to resolve user %s.", arg_user);
2575 return -ESRCH;
2576 }
2577
2578 log_error("Failed to read from getent: %m");
2579 return -errno;
2580 }
2581
2582 truncate_nl(line);
2583
2584 wait_for_terminate_and_warn("getent initgroups", pid);
2585
2586 /* Skip over the username and subsequent separator whitespace */
2587 x = line;
2588 x += strcspn(x, WHITESPACE);
2589 x += strspn(x, WHITESPACE);
2590
2591 FOREACH_WORD(w, l, x, state) {
2592 char c[l+1];
2593
2594 memcpy(c, w, l);
2595 c[l] = 0;
2596
2597 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2598 return log_oom();
2599
2600 r = parse_uid(c, &uids[n_uids++]);
2601 if (r < 0) {
2602 log_error("Failed to parse group data from getent.");
2603 return -EIO;
2604 }
2605 }
2606
2607 r = mkdir_parents(home, 0775);
2608 if (r < 0) {
2609 log_error("Failed to make home root directory: %s", strerror(-r));
2610 return r;
2611 }
2612
2613 r = mkdir_safe(home, 0755, uid, gid);
f418f31d 2614 if (r < 0 && r != -EEXIST) {
0cb9fbcd
LP
2615 log_error("Failed to make home directory: %s", strerror(-r));
2616 return r;
2617 }
2618
2619 fchown(STDIN_FILENO, uid, gid);
2620 fchown(STDOUT_FILENO, uid, gid);
2621 fchown(STDERR_FILENO, uid, gid);
2622
2623 if (setgroups(n_uids, uids) < 0) {
2624 log_error("Failed to set auxiliary groups: %m");
2625 return -errno;
2626 }
2627
2628 if (setresgid(gid, gid, gid) < 0) {
2629 log_error("setregid() failed: %m");
2630 return -errno;
2631 }
2632
2633 if (setresuid(uid, uid, uid) < 0) {
2634 log_error("setreuid() failed: %m");
2635 return -errno;
2636 }
2637
2638 if (_home) {
2639 *_home = home;
2640 home = NULL;
2641 }
2642
2643 return 0;
2644}
2645
113cea80
DH
2646/*
2647 * Return 0 in case the container is being rebooted, has been shut
2648 * down or exited successfully. On failures a negative value is
2649 * returned.
2650 *
2651 * The status of the container "CONTAINER_TERMINATED" or
2652 * "CONTAINER_REBOOTED" will be saved in the container argument
2653 */
2654static int wait_for_container(pid_t pid, ContainerStatus *container) {
2655 int r;
2656 siginfo_t status;
2657
2658 r = wait_for_terminate(pid, &status);
2659 if (r < 0)
2660 return r;
2661
2662 switch (status.si_code) {
2663 case CLD_EXITED:
2664 r = status.si_status;
2665 if (r == 0) {
2666 if (!arg_quiet)
2667 log_debug("Container %s exited successfully.",
2668 arg_machine);
2669
2670 *container = CONTAINER_TERMINATED;
2671 } else {
2672 log_error("Container %s failed with error code %i.",
2673 arg_machine, status.si_status);
2674 r = -1;
2675 }
2676 break;
2677
2678 case CLD_KILLED:
2679 if (status.si_status == SIGINT) {
2680 if (!arg_quiet)
2681 log_info("Container %s has been shut down.",
2682 arg_machine);
2683
2684 *container = CONTAINER_TERMINATED;
2685 r = 0;
2686 break;
2687 } else if (status.si_status == SIGHUP) {
2688 if (!arg_quiet)
2689 log_info("Container %s is being rebooted.",
2690 arg_machine);
2691
2692 *container = CONTAINER_REBOOTED;
2693 r = 0;
2694 break;
2695 }
2696 /* CLD_KILLED fallthrough */
2697
2698 case CLD_DUMPED:
2699 log_error("Container %s terminated by signal %s.",
2700 arg_machine, signal_to_string(status.si_status));
2701 r = -1;
2702 break;
2703
2704 default:
2705 log_error("Container %s failed due to unknown reason.",
2706 arg_machine);
2707 r = -1;
2708 break;
2709 }
2710
2711 return r;
2712}
2713
e866af3a
DH
/* Signal handler that deliberately does nothing; main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent. */
static void nop_handler(int sig) {
        (void) sig;
}
2715
88213476 2716int main(int argc, char *argv[]) {
69c79d3c 2717
1b9e5b12 2718 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2719 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
d96c1ecf 2720 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
3d94f76c 2721 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2722 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2723 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2724 const char *console = NULL;
1b9e5b12
LP
2725 char veth_name[IFNAMSIZ];
2726 bool secondary = false;
e866af3a 2727 sigset_t mask, mask_chld;
69c79d3c 2728 pid_t pid = 0;
88213476
LP
2729
2730 log_parse_environment();
2731 log_open();
2732
05947bef
LP
2733 k = parse_argv(argc, argv);
2734 if (k < 0)
88213476 2735 goto finish;
05947bef
LP
2736 else if (k == 0) {
2737 r = EXIT_SUCCESS;
2738 goto finish;
2739 }
88213476 2740
1b9e5b12
LP
2741 if (!arg_image) {
2742 if (arg_directory) {
2743 char *p;
88213476 2744
1b9e5b12
LP
2745 p = path_make_absolute_cwd(arg_directory);
2746 free(arg_directory);
2747 arg_directory = p;
2748 } else
2749 arg_directory = get_current_dir_name();
88213476 2750
1b9e5b12
LP
2751 if (!arg_directory) {
2752 log_error("Failed to determine path, please use -D.");
2753 goto finish;
2754 }
2755 path_kill_slashes(arg_directory);
88213476
LP
2756 }
2757
7027ff61 2758 if (!arg_machine) {
1b9e5b12 2759 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2760 if (!arg_machine) {
2761 log_oom();
2762 goto finish;
2763 }
2764
e724b063 2765 hostname_cleanup(arg_machine, false);
7027ff61
LP
2766 if (isempty(arg_machine)) {
2767 log_error("Failed to determine machine name automatically, please use -M.");
2768 goto finish;
2769 }
2770 }
2771
88213476
LP
2772 if (geteuid() != 0) {
2773 log_error("Need to be root.");
2774 goto finish;
2775 }
2776
04d391da
LP
2777 if (sd_booted() <= 0) {
2778 log_error("Not running on a systemd system.");
2779 goto finish;
2780 }
2781
1b9e5b12
LP
2782 log_close();
2783 n_fd_passed = sd_listen_fds(false);
2784 if (n_fd_passed > 0) {
2785 k = fdset_new_listen_fds(&fds, false);
2786 if (k < 0) {
2787 log_error("Failed to collect file descriptors: %s", strerror(-k));
2788 goto finish;
2789 }
88213476 2790 }
1b9e5b12
LP
2791 fdset_close_others(fds);
2792 log_open();
88213476 2793
1b9e5b12
LP
2794 if (arg_directory) {
2795 if (path_equal(arg_directory, "/")) {
2796 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2797 goto finish;
2798 }
1b9e5b12
LP
2799
2800 if (arg_boot) {
2801 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 2802 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
2803 goto finish;
2804 }
2805 } else {
2806 const char *p;
2807
2808 p = strappenda(arg_directory,
2809 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2810 if (access(p, F_OK) < 0) {
2811 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2812 goto finish;
2813
2814 }
2815 }
6b9132a9 2816 } else {
1b9e5b12 2817 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2818
1b9e5b12
LP
2819 if (!mkdtemp(template)) {
2820 log_error("Failed to create temporary directory: %m");
2821 r = -errno;
6b9132a9 2822 goto finish;
1b9e5b12 2823 }
6b9132a9 2824
1b9e5b12
LP
2825 arg_directory = strdup(template);
2826 if (!arg_directory) {
2827 r = log_oom();
2828 goto finish;
6b9132a9 2829 }
88213476 2830
1b9e5b12
LP
2831 image_fd = setup_image(&device_path, &loop_nr);
2832 if (image_fd < 0) {
2833 r = image_fd;
842f3b0f
LP
2834 goto finish;
2835 }
1b9e5b12 2836
727fd4fd 2837 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
1b9e5b12
LP
2838 if (r < 0)
2839 goto finish;
842f3b0f 2840 }
842f3b0f 2841
db7feb7e
LP
2842 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2843 if (master < 0) {
a258bf26
LP
2844 log_error("Failed to acquire pseudo tty: %m");
2845 goto finish;
2846 }
2847
db7feb7e
LP
2848 console = ptsname(master);
2849 if (!console) {
a258bf26
LP
2850 log_error("Failed to determine tty name: %m");
2851 goto finish;
2852 }
2853
284c0b91 2854 if (!arg_quiet)
45f1386c
ZJS
2855 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2856 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2857
2858 if (unlockpt(master) < 0) {
2859 log_error("Failed to unlock tty: %m");
2860 goto finish;
2861 }
2862
eb91eb18
LP
2863 if (access("/dev/kdbus/control", F_OK) >= 0) {
2864
2865 if (arg_share_system) {
2866 kdbus_domain = strdup("/dev/kdbus");
2867 if (!kdbus_domain) {
2868 log_oom();
2869 goto finish;
2870 }
2871 } else {
2872 const char *ns;
2873
2874 ns = strappenda("machine-", arg_machine);
2875 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2876 if (r < 0)
2877 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2878 else
2879 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2880 }
2881 }
9bd37b40 2882
e58a1277 2883 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
2884 log_error("Failed to create kmsg socket pair: %m");
2885 goto finish;
2886 }
2887
05947bef
LP
2888 sd_notify(0, "READY=1");
2889
a258bf26 2890 assert_se(sigemptyset(&mask) == 0);
e866af3a
DH
2891 assert_se(sigemptyset(&mask_chld) == 0);
2892 sigaddset(&mask_chld, SIGCHLD);
a258bf26
LP
2893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2895
d87be9b0 2896 for (;;) {
113cea80 2897 ContainerStatus container_status;
e866af3a
DH
2898 int eventfds[2] = { -1, -1 };
2899 struct sigaction sa = {
2900 .sa_handler = nop_handler,
2901 .sa_flags = SA_NOCLDSTOP,
2902 };
2903
2904 /* Child can be killed before execv(), so handle SIGCHLD
2905 * in order to interrupt parent's blocking calls and
2906 * give it a chance to call wait() and terminate. */
2907 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2908 if (r < 0) {
2909 log_error("Failed to change the signal mask: %m");
d96c1ecf
LP
2910 goto finish;
2911 }
2912
e866af3a
DH
2913 r = sigaction(SIGCHLD, &sa, NULL);
2914 if (r < 0) {
2915 log_error("Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
2916 goto finish;
2917 }
2918
e866af3a
DH
2919 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2920 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2921 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
d87be9b0
LP
2922 if (pid < 0) {
2923 if (errno == EINVAL)
2924 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2925 else
2926 log_error("clone() failed: %m");
a258bf26 2927
e866af3a 2928 r = pid;
d87be9b0
LP
2929 goto finish;
2930 }
a258bf26 2931
d87be9b0
LP
2932 if (pid == 0) {
2933 /* child */
0cb9fbcd 2934 _cleanup_free_ char *home = NULL;
5674767e 2935 unsigned n_env = 2;
d87be9b0 2936 const char *envp[] = {
e10a55fd 2937 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
2938 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2939 NULL, /* TERM */
2940 NULL, /* HOME */
2941 NULL, /* USER */
2942 NULL, /* LOGNAME */
2943 NULL, /* container_uuid */
842f3b0f
LP
2944 NULL, /* LISTEN_FDS */
2945 NULL, /* LISTEN_PID */
d87be9b0
LP
2946 NULL
2947 };
f4889f65 2948 char **env_use;
a258bf26 2949
5674767e
ZJS
2950 envp[n_env] = strv_find_prefix(environ, "TERM=");
2951 if (envp[n_env])
2952 n_env ++;
a258bf26 2953
03e334a1 2954 master = safe_close(master);
a258bf26 2955
d87be9b0
LP
2956 close_nointr(STDIN_FILENO);
2957 close_nointr(STDOUT_FILENO);
2958 close_nointr(STDERR_FILENO);
db7feb7e 2959
03e334a1 2960 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 2961
d87be9b0 2962 reset_all_signal_handlers();
88213476 2963
d87be9b0
LP
2964 assert_se(sigemptyset(&mask) == 0);
2965 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 2966
842f3b0f
LP
2967 k = open_terminal(console, O_RDWR);
2968 if (k != STDIN_FILENO) {
2969 if (k >= 0) {
03e334a1 2970 safe_close(k);
842f3b0f
LP
2971 k = -EINVAL;
2972 }
2973
2974 log_error("Failed to open console: %s", strerror(-k));
2975 goto child_fail;
2976 }
2977
2978 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2979 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2980 log_error("Failed to duplicate console: %m");
d87be9b0 2981 goto child_fail;
842f3b0f 2982 }
bc2f673e 2983
d87be9b0
LP
2984 if (setsid() < 0) {
2985 log_error("setsid() failed: %m");
bc2f673e
LP
2986 goto child_fail;
2987 }
2988
db999e0f
LP
2989 if (reset_audit_loginuid() < 0)
2990 goto child_fail;
2991
d87be9b0
LP
2992 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2993 log_error("PR_SET_PDEATHSIG failed: %m");
2994 goto child_fail;
2995 }
e58a1277 2996
d87be9b0
LP
2997 /* Mark everything as slave, so that we still
2998 * receive mounts from the real root, but don't
2999 * propagate mounts to the real root. */
3000 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3001 log_error("MS_SLAVE|MS_REC failed: %m");
3002 goto child_fail;
3003 }
04bc4a3f 3004
727fd4fd
LP
3005 if (mount_devices(arg_directory,
3006 root_device, root_device_rw,
3007 home_device, home_device_rw,
3008 srv_device, srv_device_rw) < 0)
1b9e5b12
LP
3009 goto child_fail;
3010
d87be9b0
LP
3011 /* Turn directory into bind mount */
3012 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
d6797c92 3013 log_error("Failed to make bind mount: %m");
d87be9b0
LP
3014 goto child_fail;
3015 }
88213476 3016
d6797c92
LP
3017 if (arg_read_only) {
3018 k = bind_remount_recursive(arg_directory, true);
3019 if (k < 0) {
3020 log_error("Failed to make tree read-only: %s", strerror(-k));
d87be9b0
LP
3021 goto child_fail;
3022 }
d6797c92 3023 }
2547bb41 3024
d87be9b0
LP
3025 if (mount_all(arg_directory) < 0)
3026 goto child_fail;
57fb9fb5 3027
d87be9b0
LP
3028 if (copy_devnodes(arg_directory) < 0)
3029 goto child_fail;
a258bf26 3030
f2d88580
LP
3031 if (setup_ptmx(arg_directory) < 0)
3032 goto child_fail;
3033
d87be9b0 3034 dev_setup(arg_directory);
88213476 3035
24fb1112
LP
3036 if (audit_still_doesnt_work_in_containers() < 0)
3037 goto child_fail;
3038
d87be9b0
LP
3039 if (setup_dev_console(arg_directory, console) < 0)
3040 goto child_fail;
88213476 3041
d87be9b0
LP
3042 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3043 goto child_fail;
88213476 3044
03e334a1 3045 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3046
d87be9b0
LP
3047 if (setup_boot_id(arg_directory) < 0)
3048 goto child_fail;
a41fe3a2 3049
d87be9b0
LP
3050 if (setup_timezone(arg_directory) < 0)
3051 goto child_fail;
88213476 3052
d87be9b0
LP
3053 if (setup_resolv_conf(arg_directory) < 0)
3054 goto child_fail;
687d0825 3055
d87be9b0 3056 if (setup_journal(arg_directory) < 0)
687d0825 3057 goto child_fail;
687d0825 3058
d6797c92 3059 if (mount_binds(arg_directory, arg_bind, false) < 0)
17fe0523
LP
3060 goto child_fail;
3061
d6797c92 3062 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
17fe0523
LP
3063 goto child_fail;
3064
06c17c39
LP
3065 if (mount_tmpfs(arg_directory) < 0)
3066 goto child_fail;
3067
486e99a3 3068 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
9bd37b40
LP
3069 goto child_fail;
3070
d96c1ecf
LP
3071 /* Tell the parent that we are ready, and that
3072 * it can cgroupify us to that we lack access
3073 * to certain devices and resources. */
e866af3a
DH
3074 r = eventfd_send_state(eventfds[1],
3075 EVENTFD_CHILD_SUCCEEDED);
3076 eventfds[1] = safe_close(eventfds[1]);
3077 if (r < 0)
3078 goto child_fail;
d96c1ecf 3079
d87be9b0
LP
3080 if (chdir(arg_directory) < 0) {
3081 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
3082 goto child_fail;
3083 }
3084
d87be9b0
LP
3085 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3086 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
3087 goto child_fail;
3088 }
3089
d87be9b0
LP
3090 if (chroot(".") < 0) {
3091 log_error("chroot() failed: %m");
687d0825
MV
3092 goto child_fail;
3093 }
3094
d87be9b0
LP
3095 if (chdir("/") < 0) {
3096 log_error("chdir() failed: %m");
687d0825
MV
3097 goto child_fail;
3098 }
3099
d87be9b0
LP
3100 umask(0022);
3101
eb91eb18
LP
3102 if (arg_private_network)
3103 loopback_setup();
d87be9b0
LP
3104
3105 if (drop_capabilities() < 0) {
3106 log_error("drop_capabilities() failed: %m");
687d0825
MV
3107 goto child_fail;
3108 }
687d0825 3109
0cb9fbcd
LP
3110 r = change_uid_gid(&home);
3111 if (r < 0)
3112 goto child_fail;
d87be9b0 3113
842f3b0f
LP
3114 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3115 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3116 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3117 log_oom();
144f0fc0
LP
3118 goto child_fail;
3119 }
687d0825 3120
9444b1f2 3121 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3122 char as_uuid[37];
3123
3124 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f
LP
3125 log_oom();
3126 goto child_fail;
3127 }
3128 }
3129
3130 if (fdset_size(fds) > 0) {
3131 k = fdset_cloexec(fds, false);
3132 if (k < 0) {
3133 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3134 goto child_fail;
3135 }
3136
3137 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3138 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
3139 log_oom();
3140 goto child_fail;
3141 }
3142 }
3143
3144 setup_hostname();
3145
6afc95b7
LP
3146 if (arg_personality != 0xffffffffLU) {
3147 if (personality(arg_personality) < 0) {
3148 log_error("personality() failed: %m");
3149 goto child_fail;
3150 }
1b9e5b12
LP
3151 } else if (secondary) {
3152 if (personality(PER_LINUX32) < 0) {
3153 log_error("personality() failed: %m");
3154 goto child_fail;
3155 }
6afc95b7
LP
3156 }
3157
d96c1ecf
LP
3158#ifdef HAVE_SELINUX
3159 if (arg_selinux_context)
0cb9fbcd 3160 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 3161 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
0cb9fbcd
LP
3162 goto child_fail;
3163 }
d96c1ecf 3164#endif
354bfd2b 3165
f4889f65
LP
3166 if (!strv_isempty(arg_setenv)) {
3167 char **n;
3168
3169 n = strv_env_merge(2, envp, arg_setenv);
3170 if (!n) {
3171 log_oom();
3172 goto child_fail;
3173 }
3174
3175 env_use = n;
3176 } else
3177 env_use = (char**) envp;
3178
d96c1ecf 3179 /* Wait until the parent is ready with the setup, too... */
e866af3a
DH
3180 r = eventfd_parent_succeeded(eventfds[0]);
3181 eventfds[0] = safe_close(eventfds[0]);
3182 if (r < 0)
3183 goto child_fail;
d96c1ecf 3184
d87be9b0
LP
3185 if (arg_boot) {
3186 char **a;
3187 size_t l;
88213476 3188
d87be9b0 3189 /* Automatically search for the init system */
0f0dbc46 3190
d87be9b0
LP
3191 l = 1 + argc - optind;
3192 a = newa(char*, l + 1);
3193 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3194
d87be9b0 3195 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3196 execve(a[0], a, env_use);
0f0dbc46 3197
d87be9b0 3198 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3199 execve(a[0], a, env_use);
0f0dbc46 3200
d87be9b0 3201 a[0] = (char*) "/sbin/init";
f4889f65 3202 execve(a[0], a, env_use);
d87be9b0 3203 } else if (argc > optind)
f4889f65 3204 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3205 else {
3206 chdir(home ? home : "/root");
f4889f65 3207 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3208 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3209 }
3210
3211 log_error("execv() failed: %m");
0f0dbc46 3212
d87be9b0 3213 child_fail:
e866af3a
DH
3214 /* Tell the parent that the setup failed, so he
3215 * can clean up resources and terminate. */
3216 if (eventfds[1] != -1)
3217 eventfd_send_state(eventfds[1],
3218 EVENTFD_CHILD_FAILED);
d87be9b0 3219 _exit(EXIT_FAILURE);
da5b3bad 3220 }
88213476 3221
842f3b0f
LP
3222 fdset_free(fds);
3223 fds = NULL;
3224
e866af3a
DH
3225 /* Wait for the child event:
3226 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3227 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3228 * it is ready with all it needs to do with priviliges.
3229 * After we got the notification we can make the process
3230 * join its cgroup which might limit what it can do */
3231 r = eventfd_child_succeeded(eventfds[1]);
3232 eventfds[1] = safe_close(eventfds[1]);
3233 if (r < 0)
3234 goto check_container_status;
d96c1ecf 3235
354bfd2b
LP
3236 r = register_machine(pid);
3237 if (r < 0)
3238 goto finish;
3239
aa28aefe
LP
3240 r = move_network_interfaces(pid);
3241 if (r < 0)
3242 goto finish;
3243
ab046dde
TG
3244 r = setup_veth(pid, veth_name);
3245 if (r < 0)
3246 goto finish;
3247
3248 r = setup_bridge(veth_name);
3249 if (r < 0)
3250 goto finish;
3251
c74e630d
LP
3252 r = setup_macvlan(pid);
3253 if (r < 0)
3254 goto finish;
3255
e866af3a
DH
3256 /* Block SIGCHLD here, before notifying child.
3257 * process_pty() will handle it with the other signals. */
3258 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3259 if (r < 0)
3260 goto finish;
3261
3262 /* Reset signal to default */
3263 r = default_signals(SIGCHLD, -1);
3264 if (r < 0)
3265 goto finish;
3266
d96c1ecf 3267 /* Notify the child that the parent is ready with all
e866af3a 3268 * its setup, and that the child can now hand over
d96c1ecf 3269 * control to the code to run inside the container. */
e866af3a
DH
3270 r = eventfd_send_state(eventfds[0],
3271 EVENTFD_PARENT_SUCCEEDED);
3272 eventfds[0] = safe_close(eventfds[0]);
3273 if (r < 0)
3274 goto finish;
354bfd2b 3275
04d39279
LP
3276 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3277 if (k < 0) {
3278 r = EXIT_FAILURE;
3279 break;
3280 }
88213476 3281
284c0b91
LP
3282 if (!arg_quiet)
3283 putc('\n', stdout);
04d39279
LP
3284
3285 /* Kill if it is not dead yet anyway */
1f0cd86b
LP
3286 terminate_machine(pid);
3287
e866af3a 3288check_container_status:
1f0cd86b 3289 /* Redundant, but better safe than sorry */
04d39279 3290 kill(pid, SIGKILL);
a258bf26 3291
113cea80 3292 r = wait_for_container(pid, &container_status);
04d39279
LP
3293 pid = 0;
3294
113cea80 3295 if (r < 0) {
d87be9b0
LP
3296 r = EXIT_FAILURE;
3297 break;
113cea80 3298 } else if (container_status == CONTAINER_TERMINATED)
d87be9b0 3299 break;
88213476 3300
113cea80 3301 /* CONTAINER_REBOOTED, loop again */
d87be9b0 3302 }
88213476
LP
3303
3304finish:
1b9e5b12
LP
3305 loop_remove(loop_nr, &image_fd);
3306
9444b1f2
LP
3307 if (pid > 0)
3308 kill(pid, SIGKILL);
88213476 3309
04d391da 3310 free(arg_directory);
7027ff61 3311 free(arg_machine);
c74e630d
LP
3312 free(arg_user);
3313 strv_free(arg_setenv);
3314 strv_free(arg_network_interfaces);
3315 strv_free(arg_network_macvlan);
3316 strv_free(arg_bind);
3317 strv_free(arg_bind_ro);
06c17c39 3318 strv_free(arg_tmpfs);
88213476
LP
3319
3320 return r;
3321}