]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: Fix erroneous OOM when building group list
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94
/* Journal linking mode, chosen with --link-journal= (or -j) and acted
 * upon by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST,     /* --link-journal=guest, also selected by -j */
} LinkJournal;
101
102 static char *arg_directory = NULL;
103 static char *arg_user = NULL;
104 static sd_id128_t arg_uuid = {};
105 static char *arg_machine = NULL;
106 static const char *arg_selinux_context = NULL;
107 static const char *arg_selinux_apifs_context = NULL;
108 static const char *arg_slice = NULL;
109 static bool arg_private_network = false;
110 static bool arg_read_only = false;
111 static bool arg_boot = false;
112 static LinkJournal arg_link_journal = LINK_AUTO;
113 static uint64_t arg_retain =
114 (1ULL << CAP_CHOWN) |
115 (1ULL << CAP_DAC_OVERRIDE) |
116 (1ULL << CAP_DAC_READ_SEARCH) |
117 (1ULL << CAP_FOWNER) |
118 (1ULL << CAP_FSETID) |
119 (1ULL << CAP_IPC_OWNER) |
120 (1ULL << CAP_KILL) |
121 (1ULL << CAP_LEASE) |
122 (1ULL << CAP_LINUX_IMMUTABLE) |
123 (1ULL << CAP_NET_BIND_SERVICE) |
124 (1ULL << CAP_NET_BROADCAST) |
125 (1ULL << CAP_NET_RAW) |
126 (1ULL << CAP_SETGID) |
127 (1ULL << CAP_SETFCAP) |
128 (1ULL << CAP_SETPCAP) |
129 (1ULL << CAP_SETUID) |
130 (1ULL << CAP_SYS_ADMIN) |
131 (1ULL << CAP_SYS_CHROOT) |
132 (1ULL << CAP_SYS_NICE) |
133 (1ULL << CAP_SYS_PTRACE) |
134 (1ULL << CAP_SYS_TTY_CONFIG) |
135 (1ULL << CAP_SYS_RESOURCE) |
136 (1ULL << CAP_SYS_BOOT) |
137 (1ULL << CAP_AUDIT_WRITE) |
138 (1ULL << CAP_AUDIT_CONTROL) |
139 (1ULL << CAP_MKNOD);
140 static char **arg_bind = NULL;
141 static char **arg_bind_ro = NULL;
142 static char **arg_setenv = NULL;
143 static bool arg_quiet = false;
144 static bool arg_share_system = false;
145 static bool arg_register = true;
146 static bool arg_keep_unit = false;
147 static char **arg_network_interfaces = NULL;
148 static char **arg_network_macvlan = NULL;
149 static bool arg_network_veth = false;
150 static const char *arg_network_bridge = NULL;
151 static unsigned long arg_personality = 0xffffffffLU;
152 static const char *arg_image = NULL;
153
154 static int help(void) {
155
156 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
158 " -h --help Show this help\n"
159 " --version Print version string\n"
160 " -q --quiet Do not show status information\n"
161 " -D --directory=PATH Root directory for the container\n"
162 " -i --image=PATH File system device or image for the container\n"
163 " -b --boot Boot up full system (i.e. invoke init)\n"
164 " -u --user=USER Run the command under specified user or uid\n"
165 " -M --machine=NAME Set the machine name for the container\n"
166 " --uuid=UUID Set a specific machine UUID for the container\n"
167 " -S --slice=SLICE Place the container in the specified slice\n"
168 " --private-network Disable network in container\n"
169 " --network-interface=INTERFACE\n"
170 " Assign an existing network interface to the\n"
171 " container\n"
172 " --network-macvlan=INTERFACE\n"
173 " Create a macvlan network interface based on an\n"
174 " existing network interface to the container\n"
175 " --network-veth Add a virtual ethernet connection between host\n"
176 " and container\n"
177 " --network-bridge=INTERFACE\n"
178 " Add a virtual ethernet connection between host\n"
179 " and container and add it to an existing bridge on\n"
180 " the host\n"
181 " -Z --selinux-context=SECLABEL\n"
182 " Set the SELinux security context to be used by\n"
183 " processes in the container\n"
184 " -L --selinux-apifs-context=SECLABEL\n"
185 " Set the SELinux security context to be used by\n"
186 " API/tmpfs file systems in the container\n"
187 " --capability=CAP In addition to the default, retain specified\n"
188 " capability\n"
189 " --drop-capability=CAP Drop the specified capability from the default set\n"
190 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
191 " -j Equivalent to --link-journal=host\n"
192 " --read-only Mount the root directory read-only\n"
193 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
194 " the container\n"
195 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
196 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
197 " --share-system Share system namespaces with host\n"
198 " --register=BOOLEAN Register container as machine\n"
199 " --keep-unit Do not register a scope for the machine, reuse\n"
200 " the service unit nspawn is running in\n",
201 program_invocation_short_name);
202
203 return 0;
204 }
205
206 static int parse_argv(int argc, char *argv[]) {
207
208 enum {
209 ARG_VERSION = 0x100,
210 ARG_PRIVATE_NETWORK,
211 ARG_UUID,
212 ARG_READ_ONLY,
213 ARG_CAPABILITY,
214 ARG_DROP_CAPABILITY,
215 ARG_LINK_JOURNAL,
216 ARG_BIND,
217 ARG_BIND_RO,
218 ARG_SETENV,
219 ARG_SHARE_SYSTEM,
220 ARG_REGISTER,
221 ARG_KEEP_UNIT,
222 ARG_NETWORK_INTERFACE,
223 ARG_NETWORK_MACVLAN,
224 ARG_NETWORK_VETH,
225 ARG_NETWORK_BRIDGE,
226 ARG_PERSONALITY,
227 };
228
229 static const struct option options[] = {
230 { "help", no_argument, NULL, 'h' },
231 { "version", no_argument, NULL, ARG_VERSION },
232 { "directory", required_argument, NULL, 'D' },
233 { "user", required_argument, NULL, 'u' },
234 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
235 { "boot", no_argument, NULL, 'b' },
236 { "uuid", required_argument, NULL, ARG_UUID },
237 { "read-only", no_argument, NULL, ARG_READ_ONLY },
238 { "capability", required_argument, NULL, ARG_CAPABILITY },
239 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
240 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
241 { "bind", required_argument, NULL, ARG_BIND },
242 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
243 { "machine", required_argument, NULL, 'M' },
244 { "slice", required_argument, NULL, 'S' },
245 { "setenv", required_argument, NULL, ARG_SETENV },
246 { "selinux-context", required_argument, NULL, 'Z' },
247 { "selinux-apifs-context", required_argument, NULL, 'L' },
248 { "quiet", no_argument, NULL, 'q' },
249 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
250 { "register", required_argument, NULL, ARG_REGISTER },
251 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
252 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
253 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
254 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
255 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
256 { "personality", required_argument, NULL, ARG_PERSONALITY },
257 { "image", required_argument, NULL, 'i' },
258 {}
259 };
260
261 int c, r;
262 uint64_t plus = 0, minus = 0;
263
264 assert(argc >= 0);
265 assert(argv);
266
267 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
268
269 switch (c) {
270
271 case 'h':
272 return help();
273
274 case ARG_VERSION:
275 puts(PACKAGE_STRING);
276 puts(SYSTEMD_FEATURES);
277 return 0;
278
279 case 'D':
280 free(arg_directory);
281 arg_directory = canonicalize_file_name(optarg);
282 if (!arg_directory) {
283 log_error("Invalid root directory: %m");
284 return -ENOMEM;
285 }
286
287 break;
288
289 case 'i':
290 arg_image = optarg;
291 break;
292
293 case 'u':
294 free(arg_user);
295 arg_user = strdup(optarg);
296 if (!arg_user)
297 return log_oom();
298
299 break;
300
301 case ARG_NETWORK_BRIDGE:
302 arg_network_bridge = optarg;
303
304 /* fall through */
305
306 case ARG_NETWORK_VETH:
307 arg_network_veth = true;
308 arg_private_network = true;
309 break;
310
311 case ARG_NETWORK_INTERFACE:
312 if (strv_extend(&arg_network_interfaces, optarg) < 0)
313 return log_oom();
314
315 arg_private_network = true;
316 break;
317
318 case ARG_NETWORK_MACVLAN:
319 if (strv_extend(&arg_network_macvlan, optarg) < 0)
320 return log_oom();
321
322 /* fall through */
323
324 case ARG_PRIVATE_NETWORK:
325 arg_private_network = true;
326 break;
327
328 case 'b':
329 arg_boot = true;
330 break;
331
332 case ARG_UUID:
333 r = sd_id128_from_string(optarg, &arg_uuid);
334 if (r < 0) {
335 log_error("Invalid UUID: %s", optarg);
336 return r;
337 }
338 break;
339
340 case 'S':
341 arg_slice = optarg;
342 break;
343
344 case 'M':
345 if (isempty(optarg)) {
346 free(arg_machine);
347 arg_machine = NULL;
348 } else {
349
350 if (!hostname_is_valid(optarg)) {
351 log_error("Invalid machine name: %s", optarg);
352 return -EINVAL;
353 }
354
355 free(arg_machine);
356 arg_machine = strdup(optarg);
357 if (!arg_machine)
358 return log_oom();
359
360 break;
361 }
362
363 case 'Z':
364 arg_selinux_context = optarg;
365 break;
366
367 case 'L':
368 arg_selinux_apifs_context = optarg;
369 break;
370
371 case ARG_READ_ONLY:
372 arg_read_only = true;
373 break;
374
375 case ARG_CAPABILITY:
376 case ARG_DROP_CAPABILITY: {
377 char *state, *word;
378 size_t length;
379
380 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381 _cleanup_free_ char *t;
382 cap_value_t cap;
383
384 t = strndup(word, length);
385 if (!t)
386 return log_oom();
387
388 if (streq(t, "all")) {
389 if (c == ARG_CAPABILITY)
390 plus = (uint64_t) -1;
391 else
392 minus = (uint64_t) -1;
393 } else {
394 if (cap_from_name(t, &cap) < 0) {
395 log_error("Failed to parse capability %s.", t);
396 return -EINVAL;
397 }
398
399 if (c == ARG_CAPABILITY)
400 plus |= 1ULL << (uint64_t) cap;
401 else
402 minus |= 1ULL << (uint64_t) cap;
403 }
404 }
405
406 break;
407 }
408
409 case 'j':
410 arg_link_journal = LINK_GUEST;
411 break;
412
413 case ARG_LINK_JOURNAL:
414 if (streq(optarg, "auto"))
415 arg_link_journal = LINK_AUTO;
416 else if (streq(optarg, "no"))
417 arg_link_journal = LINK_NO;
418 else if (streq(optarg, "guest"))
419 arg_link_journal = LINK_GUEST;
420 else if (streq(optarg, "host"))
421 arg_link_journal = LINK_HOST;
422 else {
423 log_error("Failed to parse link journal mode %s", optarg);
424 return -EINVAL;
425 }
426
427 break;
428
429 case ARG_BIND:
430 case ARG_BIND_RO: {
431 _cleanup_free_ char *a = NULL, *b = NULL;
432 char *e;
433 char ***x;
434
435 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436
437 e = strchr(optarg, ':');
438 if (e) {
439 a = strndup(optarg, e - optarg);
440 b = strdup(e + 1);
441 } else {
442 a = strdup(optarg);
443 b = strdup(optarg);
444 }
445
446 if (!a || !b)
447 return log_oom();
448
449 if (!path_is_absolute(a) || !path_is_absolute(b)) {
450 log_error("Invalid bind mount specification: %s", optarg);
451 return -EINVAL;
452 }
453
454 r = strv_extend(x, a);
455 if (r < 0)
456 return log_oom();
457
458 r = strv_extend(x, b);
459 if (r < 0)
460 return log_oom();
461
462 break;
463 }
464
465 case ARG_SETENV: {
466 char **n;
467
468 if (!env_assignment_is_valid(optarg)) {
469 log_error("Environment variable assignment '%s' is not valid.", optarg);
470 return -EINVAL;
471 }
472
473 n = strv_env_set(arg_setenv, optarg);
474 if (!n)
475 return log_oom();
476
477 strv_free(arg_setenv);
478 arg_setenv = n;
479 break;
480 }
481
482 case 'q':
483 arg_quiet = true;
484 break;
485
486 case ARG_SHARE_SYSTEM:
487 arg_share_system = true;
488 break;
489
490 case ARG_REGISTER:
491 r = parse_boolean(optarg);
492 if (r < 0) {
493 log_error("Failed to parse --register= argument: %s", optarg);
494 return r;
495 }
496
497 arg_register = r;
498 break;
499
500 case ARG_KEEP_UNIT:
501 arg_keep_unit = true;
502 break;
503
504 case ARG_PERSONALITY:
505
506 arg_personality = personality_from_string(optarg);
507 if (arg_personality == 0xffffffffLU) {
508 log_error("Unknown or unsupported personality '%s'.", optarg);
509 return -EINVAL;
510 }
511
512 break;
513
514 case '?':
515 return -EINVAL;
516
517 default:
518 assert_not_reached("Unhandled option");
519 }
520 }
521
522 if (arg_share_system)
523 arg_register = false;
524
525 if (arg_boot && arg_share_system) {
526 log_error("--boot and --share-system may not be combined.");
527 return -EINVAL;
528 }
529
530 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531 log_error("--keep-unit may not be used when invoked from a user session.");
532 return -EINVAL;
533 }
534
535 if (arg_directory && arg_image) {
536 log_error("--directory= and --image= may not be combined.");
537 return -EINVAL;
538 }
539
540 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
541
542 return 1;
543 }
544
545 static int mount_all(const char *dest) {
546
547 typedef struct MountPoint {
548 const char *what;
549 const char *where;
550 const char *type;
551 const char *options;
552 unsigned long flags;
553 bool fatal;
554 } MountPoint;
555
556 static const MountPoint mount_table[] = {
557 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
558 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
559 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
560 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
561 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
562 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
564 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
565 #ifdef HAVE_SELINUX
566 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
567 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
568 #endif
569 };
570
571 unsigned k;
572 int r = 0;
573
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575 _cleanup_free_ char *where = NULL;
576 #ifdef HAVE_SELINUX
577 _cleanup_free_ char *options = NULL;
578 #endif
579 const char *o;
580 int t;
581
582 where = strjoin(dest, "/", mount_table[k].where, NULL);
583 if (!where)
584 return log_oom();
585
586 t = path_is_mount_point(where, true);
587 if (t < 0) {
588 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
589
590 if (r == 0)
591 r = t;
592
593 continue;
594 }
595
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && t > 0)
598 continue;
599
600 mkdir_p(where, 0755);
601
602 #ifdef HAVE_SELINUX
603 if (arg_selinux_apifs_context &&
604 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
606 if (!options)
607 return log_oom();
608
609 o = options;
610 } else
611 #endif
612 o = mount_table[k].options;
613
614
615 if (mount(mount_table[k].what,
616 where,
617 mount_table[k].type,
618 mount_table[k].flags,
619 o) < 0 &&
620 mount_table[k].fatal) {
621
622 log_error("mount(%s) failed: %m", where);
623
624 if (r == 0)
625 r = -errno;
626 }
627 }
628
629 return r;
630 }
631
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
633 char **x, **y;
634
635 STRV_FOREACH_PAIR(x, y, l) {
636 char *where;
637 struct stat source_st, dest_st;
638 int r;
639
640 if (stat(*x, &source_st) < 0) {
641 log_error("Failed to stat %s: %m", *x);
642 return -errno;
643 }
644
645 where = strappenda(dest, *y);
646 r = stat(where, &dest_st);
647 if (r == 0) {
648 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649 log_error("The file types of %s and %s do not match. Refusing bind mount",
650 *x, where);
651 return -EINVAL;
652 }
653 } else if (errno == ENOENT) {
654 r = mkdir_parents_label(where, 0755);
655 if (r < 0) {
656 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
657 return r;
658 }
659 } else {
660 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
661 return -errno;
662 }
663 /* Create the mount point, but be conservative -- refuse to create block
664 * and char devices. */
665 if (S_ISDIR(source_st.st_mode))
666 mkdir_label(where, 0755);
667 else if (S_ISFIFO(source_st.st_mode))
668 mkfifo(where, 0644);
669 else if (S_ISSOCK(source_st.st_mode))
670 mknod(where, 0644 | S_IFSOCK, 0);
671 else if (S_ISREG(source_st.st_mode))
672 touch(where);
673 else {
674 log_error("Refusing to create mountpoint for file: %s", *x);
675 return -ENOTSUP;
676 }
677
678 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679 log_error("mount(%s) failed: %m", where);
680 return -errno;
681 }
682
683 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684 log_error("mount(%s) failed: %m", where);
685 return -errno;
686 }
687 }
688
689 return 0;
690 }
691
692 static int setup_timezone(const char *dest) {
693 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
694 char *z, *y;
695 int r;
696
697 assert(dest);
698
699 /* Fix the timezone, if possible */
700 r = readlink_malloc("/etc/localtime", &p);
701 if (r < 0) {
702 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
703 return 0;
704 }
705
706 z = path_startswith(p, "../usr/share/zoneinfo/");
707 if (!z)
708 z = path_startswith(p, "/usr/share/zoneinfo/");
709 if (!z) {
710 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
711 return 0;
712 }
713
714 where = strappend(dest, "/etc/localtime");
715 if (!where)
716 return log_oom();
717
718 r = readlink_malloc(where, &q);
719 if (r >= 0) {
720 y = path_startswith(q, "../usr/share/zoneinfo/");
721 if (!y)
722 y = path_startswith(q, "/usr/share/zoneinfo/");
723
724
725 /* Already pointing to the right place? Then do nothing .. */
726 if (y && streq(y, z))
727 return 0;
728 }
729
730 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
731 if (!check)
732 return log_oom();
733
734 if (access(check, F_OK) < 0) {
735 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
736 return 0;
737 }
738
739 what = strappend("../usr/share/zoneinfo/", z);
740 if (!what)
741 return log_oom();
742
743 unlink(where);
744 if (symlink(what, where) < 0) {
745 log_error("Failed to correct timezone of container: %m");
746 return 0;
747 }
748
749 return 0;
750 }
751
752 static int setup_resolv_conf(const char *dest) {
753 char _cleanup_free_ *where = NULL;
754
755 assert(dest);
756
757 if (arg_private_network)
758 return 0;
759
760 /* Fix resolv.conf, if possible */
761 where = strappend(dest, "/etc/resolv.conf");
762 if (!where)
763 return log_oom();
764
765 /* We don't really care for the results of this really. If it
766 * fails, it fails, but meh... */
767 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
768
769 return 0;
770 }
771
772 static int setup_boot_id(const char *dest) {
773 _cleanup_free_ char *from = NULL, *to = NULL;
774 sd_id128_t rnd = {};
775 char as_uuid[37];
776 int r;
777
778 assert(dest);
779
780 if (arg_share_system)
781 return 0;
782
783 /* Generate a new randomized boot ID, so that each boot-up of
784 * the container gets a new one */
785
786 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
787 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
788 if (!from || !to)
789 return log_oom();
790
791 r = sd_id128_randomize(&rnd);
792 if (r < 0) {
793 log_error("Failed to generate random boot id: %s", strerror(-r));
794 return r;
795 }
796
797 snprintf(as_uuid, sizeof(as_uuid),
798 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
799 SD_ID128_FORMAT_VAL(rnd));
800 char_array_0(as_uuid);
801
802 r = write_string_file(from, as_uuid);
803 if (r < 0) {
804 log_error("Failed to write boot id: %s", strerror(-r));
805 return r;
806 }
807
808 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
809 log_error("Failed to bind mount boot id: %m");
810 r = -errno;
811 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
812 log_warning("Failed to make boot id read-only: %m");
813
814 unlink(from);
815 return r;
816 }
817
818 static int copy_devnodes(const char *dest) {
819
820 static const char devnodes[] =
821 "null\0"
822 "zero\0"
823 "full\0"
824 "random\0"
825 "urandom\0"
826 "tty\0";
827
828 const char *d;
829 int r = 0;
830 _cleanup_umask_ mode_t u;
831
832 assert(dest);
833
834 u = umask(0000);
835
836 NULSTR_FOREACH(d, devnodes) {
837 _cleanup_free_ char *from = NULL, *to = NULL;
838 struct stat st;
839
840 from = strappend("/dev/", d);
841 to = strjoin(dest, "/dev/", d, NULL);
842 if (!from || !to)
843 return log_oom();
844
845 if (stat(from, &st) < 0) {
846
847 if (errno != ENOENT) {
848 log_error("Failed to stat %s: %m", from);
849 return -errno;
850 }
851
852 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
853
854 log_error("%s is not a char or block device, cannot copy", from);
855 return -EIO;
856
857 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
858
859 log_error("mknod(%s) failed: %m", dest);
860 return -errno;
861 }
862 }
863
864 return r;
865 }
866
867 static int setup_ptmx(const char *dest) {
868 _cleanup_free_ char *p = NULL;
869
870 p = strappend(dest, "/dev/ptmx");
871 if (!p)
872 return log_oom();
873
874 if (symlink("pts/ptmx", p) < 0) {
875 log_error("Failed to create /dev/ptmx symlink: %m");
876 return -errno;
877 }
878
879 return 0;
880 }
881
882 static int setup_dev_console(const char *dest, const char *console) {
883 _cleanup_umask_ mode_t u;
884 const char *to;
885 struct stat st;
886 int r;
887
888 assert(dest);
889 assert(console);
890
891 u = umask(0000);
892
893 if (stat("/dev/null", &st) < 0) {
894 log_error("Failed to stat /dev/null: %m");
895 return -errno;
896 }
897
898 r = chmod_and_chown(console, 0600, 0, 0);
899 if (r < 0) {
900 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
901 return r;
902 }
903
904 /* We need to bind mount the right tty to /dev/console since
905 * ptys can only exist on pts file systems. To have something
906 * to bind mount things on we create a device node first, and
907 * use /dev/null for that since we the cgroups device policy
908 * allows us to create that freely, while we cannot create
909 * /dev/console. (Note that the major minor doesn't actually
910 * matter here, since we mount it over anyway). */
911
912 to = strappenda(dest, "/dev/console");
913 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
914 log_error("mknod() for /dev/console failed: %m");
915 return -errno;
916 }
917
918 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
919 log_error("Bind mount for /dev/console failed: %m");
920 return -errno;
921 }
922
923 return 0;
924 }
925
926 static int setup_kmsg(const char *dest, int kmsg_socket) {
927 _cleanup_free_ char *from = NULL, *to = NULL;
928 int r, fd, k;
929 _cleanup_umask_ mode_t u;
930 union {
931 struct cmsghdr cmsghdr;
932 uint8_t buf[CMSG_SPACE(sizeof(int))];
933 } control = {};
934 struct msghdr mh = {
935 .msg_control = &control,
936 .msg_controllen = sizeof(control),
937 };
938 struct cmsghdr *cmsg;
939
940 assert(dest);
941 assert(kmsg_socket >= 0);
942
943 u = umask(0000);
944
945 /* We create the kmsg FIFO as /dev/kmsg, but immediately
946 * delete it after bind mounting it to /proc/kmsg. While FIFOs
947 * on the reading side behave very similar to /proc/kmsg,
948 * their writing side behaves differently from /dev/kmsg in
949 * that writing blocks when nothing is reading. In order to
950 * avoid any problems with containers deadlocking due to this
951 * we simply make /dev/kmsg unavailable to the container. */
952 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
953 asprintf(&to, "%s/proc/kmsg", dest) < 0)
954 return log_oom();
955
956 if (mkfifo(from, 0600) < 0) {
957 log_error("mkfifo() for /dev/kmsg failed: %m");
958 return -errno;
959 }
960
961 r = chmod_and_chown(from, 0600, 0, 0);
962 if (r < 0) {
963 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
964 return r;
965 }
966
967 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
968 log_error("Bind mount for /proc/kmsg failed: %m");
969 return -errno;
970 }
971
972 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
973 if (fd < 0) {
974 log_error("Failed to open fifo: %m");
975 return -errno;
976 }
977
978 cmsg = CMSG_FIRSTHDR(&mh);
979 cmsg->cmsg_level = SOL_SOCKET;
980 cmsg->cmsg_type = SCM_RIGHTS;
981 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
982 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
983
984 mh.msg_controllen = cmsg->cmsg_len;
985
986 /* Store away the fd in the socket, so that it stays open as
987 * long as we run the child */
988 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
989 safe_close(fd);
990
991 if (k < 0) {
992 log_error("Failed to send FIFO fd: %m");
993 return -errno;
994 }
995
996 /* And now make the FIFO unavailable as /dev/kmsg... */
997 unlink(from);
998 return 0;
999 }
1000
1001 static int setup_hostname(void) {
1002
1003 if (arg_share_system)
1004 return 0;
1005
1006 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1007 return -errno;
1008
1009 return 0;
1010 }
1011
1012 static int setup_journal(const char *directory) {
1013 sd_id128_t machine_id, this_id;
1014 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1015 char *id;
1016 int r;
1017
1018 p = strappend(directory, "/etc/machine-id");
1019 if (!p)
1020 return log_oom();
1021
1022 r = read_one_line_file(p, &b);
1023 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1024 return 0;
1025 else if (r < 0) {
1026 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1027 return r;
1028 }
1029
1030 id = strstrip(b);
1031 if (isempty(id) && arg_link_journal == LINK_AUTO)
1032 return 0;
1033
1034 /* Verify validity */
1035 r = sd_id128_from_string(id, &machine_id);
1036 if (r < 0) {
1037 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1038 return r;
1039 }
1040
1041 r = sd_id128_get_machine(&this_id);
1042 if (r < 0) {
1043 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1044 return r;
1045 }
1046
1047 if (sd_id128_equal(machine_id, this_id)) {
1048 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1049 "Host and machine ids are equal (%s): refusing to link journals", id);
1050 if (arg_link_journal == LINK_AUTO)
1051 return 0;
1052 return
1053 -EEXIST;
1054 }
1055
1056 if (arg_link_journal == LINK_NO)
1057 return 0;
1058
1059 free(p);
1060 p = strappend("/var/log/journal/", id);
1061 q = strjoin(directory, "/var/log/journal/", id, NULL);
1062 if (!p || !q)
1063 return log_oom();
1064
1065 if (path_is_mount_point(p, false) > 0) {
1066 if (arg_link_journal != LINK_AUTO) {
1067 log_error("%s: already a mount point, refusing to use for journal", p);
1068 return -EEXIST;
1069 }
1070
1071 return 0;
1072 }
1073
1074 if (path_is_mount_point(q, false) > 0) {
1075 if (arg_link_journal != LINK_AUTO) {
1076 log_error("%s: already a mount point, refusing to use for journal", q);
1077 return -EEXIST;
1078 }
1079
1080 return 0;
1081 }
1082
1083 r = readlink_and_make_absolute(p, &d);
1084 if (r >= 0) {
1085 if ((arg_link_journal == LINK_GUEST ||
1086 arg_link_journal == LINK_AUTO) &&
1087 path_equal(d, q)) {
1088
1089 r = mkdir_p(q, 0755);
1090 if (r < 0)
1091 log_warning("failed to create directory %s: %m", q);
1092 return 0;
1093 }
1094
1095 if (unlink(p) < 0) {
1096 log_error("Failed to remove symlink %s: %m", p);
1097 return -errno;
1098 }
1099 } else if (r == -EINVAL) {
1100
1101 if (arg_link_journal == LINK_GUEST &&
1102 rmdir(p) < 0) {
1103
1104 if (errno == ENOTDIR) {
1105 log_error("%s already exists and is neither a symlink nor a directory", p);
1106 return r;
1107 } else {
1108 log_error("Failed to remove %s: %m", p);
1109 return -errno;
1110 }
1111 }
1112 } else if (r != -ENOENT) {
1113 log_error("readlink(%s) failed: %m", p);
1114 return r;
1115 }
1116
1117 if (arg_link_journal == LINK_GUEST) {
1118
1119 if (symlink(q, p) < 0) {
1120 log_error("Failed to symlink %s to %s: %m", q, p);
1121 return -errno;
1122 }
1123
1124 r = mkdir_p(q, 0755);
1125 if (r < 0)
1126 log_warning("failed to create directory %s: %m", q);
1127 return 0;
1128 }
1129
1130 if (arg_link_journal == LINK_HOST) {
1131 r = mkdir_p(p, 0755);
1132 if (r < 0) {
1133 log_error("Failed to create %s: %m", p);
1134 return r;
1135 }
1136
1137 } else if (access(p, F_OK) < 0)
1138 return 0;
1139
1140 if (dir_is_empty(q) == 0) {
1141 log_error("%s not empty.", q);
1142 return -ENOTEMPTY;
1143 }
1144
1145 r = mkdir_p(q, 0755);
1146 if (r < 0) {
1147 log_error("Failed to create %s: %m", q);
1148 return r;
1149 }
1150
1151 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1152 log_error("Failed to bind mount journal from host into guest: %m");
1153 return -errno;
1154 }
1155
1156 return 0;
1157 }
1158
1159 static int setup_kdbus(const char *dest, const char *path) {
1160 const char *p;
1161
1162 if (!path)
1163 return 0;
1164
1165 p = strappenda(dest, "/dev/kdbus");
1166 if (mkdir(p, 0755) < 0) {
1167 log_error("Failed to create kdbus path: %m");
1168 return -errno;
1169 }
1170
1171 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1172 log_error("Failed to mount kdbus domain path: %m");
1173 return -errno;
1174 }
1175
1176 return 0;
1177 }
1178
1179 static int drop_capabilities(void) {
1180 return capability_bounding_set_drop(~arg_retain, false);
1181 }
1182
1183 static int register_machine(pid_t pid) {
1184 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1185 _cleanup_bus_unref_ sd_bus *bus = NULL;
1186 int r;
1187
1188 if (!arg_register)
1189 return 0;
1190
1191 r = sd_bus_default_system(&bus);
1192 if (r < 0) {
1193 log_error("Failed to open system bus: %s", strerror(-r));
1194 return r;
1195 }
1196
1197 if (arg_keep_unit) {
1198 r = sd_bus_call_method(
1199 bus,
1200 "org.freedesktop.machine1",
1201 "/org/freedesktop/machine1",
1202 "org.freedesktop.machine1.Manager",
1203 "RegisterMachine",
1204 &error,
1205 NULL,
1206 "sayssus",
1207 arg_machine,
1208 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1209 "nspawn",
1210 "container",
1211 (uint32_t) pid,
1212 strempty(arg_directory));
1213 } else {
1214 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1215
1216 r = sd_bus_message_new_method_call(
1217 bus,
1218 &m,
1219 "org.freedesktop.machine1",
1220 "/org/freedesktop/machine1",
1221 "org.freedesktop.machine1.Manager",
1222 "CreateMachine");
1223 if (r < 0) {
1224 log_error("Failed to create message: %s", strerror(-r));
1225 return r;
1226 }
1227
1228 r = sd_bus_message_append(
1229 m,
1230 "sayssus",
1231 arg_machine,
1232 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1233 "nspawn",
1234 "container",
1235 (uint32_t) pid,
1236 strempty(arg_directory));
1237 if (r < 0) {
1238 log_error("Failed to append message arguments: %s", strerror(-r));
1239 return r;
1240 }
1241
1242 r = sd_bus_message_open_container(m, 'a', "(sv)");
1243 if (r < 0) {
1244 log_error("Failed to open container: %s", strerror(-r));
1245 return r;
1246 }
1247
1248 if (!isempty(arg_slice)) {
1249 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1250 if (r < 0) {
1251 log_error("Failed to append slice: %s", strerror(-r));
1252 return r;
1253 }
1254 }
1255
1256 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1257 if (r < 0) {
1258 log_error("Failed to add device policy: %s", strerror(-r));
1259 return r;
1260 }
1261
1262 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1263 /* Allow the container to
1264 * access and create the API
1265 * device nodes, so that
1266 * PrivateDevices= in the
1267 * container can work
1268 * fine */
1269 "/dev/null", "rwm",
1270 "/dev/zero", "rwm",
1271 "/dev/full", "rwm",
1272 "/dev/random", "rwm",
1273 "/dev/urandom", "rwm",
1274 "/dev/tty", "rwm",
1275 /* Allow the container
1276 * access to ptys. However,
1277 * do not permit the
1278 * container to ever create
1279 * these device nodes. */
1280 "/dev/pts/ptmx", "rw",
1281 "char-pts", "rw",
1282 /* Allow the container
1283 * access to all kdbus
1284 * devices. Again, the
1285 * container cannot create
1286 * these nodes, only use
1287 * them. We use a pretty
1288 * open match here, so that
1289 * the kernel API can still
1290 * change. */
1291 "char-kdbus", "rw",
1292 "char-kdbus/*", "rw");
1293 if (r < 0) {
1294 log_error("Failed to add device whitelist: %s", strerror(-r));
1295 return r;
1296 }
1297
1298 r = sd_bus_message_close_container(m);
1299 if (r < 0) {
1300 log_error("Failed to close container: %s", strerror(-r));
1301 return r;
1302 }
1303
1304 r = sd_bus_call(bus, m, 0, &error, NULL);
1305 }
1306
1307 if (r < 0) {
1308 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1309 return r;
1310 }
1311
1312 return 0;
1313 }
1314
1315 static int terminate_machine(pid_t pid) {
1316 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1317 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1318 _cleanup_bus_unref_ sd_bus *bus = NULL;
1319 const char *path;
1320 int r;
1321
1322 if (!arg_register)
1323 return 0;
1324
1325 r = sd_bus_default_system(&bus);
1326 if (r < 0) {
1327 log_error("Failed to open system bus: %s", strerror(-r));
1328 return r;
1329 }
1330
1331 r = sd_bus_call_method(
1332 bus,
1333 "org.freedesktop.machine1",
1334 "/org/freedesktop/machine1",
1335 "org.freedesktop.machine1.Manager",
1336 "GetMachineByPID",
1337 &error,
1338 &reply,
1339 "u",
1340 (uint32_t) pid);
1341 if (r < 0) {
1342 /* Note that the machine might already have been
1343 * cleaned up automatically, hence don't consider it a
1344 * failure if we cannot get the machine object. */
1345 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1346 return 0;
1347 }
1348
1349 r = sd_bus_message_read(reply, "o", &path);
1350 if (r < 0)
1351 return bus_log_parse_error(r);
1352
1353 r = sd_bus_call_method(
1354 bus,
1355 "org.freedesktop.machine1",
1356 path,
1357 "org.freedesktop.machine1.Machine",
1358 "Terminate",
1359 &error,
1360 NULL,
1361 NULL);
1362 if (r < 0) {
1363 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1364 return 0;
1365 }
1366
1367 return 0;
1368 }
1369
1370 static int reset_audit_loginuid(void) {
1371 _cleanup_free_ char *p = NULL;
1372 int r;
1373
1374 if (arg_share_system)
1375 return 0;
1376
1377 r = read_one_line_file("/proc/self/loginuid", &p);
1378 if (r == -ENOENT)
1379 return 0;
1380 if (r < 0) {
1381 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1382 return r;
1383 }
1384
1385 /* Already reset? */
1386 if (streq(p, "4294967295"))
1387 return 0;
1388
1389 r = write_string_file("/proc/self/loginuid", "4294967295");
1390 if (r < 0) {
1391 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1392 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1393 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1394 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1395 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1396
1397 sleep(5);
1398 }
1399
1400 return 0;
1401 }
1402
1403 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1404
1405 static int get_mac(struct ether_addr *mac) {
1406 int r;
1407
1408 uint8_t result[8];
1409 size_t l, sz;
1410 uint8_t *v;
1411
1412 l = strlen(arg_machine);
1413 sz = sizeof(sd_id128_t) + l;
1414 v = alloca(sz);
1415
1416 /* fetch some persistent data unique to the host */
1417 r = sd_id128_get_machine((sd_id128_t*) v);
1418 if (r < 0)
1419 return r;
1420
1421 /* combine with some data unique (on this host) to this
1422 * container instance */
1423 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1424
1425 /* Let's hash the host machine ID plus the container name. We
1426 * use a fixed, but originally randomly created hash key here. */
1427 siphash24(result, v, sz, HASH_KEY.bytes);
1428
1429 assert_cc(ETH_ALEN <= sizeof(result));
1430 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1431
1432 /* see eth_random_addr in the kernel */
1433 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1434 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1435
1436 return 0;
1437 }
1438
1439 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1440 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1441 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1442 struct ether_addr mac;
1443 int r;
1444
1445 if (!arg_private_network)
1446 return 0;
1447
1448 if (!arg_network_veth)
1449 return 0;
1450
1451 /* Use two different interface name prefixes depending whether
1452 * we are in bridge mode or not. */
1453 if (arg_network_bridge)
1454 memcpy(iface_name, "vb-", 3);
1455 else
1456 memcpy(iface_name, "ve-", 3);
1457 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1458
1459 r = get_mac(&mac);
1460 if (r < 0) {
1461 log_error("Failed to generate predictable MAC address for host0");
1462 return r;
1463 }
1464
1465 r = sd_rtnl_open(&rtnl, 0);
1466 if (r < 0) {
1467 log_error("Failed to connect to netlink: %s", strerror(-r));
1468 return r;
1469 }
1470
1471 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1472 if (r < 0) {
1473 log_error("Failed to allocate netlink message: %s", strerror(-r));
1474 return r;
1475 }
1476
1477 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1478 if (r < 0) {
1479 log_error("Failed to add netlink interface name: %s", strerror(-r));
1480 return r;
1481 }
1482
1483 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1484 if (r < 0) {
1485 log_error("Failed to open netlink container: %s", strerror(-r));
1486 return r;
1487 }
1488
1489 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1490 if (r < 0) {
1491 log_error("Failed to open netlink container: %s", strerror(-r));
1492 return r;
1493 }
1494
1495 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1496 if (r < 0) {
1497 log_error("Failed to open netlink container: %s", strerror(-r));
1498 return r;
1499 }
1500
1501 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1502 if (r < 0) {
1503 log_error("Failed to add netlink interface name: %s", strerror(-r));
1504 return r;
1505 }
1506
1507 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1508 if (r < 0) {
1509 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1510 return r;
1511 }
1512
1513 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1514 if (r < 0) {
1515 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1516 return r;
1517 }
1518
1519 r = sd_rtnl_message_close_container(m);
1520 if (r < 0) {
1521 log_error("Failed to close netlink container: %s", strerror(-r));
1522 return r;
1523 }
1524
1525 r = sd_rtnl_message_close_container(m);
1526 if (r < 0) {
1527 log_error("Failed to close netlink container: %s", strerror(-r));
1528 return r;
1529 }
1530
1531 r = sd_rtnl_message_close_container(m);
1532 if (r < 0) {
1533 log_error("Failed to close netlink container: %s", strerror(-r));
1534 return r;
1535 }
1536
1537 r = sd_rtnl_call(rtnl, m, 0, NULL);
1538 if (r < 0) {
1539 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1540 return r;
1541 }
1542
1543 return 0;
1544 }
1545
1546 static int setup_bridge(const char veth_name[]) {
1547 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1548 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1549 int r, bridge;
1550
1551 if (!arg_private_network)
1552 return 0;
1553
1554 if (!arg_network_veth)
1555 return 0;
1556
1557 if (!arg_network_bridge)
1558 return 0;
1559
1560 bridge = (int) if_nametoindex(arg_network_bridge);
1561 if (bridge <= 0) {
1562 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1563 return -errno;
1564 }
1565
1566 r = sd_rtnl_open(&rtnl, 0);
1567 if (r < 0) {
1568 log_error("Failed to connect to netlink: %s", strerror(-r));
1569 return r;
1570 }
1571
1572 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1573 if (r < 0) {
1574 log_error("Failed to allocate netlink message: %s", strerror(-r));
1575 return r;
1576 }
1577
1578 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1579 if (r < 0) {
1580 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1581 return r;
1582 }
1583
1584 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1585 if (r < 0) {
1586 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1587 return r;
1588 }
1589
1590 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1591 if (r < 0) {
1592 log_error("Failed to add netlink master field: %s", strerror(-r));
1593 return r;
1594 }
1595
1596 r = sd_rtnl_call(rtnl, m, 0, NULL);
1597 if (r < 0) {
1598 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1599 return r;
1600 }
1601
1602 return 0;
1603 }
1604
1605 static int parse_interface(struct udev *udev, const char *name) {
1606 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1607 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1608 int ifi;
1609
1610 ifi = (int) if_nametoindex(name);
1611 if (ifi <= 0) {
1612 log_error("Failed to resolve interface %s: %m", name);
1613 return -errno;
1614 }
1615
1616 sprintf(ifi_str, "n%i", ifi);
1617 d = udev_device_new_from_device_id(udev, ifi_str);
1618 if (!d) {
1619 log_error("Failed to get udev device for interface %s: %m", name);
1620 return -errno;
1621 }
1622
1623 if (udev_device_get_is_initialized(d) <= 0) {
1624 log_error("Network interface %s is not initialized yet.", name);
1625 return -EBUSY;
1626 }
1627
1628 return ifi;
1629 }
1630
1631 static int move_network_interfaces(pid_t pid) {
1632 _cleanup_udev_unref_ struct udev *udev = NULL;
1633 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1634 char **i;
1635 int r;
1636
1637 if (!arg_private_network)
1638 return 0;
1639
1640 if (strv_isempty(arg_network_interfaces))
1641 return 0;
1642
1643 r = sd_rtnl_open(&rtnl, 0);
1644 if (r < 0) {
1645 log_error("Failed to connect to netlink: %s", strerror(-r));
1646 return r;
1647 }
1648
1649 udev = udev_new();
1650 if (!udev) {
1651 log_error("Failed to connect to udev.");
1652 return -ENOMEM;
1653 }
1654
1655 STRV_FOREACH(i, arg_network_interfaces) {
1656 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1657 int ifi;
1658
1659 ifi = parse_interface(udev, *i);
1660 if (ifi < 0)
1661 return ifi;
1662
1663 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1664 if (r < 0) {
1665 log_error("Failed to allocate netlink message: %s", strerror(-r));
1666 return r;
1667 }
1668
1669 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1670 if (r < 0) {
1671 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1672 return r;
1673 }
1674
1675 r = sd_rtnl_call(rtnl, m, 0, NULL);
1676 if (r < 0) {
1677 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1678 return r;
1679 }
1680 }
1681
1682 return 0;
1683 }
1684
1685 static int setup_macvlan(pid_t pid) {
1686 _cleanup_udev_unref_ struct udev *udev = NULL;
1687 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1688 char **i;
1689 int r;
1690
1691 if (!arg_private_network)
1692 return 0;
1693
1694 if (strv_isempty(arg_network_macvlan))
1695 return 0;
1696
1697 r = sd_rtnl_open(&rtnl, 0);
1698 if (r < 0) {
1699 log_error("Failed to connect to netlink: %s", strerror(-r));
1700 return r;
1701 }
1702
1703 udev = udev_new();
1704 if (!udev) {
1705 log_error("Failed to connect to udev.");
1706 return -ENOMEM;
1707 }
1708
1709 STRV_FOREACH(i, arg_network_macvlan) {
1710 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1711 _cleanup_free_ char *n = NULL;
1712 int ifi;
1713
1714 ifi = parse_interface(udev, *i);
1715 if (ifi < 0)
1716 return ifi;
1717
1718 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1719 if (r < 0) {
1720 log_error("Failed to allocate netlink message: %s", strerror(-r));
1721 return r;
1722 }
1723
1724 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1725 if (r < 0) {
1726 log_error("Failed to add netlink interface index: %s", strerror(-r));
1727 return r;
1728 }
1729
1730 n = strappend("mv-", *i);
1731 if (!n)
1732 return log_oom();
1733
1734 strshorten(n, IFNAMSIZ-1);
1735
1736 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1737 if (r < 0) {
1738 log_error("Failed to add netlink interface name: %s", strerror(-r));
1739 return r;
1740 }
1741
1742 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1743 if (r < 0) {
1744 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1745 return r;
1746 }
1747
1748 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1749 if (r < 0) {
1750 log_error("Failed to open netlink container: %s", strerror(-r));
1751 return r;
1752 }
1753
1754 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1755 if (r < 0) {
1756 log_error("Failed to open netlink container: %s", strerror(-r));
1757 return r;
1758 }
1759
1760 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1761 if (r < 0) {
1762 log_error("Failed to append macvlan mode: %s", strerror(-r));
1763 return r;
1764 }
1765
1766 r = sd_rtnl_message_close_container(m);
1767 if (r < 0) {
1768 log_error("Failed to close netlink container: %s", strerror(-r));
1769 return r;
1770 }
1771
1772 r = sd_rtnl_message_close_container(m);
1773 if (r < 0) {
1774 log_error("Failed to close netlink container: %s", strerror(-r));
1775 return r;
1776 }
1777
1778 r = sd_rtnl_call(rtnl, m, 0, NULL);
1779 if (r < 0) {
1780 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1781 return r;
1782 }
1783 }
1784
1785 return 0;
1786 }
1787
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT. Returns 0 on success or a negative errno-style value.
 * Without HAVE_SECCOMP this does nothing and returns 0. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int r;

        /*
          Audit does not work inside containers, and much of the
          userspace audit hookup fails when run there. We do not care
          and simply prevent audit sockets from being created.

          socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with
          EAFNOSUPPORT, which audit userspace takes as the indication
          that audit is disabled in the kernel.
        */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (filter == NULL)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        /* Match on the first (domain) and third (protocol) socket() argument */
        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path once it was allocated */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1844
1845 static int setup_image(char **device_path, int *loop_nr) {
1846 struct loop_info64 info = {
1847 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1848 };
1849 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1850 _cleanup_free_ char* loopdev = NULL;
1851 struct stat st;
1852 int r, nr;
1853
1854 assert(device_path);
1855 assert(loop_nr);
1856
1857 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1858 if (fd < 0) {
1859 log_error("Failed to open %s: %m", arg_image);
1860 return -errno;
1861 }
1862
1863 if (fstat(fd, &st) < 0) {
1864 log_error("Failed to stat %s: %m", arg_image);
1865 return -errno;
1866 }
1867
1868 if (S_ISBLK(st.st_mode)) {
1869 char *p;
1870
1871 p = strdup(arg_image);
1872 if (!p)
1873 return log_oom();
1874
1875 *device_path = p;
1876
1877 *loop_nr = -1;
1878
1879 r = fd;
1880 fd = -1;
1881
1882 return r;
1883 }
1884
1885 if (!S_ISREG(st.st_mode)) {
1886 log_error("%s is not a regular file or block device: %m", arg_image);
1887 return -EINVAL;
1888 }
1889
1890 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1891 if (control < 0) {
1892 log_error("Failed to open /dev/loop-control: %m");
1893 return -errno;
1894 }
1895
1896 nr = ioctl(control, LOOP_CTL_GET_FREE);
1897 if (nr < 0) {
1898 log_error("Failed to allocate loop device: %m");
1899 return -errno;
1900 }
1901
1902 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1903 return log_oom();
1904
1905 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1906 if (loop < 0) {
1907 log_error("Failed to open loop device %s: %m", loopdev);
1908 return -errno;
1909 }
1910
1911 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1912 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1913 return -errno;
1914 }
1915
1916 if (arg_read_only)
1917 info.lo_flags |= LO_FLAGS_READ_ONLY;
1918
1919 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1920 log_error("Failed to set loopback settings on %s: %m", loopdev);
1921 return -errno;
1922 }
1923
1924 *device_path = loopdev;
1925 loopdev = NULL;
1926
1927 *loop_nr = nr;
1928
1929 r = loop;
1930 loop = -1;
1931
1932 return r;
1933 }
1934
#ifdef HAVE_BLKID
/* Records 'node' as the device for one partition role, unless a device with
 * a lower or equal partition number has been recorded for that role already.
 * On success *device holds a newly allocated copy of 'node', and
 * *device_nr / *device_rw the partition number and writability. Returns 0
 * (also when the candidate was not taken) or -ENOMEM. */
static int dissect_image_pick(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = rw;

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Looks for the root, home and srv partitions on the GPT partitioned block
 * device open as 'fd'.
 *
 * The partition table is read with libblkid, and the device nodes of the
 * partitions are looked up by enumerating the udev children of the block
 * device. Partitions carrying GPT_FLAG_NO_AUTO are ignored. For each role
 * the partition with the lowest partition number is used. If no native root
 * partition is found, a secondary root partition is used, if there is one.
 *
 * On success 0 is returned. *root_device, *root_device_rw and *secondary are
 * always set then; *home_device/*home_device_rw and *srv_device/
 * *srv_device_rw are only written if a matching partition was found. The
 * returned strings are owned by the caller. On failure a negative
 * errno-style value is returned; -ENOTSUP if compiled without blkid. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the call, so that an unset errno
         * afterwards can be told apart and is reported as OOM */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that have the block device as parent */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the device number back to the partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        r = 0;

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass ownership of the strings to the caller. The local
         * pointers are reset, so that the cleanup handlers do not free
         * what is returned. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2184
/* Mounts the block device 'what' on 'where' with 'directory' appended (or
 * on 'where' itself if 'directory' is NULL), using the file system type
 * that libblkid detects on the device. The mount is read-only if 'rw' is
 * false or arg_read_only is set. LUKS volumes are refused with -ENOTSUP.
 * Returns 0 on success or a negative errno-style value; -ENOTSUP if
 * compiled without blkid. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2
         * if the result is ambivalent: in both cases the type cannot be
         * determined. -1 is an actual probing error and is handled by
         * the branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2250
/* Mounts the given partitions below 'where': 'root_device' on 'where'
 * itself, 'home_device' on "<where>/home" and 'srv_device' on
 * "<where>/srv". Each device may be NULL, in which case it is skipped. The
 * *_rw flags are passed on to mount_device(). Returns 0 on success or the
 * negative errno-style value of the first mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Everything is mounted relative to 'where' rather than to the
         * global arg_directory, so that the parameter is honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2286
2287 static void loop_remove(int nr, int *image_fd) {
2288 _cleanup_close_ int control = -1;
2289
2290 if (nr < 0)
2291 return;
2292
2293 if (image_fd && *image_fd >= 0) {
2294 ioctl(*image_fd, LOOP_CLR_FD);
2295 *image_fd = safe_close(*image_fd);
2296 }
2297
2298 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2299 if (control < 0)
2300 return;
2301
2302 ioctl(control, LOOP_CTL_REMOVE, nr);
2303 }
2304
2305 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2306 int pipe_fds[2];
2307 pid_t pid;
2308
2309 assert(database);
2310 assert(key);
2311 assert(rpid);
2312
2313 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2314 log_error("Failed to allocate pipe: %m");
2315 return -errno;
2316 }
2317
2318 pid = fork();
2319 if (pid < 0) {
2320 log_error("Failed to fork getent child: %m");
2321 return -errno;
2322 } else if (pid == 0) {
2323 int nullfd;
2324 char *empty_env = NULL;
2325
2326 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2327 _exit(EXIT_FAILURE);
2328
2329 if (pipe_fds[0] > 2)
2330 safe_close(pipe_fds[0]);
2331 if (pipe_fds[1] > 2)
2332 safe_close(pipe_fds[1]);
2333
2334 nullfd = open("/dev/null", O_RDWR);
2335 if (nullfd < 0)
2336 _exit(EXIT_FAILURE);
2337
2338 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2339 _exit(EXIT_FAILURE);
2340
2341 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2342 _exit(EXIT_FAILURE);
2343
2344 if (nullfd > 2)
2345 safe_close(nullfd);
2346
2347 reset_all_signal_handlers();
2348 close_all_fds(NULL, 0);
2349
2350 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2351 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2352 _exit(EXIT_FAILURE);
2353 }
2354
2355 pipe_fds[1] = safe_close(pipe_fds[1]);
2356
2357 *rpid = pid;
2358
2359 return pipe_fds[0];
2360 }
2361
2362 static int change_uid_gid(char **_home) {
2363 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2364 _cleanup_free_ uid_t *uids = NULL;
2365 _cleanup_free_ char *home = NULL;
2366 _cleanup_fclose_ FILE *f = NULL;
2367 _cleanup_close_ int fd = -1;
2368 unsigned n_uids = 0;
2369 size_t sz = 0, l;
2370 uid_t uid;
2371 gid_t gid;
2372 pid_t pid;
2373 int r;
2374
2375 assert(_home);
2376
2377 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2378 /* Reset everything fully to 0, just in case */
2379
2380 if (setgroups(0, NULL) < 0) {
2381 log_error("setgroups() failed: %m");
2382 return -errno;
2383 }
2384
2385 if (setresgid(0, 0, 0) < 0) {
2386 log_error("setregid() failed: %m");
2387 return -errno;
2388 }
2389
2390 if (setresuid(0, 0, 0) < 0) {
2391 log_error("setreuid() failed: %m");
2392 return -errno;
2393 }
2394
2395 *_home = NULL;
2396 return 0;
2397 }
2398
2399 /* First, get user credentials */
2400 fd = spawn_getent("passwd", arg_user, &pid);
2401 if (fd < 0)
2402 return fd;
2403
2404 f = fdopen(fd, "r");
2405 if (!f)
2406 return log_oom();
2407 fd = -1;
2408
2409 if (!fgets(line, sizeof(line), f)) {
2410
2411 if (!ferror(f)) {
2412 log_error("Failed to resolve user %s.", arg_user);
2413 return -ESRCH;
2414 }
2415
2416 log_error("Failed to read from getent: %m");
2417 return -errno;
2418 }
2419
2420 truncate_nl(line);
2421
2422 wait_for_terminate_and_warn("getent passwd", pid);
2423
2424 x = strchr(line, ':');
2425 if (!x) {
2426 log_error("/etc/passwd entry has invalid user field.");
2427 return -EIO;
2428 }
2429
2430 u = strchr(x+1, ':');
2431 if (!u) {
2432 log_error("/etc/passwd entry has invalid password field.");
2433 return -EIO;
2434 }
2435
2436 u++;
2437 g = strchr(u, ':');
2438 if (!g) {
2439 log_error("/etc/passwd entry has invalid UID field.");
2440 return -EIO;
2441 }
2442
2443 *g = 0;
2444 g++;
2445 x = strchr(g, ':');
2446 if (!x) {
2447 log_error("/etc/passwd entry has invalid GID field.");
2448 return -EIO;
2449 }
2450
2451 *x = 0;
2452 h = strchr(x+1, ':');
2453 if (!h) {
2454 log_error("/etc/passwd entry has invalid GECOS field.");
2455 return -EIO;
2456 }
2457
2458 h++;
2459 x = strchr(h, ':');
2460 if (!x) {
2461 log_error("/etc/passwd entry has invalid home directory field.");
2462 return -EIO;
2463 }
2464
2465 *x = 0;
2466
2467 r = parse_uid(u, &uid);
2468 if (r < 0) {
2469 log_error("Failed to parse UID of user.");
2470 return -EIO;
2471 }
2472
2473 r = parse_gid(g, &gid);
2474 if (r < 0) {
2475 log_error("Failed to parse GID of user.");
2476 return -EIO;
2477 }
2478
2479 home = strdup(h);
2480 if (!home)
2481 return log_oom();
2482
2483 /* Second, get group memberships */
2484 fd = spawn_getent("initgroups", arg_user, &pid);
2485 if (fd < 0)
2486 return fd;
2487
2488 fclose(f);
2489 f = fdopen(fd, "r");
2490 if (!f)
2491 return log_oom();
2492 fd = -1;
2493
2494 if (!fgets(line, sizeof(line), f)) {
2495 if (!ferror(f)) {
2496 log_error("Failed to resolve user %s.", arg_user);
2497 return -ESRCH;
2498 }
2499
2500 log_error("Failed to read from getent: %m");
2501 return -errno;
2502 }
2503
2504 truncate_nl(line);
2505
2506 wait_for_terminate_and_warn("getent initgroups", pid);
2507
2508 /* Skip over the username and subsequent separator whitespace */
2509 x = line;
2510 x += strcspn(x, WHITESPACE);
2511 x += strspn(x, WHITESPACE);
2512
2513 FOREACH_WORD(w, l, x, state) {
2514 char c[l+1];
2515
2516 memcpy(c, w, l);
2517 c[l] = 0;
2518
2519 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2520 return log_oom();
2521
2522 r = parse_uid(c, &uids[n_uids++]);
2523 if (r < 0) {
2524 log_error("Failed to parse group data from getent.");
2525 return -EIO;
2526 }
2527 }
2528
2529 r = mkdir_parents(home, 0775);
2530 if (r < 0) {
2531 log_error("Failed to make home root directory: %s", strerror(-r));
2532 return r;
2533 }
2534
2535 r = mkdir_safe(home, 0755, uid, gid);
2536 if (r < 0 && r != -EEXIST) {
2537 log_error("Failed to make home directory: %s", strerror(-r));
2538 return r;
2539 }
2540
2541 fchown(STDIN_FILENO, uid, gid);
2542 fchown(STDOUT_FILENO, uid, gid);
2543 fchown(STDERR_FILENO, uid, gid);
2544
2545 if (setgroups(n_uids, uids) < 0) {
2546 log_error("Failed to set auxiliary groups: %m");
2547 return -errno;
2548 }
2549
2550 if (setresgid(gid, gid, gid) < 0) {
2551 log_error("setregid() failed: %m");
2552 return -errno;
2553 }
2554
2555 if (setresuid(uid, uid, uid) < 0) {
2556 log_error("setreuid() failed: %m");
2557 return -errno;
2558 }
2559
2560 if (_home) {
2561 *_home = home;
2562 home = NULL;
2563 }
2564
2565 return 0;
2566 }
2567
2568 int main(int argc, char *argv[]) {
2569
2570 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2571 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2572 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2573 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2574 _cleanup_fdset_free_ FDSet *fds = NULL;
2575 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2576 const char *console = NULL;
2577 char veth_name[IFNAMSIZ];
2578 bool secondary = false;
2579 pid_t pid = 0;
2580 sigset_t mask;
2581
2582 log_parse_environment();
2583 log_open();
2584
2585 k = parse_argv(argc, argv);
2586 if (k < 0)
2587 goto finish;
2588 else if (k == 0) {
2589 r = EXIT_SUCCESS;
2590 goto finish;
2591 }
2592
2593 if (!arg_image) {
2594 if (arg_directory) {
2595 char *p;
2596
2597 p = path_make_absolute_cwd(arg_directory);
2598 free(arg_directory);
2599 arg_directory = p;
2600 } else
2601 arg_directory = get_current_dir_name();
2602
2603 if (!arg_directory) {
2604 log_error("Failed to determine path, please use -D.");
2605 goto finish;
2606 }
2607 path_kill_slashes(arg_directory);
2608 }
2609
2610 if (!arg_machine) {
2611 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2612 if (!arg_machine) {
2613 log_oom();
2614 goto finish;
2615 }
2616
2617 hostname_cleanup(arg_machine, false);
2618 if (isempty(arg_machine)) {
2619 log_error("Failed to determine machine name automatically, please use -M.");
2620 goto finish;
2621 }
2622 }
2623
2624 if (geteuid() != 0) {
2625 log_error("Need to be root.");
2626 goto finish;
2627 }
2628
2629 if (sd_booted() <= 0) {
2630 log_error("Not running on a systemd system.");
2631 goto finish;
2632 }
2633
2634 log_close();
2635 n_fd_passed = sd_listen_fds(false);
2636 if (n_fd_passed > 0) {
2637 k = fdset_new_listen_fds(&fds, false);
2638 if (k < 0) {
2639 log_error("Failed to collect file descriptors: %s", strerror(-k));
2640 goto finish;
2641 }
2642 }
2643 fdset_close_others(fds);
2644 log_open();
2645
2646 if (arg_directory) {
2647 if (path_equal(arg_directory, "/")) {
2648 log_error("Spawning container on root directory not supported.");
2649 goto finish;
2650 }
2651
2652 if (arg_boot) {
2653 if (path_is_os_tree(arg_directory) <= 0) {
2654 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2655 goto finish;
2656 }
2657 } else {
2658 const char *p;
2659
2660 p = strappenda(arg_directory,
2661 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2662 if (access(p, F_OK) < 0) {
2663 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2664 goto finish;
2665
2666 }
2667 }
2668 } else {
2669 char template[] = "/tmp/nspawn-root-XXXXXX";
2670
2671 if (!mkdtemp(template)) {
2672 log_error("Failed to create temporary directory: %m");
2673 r = -errno;
2674 goto finish;
2675 }
2676
2677 arg_directory = strdup(template);
2678 if (!arg_directory) {
2679 r = log_oom();
2680 goto finish;
2681 }
2682
2683 image_fd = setup_image(&device_path, &loop_nr);
2684 if (image_fd < 0) {
2685 r = image_fd;
2686 goto finish;
2687 }
2688
2689 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2690 if (r < 0)
2691 goto finish;
2692 }
2693
2694 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2695 if (master < 0) {
2696 log_error("Failed to acquire pseudo tty: %m");
2697 goto finish;
2698 }
2699
2700 console = ptsname(master);
2701 if (!console) {
2702 log_error("Failed to determine tty name: %m");
2703 goto finish;
2704 }
2705
2706 if (!arg_quiet)
2707 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2708
2709 if (unlockpt(master) < 0) {
2710 log_error("Failed to unlock tty: %m");
2711 goto finish;
2712 }
2713
2714 if (access("/dev/kdbus/control", F_OK) >= 0) {
2715
2716 if (arg_share_system) {
2717 kdbus_domain = strdup("/dev/kdbus");
2718 if (!kdbus_domain) {
2719 log_oom();
2720 goto finish;
2721 }
2722 } else {
2723 const char *ns;
2724
2725 ns = strappenda("machine-", arg_machine);
2726 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2727 if (r < 0)
2728 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2729 else
2730 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2731 }
2732 }
2733
2734 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2735 log_error("Failed to create kmsg socket pair: %m");
2736 goto finish;
2737 }
2738
2739 sd_notify(0, "READY=1");
2740
2741 assert_se(sigemptyset(&mask) == 0);
2742 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2743 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2744
2745 for (;;) {
2746 int parent_ready_fd = -1, child_ready_fd = -1;
2747 siginfo_t status;
2748 eventfd_t x;
2749
2750 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2751 if (parent_ready_fd < 0) {
2752 log_error("Failed to create event fd: %m");
2753 goto finish;
2754 }
2755
2756 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2757 if (child_ready_fd < 0) {
2758 log_error("Failed to create event fd: %m");
2759 goto finish;
2760 }
2761
2762 pid = syscall(__NR_clone,
2763 SIGCHLD|CLONE_NEWNS|
2764 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2765 (arg_private_network ? CLONE_NEWNET : 0), NULL);
2766 if (pid < 0) {
2767 if (errno == EINVAL)
2768 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2769 else
2770 log_error("clone() failed: %m");
2771
2772 goto finish;
2773 }
2774
2775 if (pid == 0) {
2776 /* child */
2777 _cleanup_free_ char *home = NULL;
2778 unsigned n_env = 2;
2779 const char *envp[] = {
2780 "PATH=" DEFAULT_PATH_SPLIT_USR,
2781 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2782 NULL, /* TERM */
2783 NULL, /* HOME */
2784 NULL, /* USER */
2785 NULL, /* LOGNAME */
2786 NULL, /* container_uuid */
2787 NULL, /* LISTEN_FDS */
2788 NULL, /* LISTEN_PID */
2789 NULL
2790 };
2791 char **env_use;
2792
2793 envp[n_env] = strv_find_prefix(environ, "TERM=");
2794 if (envp[n_env])
2795 n_env ++;
2796
2797 master = safe_close(master);
2798
2799 close_nointr(STDIN_FILENO);
2800 close_nointr(STDOUT_FILENO);
2801 close_nointr(STDERR_FILENO);
2802
2803 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2804
2805 reset_all_signal_handlers();
2806
2807 assert_se(sigemptyset(&mask) == 0);
2808 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2809
2810 k = open_terminal(console, O_RDWR);
2811 if (k != STDIN_FILENO) {
2812 if (k >= 0) {
2813 safe_close(k);
2814 k = -EINVAL;
2815 }
2816
2817 log_error("Failed to open console: %s", strerror(-k));
2818 goto child_fail;
2819 }
2820
2821 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2822 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2823 log_error("Failed to duplicate console: %m");
2824 goto child_fail;
2825 }
2826
2827 if (setsid() < 0) {
2828 log_error("setsid() failed: %m");
2829 goto child_fail;
2830 }
2831
2832 if (reset_audit_loginuid() < 0)
2833 goto child_fail;
2834
2835 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2836 log_error("PR_SET_PDEATHSIG failed: %m");
2837 goto child_fail;
2838 }
2839
2840 /* Mark everything as slave, so that we still
2841 * receive mounts from the real root, but don't
2842 * propagate mounts to the real root. */
2843 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2844 log_error("MS_SLAVE|MS_REC failed: %m");
2845 goto child_fail;
2846 }
2847
2848 if (mount_devices(arg_directory,
2849 root_device, root_device_rw,
2850 home_device, home_device_rw,
2851 srv_device, srv_device_rw) < 0)
2852 goto child_fail;
2853
2854 /* Turn directory into bind mount */
2855 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2856 log_error("Failed to make bind mount.");
2857 goto child_fail;
2858 }
2859
2860 if (arg_read_only)
2861 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2862 log_error("Failed to make read-only.");
2863 goto child_fail;
2864 }
2865
2866 if (mount_all(arg_directory) < 0)
2867 goto child_fail;
2868
2869 if (copy_devnodes(arg_directory) < 0)
2870 goto child_fail;
2871
2872 if (setup_ptmx(arg_directory) < 0)
2873 goto child_fail;
2874
2875 dev_setup(arg_directory);
2876
2877 if (audit_still_doesnt_work_in_containers() < 0)
2878 goto child_fail;
2879
2880 if (setup_dev_console(arg_directory, console) < 0)
2881 goto child_fail;
2882
2883 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2884 goto child_fail;
2885
2886 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2887
2888 if (setup_boot_id(arg_directory) < 0)
2889 goto child_fail;
2890
2891 if (setup_timezone(arg_directory) < 0)
2892 goto child_fail;
2893
2894 if (setup_resolv_conf(arg_directory) < 0)
2895 goto child_fail;
2896
2897 if (setup_journal(arg_directory) < 0)
2898 goto child_fail;
2899
2900 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2901 goto child_fail;
2902
2903 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2904 goto child_fail;
2905
2906 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2907 goto child_fail;
2908
2909 /* Tell the parent that we are ready, and that
2910 * it can cgroupify us to that we lack access
2911 * to certain devices and resources. */
2912 eventfd_write(child_ready_fd, 1);
2913 child_ready_fd = safe_close(child_ready_fd);
2914
2915 if (chdir(arg_directory) < 0) {
2916 log_error("chdir(%s) failed: %m", arg_directory);
2917 goto child_fail;
2918 }
2919
2920 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2921 log_error("mount(MS_MOVE) failed: %m");
2922 goto child_fail;
2923 }
2924
2925 if (chroot(".") < 0) {
2926 log_error("chroot() failed: %m");
2927 goto child_fail;
2928 }
2929
2930 if (chdir("/") < 0) {
2931 log_error("chdir() failed: %m");
2932 goto child_fail;
2933 }
2934
2935 umask(0022);
2936
2937 if (arg_private_network)
2938 loopback_setup();
2939
2940 if (drop_capabilities() < 0) {
2941 log_error("drop_capabilities() failed: %m");
2942 goto child_fail;
2943 }
2944
2945 r = change_uid_gid(&home);
2946 if (r < 0)
2947 goto child_fail;
2948
2949 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2950 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2951 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2952 log_oom();
2953 goto child_fail;
2954 }
2955
2956 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2957 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2958 log_oom();
2959 goto child_fail;
2960 }
2961 }
2962
2963 if (fdset_size(fds) > 0) {
2964 k = fdset_cloexec(fds, false);
2965 if (k < 0) {
2966 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2967 goto child_fail;
2968 }
2969
2970 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2971 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2972 log_oom();
2973 goto child_fail;
2974 }
2975 }
2976
2977 setup_hostname();
2978
2979 if (arg_personality != 0xffffffffLU) {
2980 if (personality(arg_personality) < 0) {
2981 log_error("personality() failed: %m");
2982 goto child_fail;
2983 }
2984 } else if (secondary) {
2985 if (personality(PER_LINUX32) < 0) {
2986 log_error("personality() failed: %m");
2987 goto child_fail;
2988 }
2989 }
2990
2991 #ifdef HAVE_SELINUX
2992 if (arg_selinux_context)
2993 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
2994 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2995 goto child_fail;
2996 }
2997 #endif
2998
2999 if (!strv_isempty(arg_setenv)) {
3000 char **n;
3001
3002 n = strv_env_merge(2, envp, arg_setenv);
3003 if (!n) {
3004 log_oom();
3005 goto child_fail;
3006 }
3007
3008 env_use = n;
3009 } else
3010 env_use = (char**) envp;
3011
3012 /* Wait until the parent is ready with the setup, too... */
3013 eventfd_read(parent_ready_fd, &x);
3014 parent_ready_fd = safe_close(parent_ready_fd);
3015
3016 if (arg_boot) {
3017 char **a;
3018 size_t l;
3019
3020 /* Automatically search for the init system */
3021
3022 l = 1 + argc - optind;
3023 a = newa(char*, l + 1);
3024 memcpy(a + 1, argv + optind, l * sizeof(char*));
3025
3026 a[0] = (char*) "/usr/lib/systemd/systemd";
3027 execve(a[0], a, env_use);
3028
3029 a[0] = (char*) "/lib/systemd/systemd";
3030 execve(a[0], a, env_use);
3031
3032 a[0] = (char*) "/sbin/init";
3033 execve(a[0], a, env_use);
3034 } else if (argc > optind)
3035 execvpe(argv[optind], argv + optind, env_use);
3036 else {
3037 chdir(home ? home : "/root");
3038 execle("/bin/bash", "-bash", NULL, env_use);
3039 execle("/bin/sh", "-sh", NULL, env_use);
3040 }
3041
3042 log_error("execv() failed: %m");
3043
3044 child_fail:
3045 _exit(EXIT_FAILURE);
3046 }
3047
3048 fdset_free(fds);
3049 fds = NULL;
3050
3051 /* Wait until the child reported that it is ready with
3052 * all it needs to do with priviliges. After we got
3053 * the notification we can make the process join its
3054 * cgroup which might limit what it can do */
3055 eventfd_read(child_ready_fd, &x);
3056
3057 r = register_machine(pid);
3058 if (r < 0)
3059 goto finish;
3060
3061 r = move_network_interfaces(pid);
3062 if (r < 0)
3063 goto finish;
3064
3065 r = setup_veth(pid, veth_name);
3066 if (r < 0)
3067 goto finish;
3068
3069 r = setup_bridge(veth_name);
3070 if (r < 0)
3071 goto finish;
3072
3073 r = setup_macvlan(pid);
3074 if (r < 0)
3075 goto finish;
3076
3077 /* Notify the child that the parent is ready with all
3078 * its setup, and thtat the child can now hand over
3079 * control to the code to run inside the container. */
3080 eventfd_write(parent_ready_fd, 1);
3081
3082 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3083 if (k < 0) {
3084 r = EXIT_FAILURE;
3085 break;
3086 }
3087
3088 if (!arg_quiet)
3089 putc('\n', stdout);
3090
3091 /* Kill if it is not dead yet anyway */
3092 terminate_machine(pid);
3093
3094 /* Redundant, but better safe than sorry */
3095 kill(pid, SIGKILL);
3096
3097 k = wait_for_terminate(pid, &status);
3098 pid = 0;
3099
3100 if (k < 0) {
3101 r = EXIT_FAILURE;
3102 break;
3103 }
3104
3105 if (status.si_code == CLD_EXITED) {
3106 r = status.si_status;
3107 if (status.si_status != 0) {
3108 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3109 break;
3110 }
3111
3112 if (!arg_quiet)
3113 log_debug("Container %s exited successfully.", arg_machine);
3114 break;
3115 } else if (status.si_code == CLD_KILLED &&
3116 status.si_status == SIGINT) {
3117
3118 if (!arg_quiet)
3119 log_info("Container %s has been shut down.", arg_machine);
3120 r = 0;
3121 break;
3122 } else if (status.si_code == CLD_KILLED &&
3123 status.si_status == SIGHUP) {
3124
3125 if (!arg_quiet)
3126 log_info("Container %s is being rebooted.", arg_machine);
3127 continue;
3128 } else if (status.si_code == CLD_KILLED ||
3129 status.si_code == CLD_DUMPED) {
3130
3131 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3132 r = EXIT_FAILURE;
3133 break;
3134 } else {
3135 log_error("Container %s failed due to unknown reason.", arg_machine);
3136 r = EXIT_FAILURE;
3137 break;
3138 }
3139 }
3140
3141 finish:
3142 loop_remove(loop_nr, &image_fd);
3143
3144 if (pid > 0)
3145 kill(pid, SIGKILL);
3146
3147 free(arg_directory);
3148 free(arg_machine);
3149 free(arg_user);
3150 strv_free(arg_setenv);
3151 strv_free(arg_network_interfaces);
3152 strv_free(arg_network_macvlan);
3153 strv_free(arg_bind);
3154 strv_free(arg_bind_ro);
3155
3156 return r;
3157 }