]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: use automatic cleanup for umask
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58
/* How the container's journal directory is tied to the host's one.
 * Selected with --link-journal= (or -j), see parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* leave the journal alone */
        LINK_AUTO  = 1,   /* default mode: link up only if possible, otherwise quietly skip */
        LINK_HOST  = 2,   /* journal lives on the host and is bind mounted into the guest */
        LINK_GUEST = 3    /* journal lives in the guest, the host gets a symlink to it */
} LinkJournal;
65
66 static char *arg_directory = NULL;
67 static char *arg_user = NULL;
68 static char **arg_controllers = NULL;
69 static char *arg_uuid = NULL;
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
74 static uint64_t arg_retain =
75 (1ULL << CAP_CHOWN) |
76 (1ULL << CAP_DAC_OVERRIDE) |
77 (1ULL << CAP_DAC_READ_SEARCH) |
78 (1ULL << CAP_FOWNER) |
79 (1ULL << CAP_FSETID) |
80 (1ULL << CAP_IPC_OWNER) |
81 (1ULL << CAP_KILL) |
82 (1ULL << CAP_LEASE) |
83 (1ULL << CAP_LINUX_IMMUTABLE) |
84 (1ULL << CAP_NET_BIND_SERVICE) |
85 (1ULL << CAP_NET_BROADCAST) |
86 (1ULL << CAP_NET_RAW) |
87 (1ULL << CAP_SETGID) |
88 (1ULL << CAP_SETFCAP) |
89 (1ULL << CAP_SETPCAP) |
90 (1ULL << CAP_SETUID) |
91 (1ULL << CAP_SYS_ADMIN) |
92 (1ULL << CAP_SYS_CHROOT) |
93 (1ULL << CAP_SYS_NICE) |
94 (1ULL << CAP_SYS_PTRACE) |
95 (1ULL << CAP_SYS_TTY_CONFIG) |
96 (1ULL << CAP_SYS_RESOURCE) |
97 (1ULL << CAP_SYS_BOOT);
98
/* Prints the usage summary to stdout. Always returns 0.
 *
 * Note: -j is documented as --link-journal=guest because that is what
 * parse_argv() actually selects for it (LINK_GUEST). */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " --private-network Disable network in container\n"
               " --read-only Mount the root directory read-only\n"
               " --capability=CAP In addition to the default, retain specified capability\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
               " -j Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
118
119 static int parse_argv(int argc, char *argv[]) {
120
121 enum {
122 ARG_PRIVATE_NETWORK = 0x100,
123 ARG_UUID,
124 ARG_READ_ONLY,
125 ARG_CAPABILITY,
126 ARG_LINK_JOURNAL
127 };
128
129 static const struct option options[] = {
130 { "help", no_argument, NULL, 'h' },
131 { "directory", required_argument, NULL, 'D' },
132 { "user", required_argument, NULL, 'u' },
133 { "controllers", required_argument, NULL, 'C' },
134 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
135 { "boot", no_argument, NULL, 'b' },
136 { "uuid", required_argument, NULL, ARG_UUID },
137 { "read-only", no_argument, NULL, ARG_READ_ONLY },
138 { "capability", required_argument, NULL, ARG_CAPABILITY },
139 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
140 { NULL, 0, NULL, 0 }
141 };
142
143 int c;
144
145 assert(argc >= 0);
146 assert(argv);
147
148 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
149
150 switch (c) {
151
152 case 'h':
153 help();
154 return 0;
155
156 case 'D':
157 free(arg_directory);
158 arg_directory = canonicalize_file_name(optarg);
159 if (!arg_directory) {
160 log_error("Failed to canonicalize root directory.");
161 return -ENOMEM;
162 }
163
164 break;
165
166 case 'u':
167 free(arg_user);
168 if (!(arg_user = strdup(optarg))) {
169 log_error("Failed to duplicate user name.");
170 return -ENOMEM;
171 }
172
173 break;
174
175 case 'C':
176 strv_free(arg_controllers);
177 arg_controllers = strv_split(optarg, ",");
178 if (!arg_controllers) {
179 log_error("Failed to split controllers list.");
180 return -ENOMEM;
181 }
182 strv_uniq(arg_controllers);
183
184 break;
185
186 case ARG_PRIVATE_NETWORK:
187 arg_private_network = true;
188 break;
189
190 case 'b':
191 arg_boot = true;
192 break;
193
194 case ARG_UUID:
195 arg_uuid = optarg;
196 break;
197
198 case ARG_READ_ONLY:
199 arg_read_only = true;
200 break;
201
202 case ARG_CAPABILITY: {
203 char *state, *word;
204 size_t length;
205
206 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207 cap_value_t cap;
208 char *t;
209
210 t = strndup(word, length);
211 if (!t)
212 return log_oom();
213
214 if (cap_from_name(t, &cap) < 0) {
215 log_error("Failed to parse capability %s.", t);
216 free(t);
217 return -EINVAL;
218 }
219
220 free(t);
221 arg_retain |= 1ULL << (uint64_t) cap;
222 }
223
224 break;
225 }
226
227 case 'j':
228 arg_link_journal = LINK_GUEST;
229 break;
230
231 case ARG_LINK_JOURNAL:
232 if (streq(optarg, "auto"))
233 arg_link_journal = LINK_AUTO;
234 else if (streq(optarg, "no"))
235 arg_link_journal = LINK_NO;
236 else if (streq(optarg, "guest"))
237 arg_link_journal = LINK_GUEST;
238 else if (streq(optarg, "host"))
239 arg_link_journal = LINK_HOST;
240 else {
241 log_error("Failed to parse link journal mode %s", optarg);
242 return -EINVAL;
243 }
244
245 break;
246
247 case '?':
248 return -EINVAL;
249
250 default:
251 log_error("Unknown option code %c", c);
252 return -EINVAL;
253 }
254 }
255
256 return 1;
257 }
258
259 static int mount_all(const char *dest) {
260
261 typedef struct MountPoint {
262 const char *what;
263 const char *where;
264 const char *type;
265 const char *options;
266 unsigned long flags;
267 bool fatal;
268 } MountPoint;
269
270 static const MountPoint mount_table[] = {
271 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
272 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
273 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
274 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
276 { "/dev/pts", "/dev/pts", NULL, NULL, MS_BIND, true },
277 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
278 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
279 #ifdef HAVE_SELINUX
280 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
281 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
282 #endif
283 };
284
285 unsigned k;
286 int r = 0;
287 char _cleanup_free_ *where = NULL;
288
289 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
290 int t;
291
292 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
293 log_oom();
294
295 if (r == 0)
296 r = -ENOMEM;
297
298 break;
299 }
300
301 t = path_is_mount_point(where, true);
302 if (t < 0) {
303 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
304
305 if (r == 0)
306 r = t;
307
308 continue;
309 }
310
311 /* Skip this entry if it is not a remount. */
312 if (mount_table[k].what && t > 0)
313 continue;
314
315 mkdir_p_label(where, 0755);
316
317 if (mount(mount_table[k].what,
318 where,
319 mount_table[k].type,
320 mount_table[k].flags,
321 mount_table[k].options) < 0 &&
322 mount_table[k].fatal) {
323
324 log_error("mount(%s) failed: %m", where);
325
326 if (r == 0)
327 r = -errno;
328 }
329 }
330
331 return r;
332 }
333
/* Makes the host's time zone configuration visible in the container by
 * bind mounting the host files read-only over their counterparts below
 * "dest". Mount failures are ignored (best effort); only running out of
 * memory is reported. */
static int setup_timezone(const char *dest) {

        static const char* const host_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < ELEMENTSOF(host_files); i++) {
                char *target;

                target = strappend(dest, host_files[i]);
                if (!target)
                        return log_oom();

                /* Bind first, then flip the new mount to read-only */
                if (mount(host_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(host_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362 char *where;
363
364 assert(dest);
365
366 if (arg_private_network)
367 return 0;
368
369 /* Fix resolv.conf, if possible */
370 where = strappend(dest, "/etc/resolv.conf");
371 if (!where)
372 return log_oom();
373
374 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377 free(where);
378
379 return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383 char _cleanup_free_ *from = NULL, *to = NULL;
384 sd_id128_t rnd;
385 char as_uuid[37];
386 int r;
387
388 assert(dest);
389
390 /* Generate a new randomized boot ID, so that each boot-up of
391 * the container gets a new one */
392
393 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
395 if (!from || !to)
396 return log_oom();
397
398 r = sd_id128_randomize(&rnd);
399 if (r < 0) {
400 log_error("Failed to generate random boot id: %s", strerror(-r));
401 return r;
402 }
403
404 snprintf(as_uuid, sizeof(as_uuid),
405 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
406 SD_ID128_FORMAT_VAL(rnd));
407 char_array_0(as_uuid);
408
409 r = write_one_line_file(from, as_uuid);
410 if (r < 0) {
411 log_error("Failed to write boot id: %s", strerror(-r));
412 return r;
413 }
414
415 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
416 log_error("Failed to bind mount boot id: %m");
417 r = -errno;
418 } else
419 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
420
421 unlink(from);
422 return r;
423 }
424
425 static int copy_devnodes(const char *dest) {
426
427 static const char devnodes[] =
428 "null\0"
429 "zero\0"
430 "full\0"
431 "random\0"
432 "urandom\0"
433 "tty\0"
434 "ptmx\0";
435
436 const char *d;
437 int r = 0;
438 mode_t _cleanup_umask_ u;
439
440 assert(dest);
441
442 u = umask(0000);
443
444 NULSTR_FOREACH(d, devnodes) {
445 struct stat st;
446 char _cleanup_free_ *from = NULL, *to = NULL;
447
448 asprintf(&from, "/dev/%s", d);
449 asprintf(&to, "%s/dev/%s", dest, d);
450
451 if (!from || !to) {
452 log_oom();
453
454 if (r == 0)
455 r = -ENOMEM;
456
457 break;
458 }
459
460 if (stat(from, &st) < 0) {
461
462 if (errno != ENOENT) {
463 log_error("Failed to stat %s: %m", from);
464 if (r == 0)
465 r = -errno;
466 }
467
468 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
469
470 log_error("%s is not a char or block device, cannot copy", from);
471 if (r == 0)
472 r = -EIO;
473
474 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
475
476 log_error("mknod(%s) failed: %m", dest);
477 if (r == 0)
478 r = -errno;
479 }
480 }
481
482 return r;
483 }
484
485 static int setup_dev_console(const char *dest, const char *console) {
486 struct stat st;
487 char _cleanup_free_ *to = NULL;
488 int r;
489 mode_t _cleanup_umask_ u;
490
491 assert(dest);
492 assert(console);
493
494 u = umask(0000);
495
496 if (stat(console, &st) < 0) {
497 log_error("Failed to stat %s: %m", console);
498 return -errno;
499
500 } else if (!S_ISCHR(st.st_mode)) {
501 log_error("/dev/console is not a char device");
502 return -EIO;
503 }
504
505 r = chmod_and_chown(console, 0600, 0, 0);
506 if (r < 0) {
507 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
508 return r;
509 }
510
511 if (asprintf(&to, "%s/dev/console", dest) < 0)
512 return log_oom();
513
514 /* We need to bind mount the right tty to /dev/console since
515 * ptys can only exist on pts file systems. To have something
516 * to bind mount things on we create a device node first, that
517 * has the right major/minor (note that the major minor
518 * doesn't actually matter here, since we mount it over
519 * anyway). */
520
521 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
522 log_error("mknod() for /dev/console failed: %m");
523 return -errno;
524 }
525
526 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
527 log_error("Bind mount for /dev/console failed: %m");
528 return -errno;
529 }
530
531 return 0;
532 }
533
534 static int setup_kmsg(const char *dest, int kmsg_socket) {
535 char _cleanup_free_ *from = NULL, *to = NULL;
536 int r, fd, k;
537 mode_t _cleanup_umask_ u;
538 union {
539 struct cmsghdr cmsghdr;
540 uint8_t buf[CMSG_SPACE(sizeof(int))];
541 } control;
542 struct msghdr mh;
543 struct cmsghdr *cmsg;
544
545 assert(dest);
546 assert(kmsg_socket >= 0);
547
548 u = umask(0000);
549
550 /* We create the kmsg FIFO as /dev/kmsg, but immediately
551 * delete it after bind mounting it to /proc/kmsg. While FIFOs
552 * on the reading side behave very similar to /proc/kmsg,
553 * their writing side behaves differently from /dev/kmsg in
554 * that writing blocks when nothing is reading. In order to
555 * avoid any problems with containers deadlocking due to this
556 * we simply make /dev/kmsg unavailable to the container. */
557 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
558 asprintf(&to, "%s/proc/kmsg", dest) < 0)
559 return log_oom();
560
561 if (mkfifo(from, 0600) < 0) {
562 log_error("mkfifo() for /dev/kmsg failed: %m");
563 return -errno;
564 }
565
566 r = chmod_and_chown(from, 0600, 0, 0);
567 if (r < 0) {
568 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
569 return r;
570 }
571
572 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
573 log_error("Bind mount for /proc/kmsg failed: %m");
574 return -errno;
575 }
576
577 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
578 if (fd < 0) {
579 log_error("Failed to open fifo: %m");
580 return -errno;
581 }
582
583 zero(mh);
584 zero(control);
585
586 mh.msg_control = &control;
587 mh.msg_controllen = sizeof(control);
588
589 cmsg = CMSG_FIRSTHDR(&mh);
590 cmsg->cmsg_level = SOL_SOCKET;
591 cmsg->cmsg_type = SCM_RIGHTS;
592 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
593 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
594
595 mh.msg_controllen = cmsg->cmsg_len;
596
597 /* Store away the fd in the socket, so that it stays open as
598 * long as we run the child */
599 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
600 close_nointr_nofail(fd);
601
602 if (k < 0) {
603 log_error("Failed to send FIFO fd: %m");
604 return -errno;
605 }
606
607 /* And now make the FIFO unavailable as /dev/kmsg... */
608 unlink(from);
609 return 0;
610 }
611
612 static int setup_hostname(void) {
613 char *hn;
614 int r = 0;
615
616 hn = path_get_file_name(arg_directory);
617 if (hn) {
618 hn = strdup(hn);
619 if (!hn)
620 return -ENOMEM;
621
622 hostname_cleanup(hn);
623
624 if (!isempty(hn))
625 if (sethostname(hn, strlen(hn)) < 0)
626 r = -errno;
627
628 free(hn);
629 }
630
631 return r;
632 }
633
634 static int setup_journal(const char *directory) {
635 sd_id128_t machine_id;
636 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
637 int r;
638
639 if (arg_link_journal == LINK_NO)
640 return 0;
641
642 p = strappend(directory, "/etc/machine-id");
643 if (!p) {
644 r = log_oom();
645 goto finish;
646 }
647
648 r = read_one_line_file(p, &b);
649 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
650 r = 0;
651 goto finish;
652 } else if (r < 0) {
653 log_error("Failed to read machine ID: %s", strerror(-r));
654 return r;
655 }
656
657 l = strstrip(b);
658 if (isempty(l) && arg_link_journal == LINK_AUTO) {
659 r = 0;
660 goto finish;
661 }
662
663 /* Verify validaty */
664 r = sd_id128_from_string(l, &machine_id);
665 if (r < 0) {
666 log_error("Failed to parse machine ID: %s", strerror(-r));
667 goto finish;
668 }
669
670 free(p);
671 p = strappend("/var/log/journal/", l);
672 q = strjoin(directory, "/var/log/journal/", l, NULL);
673 if (!p || !q) {
674 r = log_oom();
675 goto finish;
676 }
677
678 if (path_is_mount_point(p, false) > 0 ||
679 path_is_mount_point(q, false) > 0) {
680 if (arg_link_journal != LINK_AUTO) {
681 log_error("Journal already a mount point, refusing.");
682 r = -EEXIST;
683 goto finish;
684 }
685
686 r = 0;
687 goto finish;
688 }
689
690 r = readlink_and_make_absolute(p, &d);
691 if (r >= 0) {
692 if ((arg_link_journal == LINK_GUEST ||
693 arg_link_journal == LINK_AUTO) &&
694 path_equal(d, q)) {
695
696 mkdir_p(q, 0755);
697
698 r = 0;
699 goto finish;
700 }
701
702 if (unlink(p) < 0) {
703 log_error("Failed to remove symlink %s: %m", p);
704 r = -errno;
705 goto finish;
706 }
707 } else if (r == -EINVAL) {
708
709 if (arg_link_journal == LINK_GUEST &&
710 rmdir(p) < 0) {
711
712 if (errno == ENOTDIR)
713 log_error("%s already exists and is neither symlink nor directory.", p);
714 else {
715 log_error("Failed to remove %s: %m", p);
716 r = -errno;
717 }
718
719 goto finish;
720 }
721 } else if (r != -ENOENT) {
722 log_error("readlink(%s) failed: %m", p);
723 goto finish;
724 }
725
726 if (arg_link_journal == LINK_GUEST) {
727
728 if (symlink(q, p) < 0) {
729 log_error("Failed to symlink %s to %s: %m", q, p);
730 r = -errno;
731 goto finish;
732 }
733
734 mkdir_p(q, 0755);
735
736 r = 0;
737 goto finish;
738 }
739
740 if (arg_link_journal == LINK_HOST) {
741 r = mkdir_p(p, 0755);
742 if (r < 0) {
743 log_error("Failed to create %s: %m", p);
744 goto finish;
745 }
746
747 } else if (access(p, F_OK) < 0) {
748 r = 0;
749 goto finish;
750 }
751
752 if (dir_is_empty(q) == 0) {
753 log_error("%s not empty.", q);
754 r = -ENOTEMPTY;
755 goto finish;
756 }
757
758 r = mkdir_p(q, 0755);
759 if (r < 0) {
760 log_error("Failed to create %s: %m", q);
761 goto finish;
762 }
763
764 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765 log_error("Failed to bind mount journal from host into guest: %m");
766 r = -errno;
767 goto finish;
768 }
769
770 r = 0;
771
772 finish:
773 free(p);
774 free(q);
775 free(d);
776 free(b);
777 return r;
778
779 }
780
781 static int drop_capabilities(void) {
782 return capability_bounding_set_drop(~arg_retain, false);
783 }
784
/* Heuristic check whether "path" is the root of an OS tree.
 *
 * Returns 1 if <path>/bin/sh is accessible, 0 if it is not, and
 * -ENOMEM if the path to probe could not be allocated. */
static int is_os_tree(const char *path) {
        char *probe;
        int exists;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        exists = access(probe, F_OK) >= 0;
        free(probe);

        return exists;
}
798
/* Forwards data between our stdin/stdout and the pty "master" until a
 * signal ends the session.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched edge-triggered via epoll; the signals in "mask" are received
 * through a signalfd. Two LINE_MAX sized buffers hold data in flight:
 * in_buffer (stdin -> master) and out_buffer (master -> stdout).
 *
 * SIGWINCH makes us copy the window size of stdin to the master; any
 * other signal from "mask" ends the loop with return value 0. On
 * failure a negative errno-style value is returned. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signal fd is the only one watched level-triggered */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: only record what became ready. Because of
                 * EPOLLET the flags stay set until an actual read()/write()
                 * tells us otherwise. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal ends the session */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: shovel data for as long as any progress is possible */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF. Without clearing the flag the loop
                                         * condition above would stay true with an
                                         * empty buffer and we would spin forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* Same as for stdin: nothing more to read
                                         * right now, don't busy loop on it. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
1009
1010 int main(int argc, char *argv[]) {
1011 pid_t pid = 0;
1012 int r = EXIT_FAILURE, k;
1013 char *oldcg = NULL, *newcg = NULL;
1014 char **controller = NULL;
1015 int master = -1;
1016 const char *console = NULL;
1017 struct termios saved_attr, raw_attr;
1018 sigset_t mask;
1019 bool saved_attr_valid = false;
1020 struct winsize ws;
1021 int kmsg_socket_pair[2] = { -1, -1 };
1022
1023 log_parse_environment();
1024 log_open();
1025
1026 r = parse_argv(argc, argv);
1027 if (r <= 0)
1028 goto finish;
1029
1030 if (arg_directory) {
1031 char *p;
1032
1033 p = path_make_absolute_cwd(arg_directory);
1034 free(arg_directory);
1035 arg_directory = p;
1036 } else
1037 arg_directory = get_current_dir_name();
1038
1039 if (!arg_directory) {
1040 log_error("Failed to determine path");
1041 goto finish;
1042 }
1043
1044 path_kill_slashes(arg_directory);
1045
1046 if (geteuid() != 0) {
1047 log_error("Need to be root.");
1048 goto finish;
1049 }
1050
1051 if (sd_booted() <= 0) {
1052 log_error("Not running on a systemd system.");
1053 goto finish;
1054 }
1055
1056 if (path_equal(arg_directory, "/")) {
1057 log_error("Spawning container on root directory not supported.");
1058 goto finish;
1059 }
1060
1061 if (is_os_tree(arg_directory) <= 0) {
1062 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1063 goto finish;
1064 }
1065
1066 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1067 if (k < 0) {
1068 log_error("Failed to determine current cgroup: %s", strerror(-k));
1069 goto finish;
1070 }
1071
1072 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1073 log_error("Failed to allocate cgroup path.");
1074 goto finish;
1075 }
1076
1077 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1078 if (k < 0) {
1079 log_error("Failed to create cgroup: %s", strerror(-k));
1080 goto finish;
1081 }
1082
1083 STRV_FOREACH(controller, arg_controllers) {
1084 k = cg_create_and_attach(*controller, newcg, 0);
1085 if (k < 0)
1086 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1087 }
1088
1089 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1090 if (master < 0) {
1091 log_error("Failed to acquire pseudo tty: %m");
1092 goto finish;
1093 }
1094
1095 console = ptsname(master);
1096 if (!console) {
1097 log_error("Failed to determine tty name: %m");
1098 goto finish;
1099 }
1100
1101 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1102
1103 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1104 ioctl(master, TIOCSWINSZ, &ws);
1105
1106 if (unlockpt(master) < 0) {
1107 log_error("Failed to unlock tty: %m");
1108 goto finish;
1109 }
1110
1111 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1112 log_error("Failed to get terminal attributes: %m");
1113 goto finish;
1114 }
1115
1116 saved_attr_valid = true;
1117
1118 raw_attr = saved_attr;
1119 cfmakeraw(&raw_attr);
1120 raw_attr.c_lflag &= ~ECHO;
1121
1122 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1123 log_error("Failed to create kmsg socket pair");
1124 goto finish;
1125 }
1126
1127 assert_se(sigemptyset(&mask) == 0);
1128 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1129 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1130
1131 for (;;) {
1132 siginfo_t status;
1133
1134 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1135 log_error("Failed to set terminal attributes: %m");
1136 goto finish;
1137 }
1138
1139 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1140 if (pid < 0) {
1141 if (errno == EINVAL)
1142 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1143 else
1144 log_error("clone() failed: %m");
1145
1146 goto finish;
1147 }
1148
1149 if (pid == 0) {
1150 /* child */
1151
1152 const char *home = NULL;
1153 uid_t uid = (uid_t) -1;
1154 gid_t gid = (gid_t) -1;
1155 const char *envp[] = {
1156 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1157 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1158 NULL, /* TERM */
1159 NULL, /* HOME */
1160 NULL, /* USER */
1161 NULL, /* LOGNAME */
1162 NULL, /* container_uuid */
1163 NULL
1164 };
1165
1166 envp[2] = strv_find_prefix(environ, "TERM=");
1167
1168 close_nointr_nofail(master);
1169
1170 close_nointr(STDIN_FILENO);
1171 close_nointr(STDOUT_FILENO);
1172 close_nointr(STDERR_FILENO);
1173
1174 close_all_fds(&kmsg_socket_pair[1], 1);
1175
1176 reset_all_signal_handlers();
1177
1178 assert_se(sigemptyset(&mask) == 0);
1179 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1180
1181 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1182 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1183 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1184 goto child_fail;
1185
1186 if (setsid() < 0) {
1187 log_error("setsid() failed: %m");
1188 goto child_fail;
1189 }
1190
1191 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1192 log_error("PR_SET_PDEATHSIG failed: %m");
1193 goto child_fail;
1194 }
1195
1196 /* Mark everything as slave, so that we still
1197 * receive mounts from the real root, but don't
1198 * propagate mounts to the real root. */
1199 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1200 log_error("MS_SLAVE|MS_REC failed: %m");
1201 goto child_fail;
1202 }
1203
1204 /* Turn directory into bind mount */
1205 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1206 log_error("Failed to make bind mount.");
1207 goto child_fail;
1208 }
1209
1210 if (arg_read_only)
1211 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1212 log_error("Failed to make read-only.");
1213 goto child_fail;
1214 }
1215
1216 if (mount_all(arg_directory) < 0)
1217 goto child_fail;
1218
1219 if (copy_devnodes(arg_directory) < 0)
1220 goto child_fail;
1221
1222 dev_setup(arg_directory);
1223
1224 if (setup_dev_console(arg_directory, console) < 0)
1225 goto child_fail;
1226
1227 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1228 goto child_fail;
1229
1230 close_nointr_nofail(kmsg_socket_pair[1]);
1231
1232 if (setup_boot_id(arg_directory) < 0)
1233 goto child_fail;
1234
1235 if (setup_timezone(arg_directory) < 0)
1236 goto child_fail;
1237
1238 if (setup_resolv_conf(arg_directory) < 0)
1239 goto child_fail;
1240
1241 if (setup_journal(arg_directory) < 0)
1242 goto child_fail;
1243
1244 if (chdir(arg_directory) < 0) {
1245 log_error("chdir(%s) failed: %m", arg_directory);
1246 goto child_fail;
1247 }
1248
1249 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1250 log_error("mount(MS_MOVE) failed: %m");
1251 goto child_fail;
1252 }
1253
1254 if (chroot(".") < 0) {
1255 log_error("chroot() failed: %m");
1256 goto child_fail;
1257 }
1258
1259 if (chdir("/") < 0) {
1260 log_error("chdir() failed: %m");
1261 goto child_fail;
1262 }
1263
1264 umask(0022);
1265
1266 loopback_setup();
1267
1268 if (drop_capabilities() < 0) {
1269 log_error("drop_capabilities() failed: %m");
1270 goto child_fail;
1271 }
1272
1273 if (arg_user) {
1274
1275 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1276 log_error("get_user_creds() failed: %m");
1277 goto child_fail;
1278 }
1279
1280 if (mkdir_parents_label(home, 0775) < 0) {
1281 log_error("mkdir_parents_label() failed: %m");
1282 goto child_fail;
1283 }
1284
1285 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1286 log_error("mkdir_safe_label() failed: %m");
1287 goto child_fail;
1288 }
1289
1290 if (initgroups((const char*)arg_user, gid) < 0) {
1291 log_error("initgroups() failed: %m");
1292 goto child_fail;
1293 }
1294
1295 if (setresgid(gid, gid, gid) < 0) {
1296 log_error("setregid() failed: %m");
1297 goto child_fail;
1298 }
1299
1300 if (setresuid(uid, uid, uid) < 0) {
1301 log_error("setreuid() failed: %m");
1302 goto child_fail;
1303 }
1304 }
1305
1306 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1307 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1308 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1309 log_oom();
1310 goto child_fail;
1311 }
1312
1313 if (arg_uuid) {
1314 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1315 log_oom();
1316 goto child_fail;
1317 }
1318 }
1319
1320 setup_hostname();
1321
1322 if (arg_boot) {
1323 char **a;
1324 size_t l;
1325
1326 /* Automatically search for the init system */
1327
1328 l = 1 + argc - optind;
1329 a = newa(char*, l + 1);
1330 memcpy(a + 1, argv + optind, l * sizeof(char*));
1331
1332 a[0] = (char*) "/usr/lib/systemd/systemd";
1333 execve(a[0], a, (char**) envp);
1334
1335 a[0] = (char*) "/lib/systemd/systemd";
1336 execve(a[0], a, (char**) envp);
1337
1338 a[0] = (char*) "/sbin/init";
1339 execve(a[0], a, (char**) envp);
1340 } else if (argc > optind)
1341 execvpe(argv[optind], argv + optind, (char**) envp);
1342 else {
1343 chdir(home ? home : "/root");
1344 execle("/bin/bash", "-bash", NULL, (char**) envp);
1345 }
1346
1347 log_error("execv() failed: %m");
1348
1349 child_fail:
1350 _exit(EXIT_FAILURE);
1351 }
1352
1353 if (process_pty(master, &mask) < 0)
1354 goto finish;
1355
1356
1357 if (saved_attr_valid)
1358 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1359
1360 r = wait_for_terminate(pid, &status);
1361 if (r < 0) {
1362 r = EXIT_FAILURE;
1363 break;
1364 }
1365
1366 if (status.si_code == CLD_EXITED) {
1367 if (status.si_status != 0) {
1368 log_error("Container failed with error code %i.", status.si_status);
1369 r = status.si_status;
1370 break;
1371 }
1372
1373 log_debug("Container exited successfully.");
1374 break;
1375 } else if (status.si_code == CLD_KILLED &&
1376 status.si_status == SIGINT) {
1377 log_info("Container has been shut down.");
1378 r = 0;
1379 break;
1380 } else if (status.si_code == CLD_KILLED &&
1381 status.si_status == SIGHUP) {
1382 log_info("Container is being rebooted.");
1383 continue;
1384 } else if (status.si_code == CLD_KILLED ||
1385 status.si_code == CLD_DUMPED) {
1386
1387 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1388 r = EXIT_FAILURE;
1389 break;
1390 } else {
1391 log_error("Container failed due to unknown reason.");
1392 r = EXIT_FAILURE;
1393 break;
1394 }
1395 }
1396
1397 finish:
1398 if (saved_attr_valid)
1399 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1400
1401 if (master >= 0)
1402 close_nointr_nofail(master);
1403
1404 close_pipe(kmsg_socket_pair);
1405
1406 if (oldcg)
1407 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1408
1409 if (newcg)
1410 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1411
1412 free(arg_directory);
1413 strv_free(arg_controllers);
1414 free(oldcg);
1415 free(newcg);
1416
1417 return r;
1418 }