]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
main: add configuration option to alter capability bounding set for PID 1
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55
/* Command line settings. All of them start out unset and are filled in by
 * parse_argv(). */

/* -D/--directory, -u/--user and --uuid respectively. */
static char *arg_directory = NULL, *arg_user = NULL, *arg_uuid = NULL;

/* -C/--controllers, split on "," into a string vector. */
static char **arg_controllers = NULL;

/* -b/--boot, --private-network and --read-only switches. */
static bool arg_boot = false, arg_private_network = false, arg_read_only = false;
63
/* Print the usage text to stdout. Always returns 0. */
static int help(void) {

        /* Only the first line depends on how the program was invoked... */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);

        /* ...the rest is fixed text. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              " -h --help Show this help\n"
              " -D --directory=NAME Root directory for the container\n"
              " -b --boot Boot up full system (i.e. invoke init)\n"
              " -u --user=USER Run the command under specified user or uid\n"
              " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              " --uuid=UUID Set a specific machine UUID for the container\n"
              " --private-network Disable network in container\n"
              " --read-only Mount the root directory read-only\n",
              stdout);

        return 0;
}
80
81 static int parse_argv(int argc, char *argv[]) {
82
83 enum {
84 ARG_PRIVATE_NETWORK = 0x100,
85 ARG_UUID,
86 ARG_READ_ONLY
87 };
88
89 static const struct option options[] = {
90 { "help", no_argument, NULL, 'h' },
91 { "directory", required_argument, NULL, 'D' },
92 { "user", required_argument, NULL, 'u' },
93 { "controllers", required_argument, NULL, 'C' },
94 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
95 { "boot", no_argument, NULL, 'b' },
96 { "uuid", required_argument, NULL, ARG_UUID },
97 { "read-only", no_argument, NULL, ARG_READ_ONLY },
98 { NULL, 0, NULL, 0 }
99 };
100
101 int c;
102
103 assert(argc >= 0);
104 assert(argv);
105
106 while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
107
108 switch (c) {
109
110 case 'h':
111 help();
112 return 0;
113
114 case 'D':
115 free(arg_directory);
116 arg_directory = canonicalize_file_name(optarg);
117 if (!arg_directory) {
118 log_error("Failed to canonicalize root directory.");
119 return -ENOMEM;
120 }
121
122 break;
123
124 case 'u':
125 free(arg_user);
126 if (!(arg_user = strdup(optarg))) {
127 log_error("Failed to duplicate user name.");
128 return -ENOMEM;
129 }
130
131 break;
132
133 case 'C':
134 strv_free(arg_controllers);
135 arg_controllers = strv_split(optarg, ",");
136 if (!arg_controllers) {
137 log_error("Failed to split controllers list.");
138 return -ENOMEM;
139 }
140 strv_uniq(arg_controllers);
141
142 break;
143
144 case ARG_PRIVATE_NETWORK:
145 arg_private_network = true;
146 break;
147
148 case 'b':
149 arg_boot = true;
150 break;
151
152 case ARG_UUID:
153 arg_uuid = optarg;
154 break;
155
156 case ARG_READ_ONLY:
157 arg_read_only = true;
158 break;
159
160 case '?':
161 return -EINVAL;
162
163 default:
164 log_error("Unknown option code %c", c);
165 return -EINVAL;
166 }
167 }
168
169 return 1;
170 }
171
/* Mount the fixed set of API file systems below the container root "dest".
 *
 * Every table entry is attempted; the first error encountered is what gets
 * returned (0 if there was none). Failures of entries not marked fatal are
 * ignored silently. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        const MountPoint *entry;
        int first_error = 0;

        for (entry = mount_table; entry < mount_table + ELEMENTSOF(mount_table); entry++) {
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        /* No point in trying the remaining entries without memory. */
                        break;
                }

                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (first_error == 0)
                                first_error = probe;

                        continue;
                }

                /* Best effort; if this fails the mount() below will tell. */
                mkdir_p(target, 0755);

                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (first_error == 0)
                                first_error = -errno;
                }

                free(target);
        }

        return first_error;
}
245
/* Make the host's time zone files visible inside the container tree at
 * "dest" via read-only bind mounts. Mount failures are ignored ("if
 * possible"); only running out of memory is reported, as -ENOMEM. */
static int setup_timezone(const char *dest) {

        static const char * const files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        size_t i;

        assert(dest);

        for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
                char *target;

                /* files[i] starts with a slash, so this yields <dest>/etc/... */
                if (asprintf(&target, "%s%s", dest, files[i]) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* Bind first, then remount that bind read-only. */
                if (mount(files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
274
275 static int setup_resolv_conf(const char *dest) {
276 char *where;
277
278 assert(dest);
279
280 if (arg_private_network)
281 return 0;
282
283 /* Fix resolv.conf, if possible */
284 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
285 log_error("Out of memory");
286 return -ENOMEM;
287 }
288
289 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
290 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
291
292 free(where);
293
294 return 0;
295 }
296
297 static int copy_devnodes(const char *dest) {
298
299 static const char devnodes[] =
300 "null\0"
301 "zero\0"
302 "full\0"
303 "random\0"
304 "urandom\0"
305 "tty\0"
306 "ptmx\0"
307 "rtc0\0";
308
309 const char *d;
310 int r = 0;
311 mode_t u;
312
313 assert(dest);
314
315 u = umask(0000);
316
317 NULSTR_FOREACH(d, devnodes) {
318 struct stat st;
319 char *from = NULL, *to = NULL;
320
321 asprintf(&from, "/dev/%s", d);
322 asprintf(&to, "%s/dev/%s", dest, d);
323
324 if (!from || !to) {
325 log_error("Failed to allocate devnode path");
326
327 free(from);
328 free(to);
329
330 from = to = NULL;
331
332 if (r == 0)
333 r = -ENOMEM;
334
335 break;
336 }
337
338 if (stat(from, &st) < 0) {
339
340 if (errno != ENOENT) {
341 log_error("Failed to stat %s: %m", from);
342 if (r == 0)
343 r = -errno;
344 }
345
346 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
347
348 log_error("%s is not a char or block device, cannot copy.", from);
349 if (r == 0)
350 r = -EIO;
351
352 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
353
354 log_error("mknod(%s) failed: %m", dest);
355 if (r == 0)
356 r = -errno;
357 }
358
359 free(from);
360 free(to);
361 }
362
363 umask(u);
364
365 return r;
366 }
367
/* Make the tty "console" appear as <dest>/dev/console.
 *
 * Returns a negative errno-style code on failure. On success the
 * (non-negative) result of chmod_and_chown() is what gets returned. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int ret;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                ret = -errno;
                goto out;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                ret = -EIO;
                goto out;
        }

        ret = chmod_and_chown(console, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-ret));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto out;
        }

        /* ptys can only exist on pts file systems, so the right tty has to
         * be bind mounted onto /dev/console. The bind mount needs something
         * to land on, hence a device node is created first. Its major/minor
         * is irrelevant, it is mounted over right away. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                ret = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                ret = -errno;
        }

out:
        free(node);
        umask(saved_umask);

        return ret;
}
427
/* Give the container a /proc/kmsg backed by a FIFO.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted onto
 * <dest>/proc/kmsg, opened, and the open fd is queued on "kmsg_socket" so
 * that it stays referenced. Afterwards the /dev/kmsg name is removed again.
 *
 * Returns a negative errno-style code on failure. On success the
 * (non-negative) result of chmod_and_chown() is what gets returned. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int ret, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* On the reading side a FIFO behaves much like /proc/kmsg. On the
         * writing side it does not behave like /dev/kmsg though: writes
         * block while nobody reads. To keep containers from deadlocking on
         * that, the FIFO is only exposed as /proc/kmsg, and /dev/kmsg is
         * made unavailable again once the bind mount is in place. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0 ||
            asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto out;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                ret = -errno;
                goto out;
        }

        ret = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-ret));
                goto out;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                ret = -errno;
                goto out;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                ret = -errno;
                goto out;
        }

        /* Build a message that carries nothing but the fd (SCM_RIGHTS). */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long as
         * the child runs. Our own copy is not needed after that. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                ret = -errno;
                goto out;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

out:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return ret;
}
523
524 static int setup_hostname(void) {
525 char *hn;
526 int r = 0;
527
528 hn = path_get_file_name(arg_directory);
529 if (hn) {
530 hn = strdup(hn);
531 if (!hn)
532 return -ENOMEM;
533
534 hostname_cleanup(hn);
535
536 if (!isempty(hn))
537 if (sethostname(hn, strlen(hn)) < 0)
538 r = -errno;
539
540 free(hn);
541 }
542
543 return r;
544 }
545
/* Build the mask of capabilities listed below and hand its complement to
 * capability_bounding_set_drop() -- i.e., presumably, everything that is not
 * listed here is removed from the bounding set. Returns whatever that call
 * returns. */
static int drop_capabilities(void) {

        static const int keep[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        uint64_t retain = 0;
        size_t i;

        for (i = 0; i < sizeof(keep) / sizeof(keep[0]); i++)
                retain |= 1ULL << keep[i];

        return capability_bounding_set_drop(~retain, false);
}
573
/* Check whether "path" looks like the root of an OS tree. The presence of
 * <path>/bin/sh serves as the flag for that.
 *
 * Returns 1 if it exists, 0 if it cannot be accessed, -ENOMEM if the path
 * could not be allocated. */
static int is_os_tree(const char *path) {
        char *probe;
        int found;

        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
587
/* Shovel data between our stdin/stdout and the pty "master" until a signal
 * arrives.
 *
 * The signals in "mask" are received through a signalfd. SIGWINCH is handled
 * by copying the window size of stdin to the master; any other signal from
 * the mask ends the loop with return value 0. On error a negative
 * errno-style code is returned.
 *
 * stdin, stdout and the master are watched edge-triggered, so readiness is
 * remembered in the four *_readable/*_writable flags and only cleared again
 * when an operation actually reports that it cannot make progress. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is the only level-triggered source. */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: record what became ready and handle signals. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal from the mask ends the forwarding. */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as at least one of the
                 * four directions can make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF. There is nothing more to read
                                         * right now. Without clearing the flag
                                         * the buffer would stay empty, the loop
                                         * condition would stay true and we
                                         * would spin here forever, never
                                         * getting back to epoll_wait() and the
                                         * signals. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front. */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, same reasoning as for stdin above. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
790
791 int main(int argc, char *argv[]) {
792 pid_t pid = 0;
793 int r = EXIT_FAILURE, k;
794 char *oldcg = NULL, *newcg = NULL;
795 char **controller = NULL;
796 int master = -1;
797 const char *console = NULL;
798 struct termios saved_attr, raw_attr;
799 sigset_t mask;
800 bool saved_attr_valid = false;
801 struct winsize ws;
802 int kmsg_socket_pair[2] = { -1, -1 };
803
804 log_parse_environment();
805 log_open();
806
807 if ((r = parse_argv(argc, argv)) <= 0)
808 goto finish;
809
810 if (arg_directory) {
811 char *p;
812
813 p = path_make_absolute_cwd(arg_directory);
814 free(arg_directory);
815 arg_directory = p;
816 } else
817 arg_directory = get_current_dir_name();
818
819 if (!arg_directory) {
820 log_error("Failed to determine path");
821 goto finish;
822 }
823
824 path_kill_slashes(arg_directory);
825
826 if (geteuid() != 0) {
827 log_error("Need to be root.");
828 goto finish;
829 }
830
831 if (sd_booted() <= 0) {
832 log_error("Not running on a systemd system.");
833 goto finish;
834 }
835
836 if (path_equal(arg_directory, "/")) {
837 log_error("Spawning container on root directory not supported.");
838 goto finish;
839 }
840
841 if (is_os_tree(arg_directory) <= 0) {
842 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
843 goto finish;
844 }
845
846 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
847 log_error("Failed to determine current cgroup: %s", strerror(-k));
848 goto finish;
849 }
850
851 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
852 log_error("Failed to allocate cgroup path.");
853 goto finish;
854 }
855
856 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
857 if (k < 0) {
858 log_error("Failed to create cgroup: %s", strerror(-k));
859 goto finish;
860 }
861
862 STRV_FOREACH(controller,arg_controllers) {
863 k = cg_create_and_attach(*controller, newcg, 0);
864 if (k < 0)
865 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
866 }
867
868 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
869 log_error("Failed to acquire pseudo tty: %m");
870 goto finish;
871 }
872
873 if (!(console = ptsname(master))) {
874 log_error("Failed to determine tty name: %m");
875 goto finish;
876 }
877
878 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
879
880 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
881 ioctl(master, TIOCSWINSZ, &ws);
882
883 if (unlockpt(master) < 0) {
884 log_error("Failed to unlock tty: %m");
885 goto finish;
886 }
887
888 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
889 log_error("Failed to get terminal attributes: %m");
890 goto finish;
891 }
892
893 saved_attr_valid = true;
894
895 raw_attr = saved_attr;
896 cfmakeraw(&raw_attr);
897 raw_attr.c_lflag &= ~ECHO;
898
899 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
900 log_error("Failed to set terminal attributes: %m");
901 goto finish;
902 }
903
904 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
905 log_error("Failed to create kmsg socket pair");
906 goto finish;
907 }
908
909 assert_se(sigemptyset(&mask) == 0);
910 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
911 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
912
913 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
914 if (pid < 0) {
915 if (errno == EINVAL)
916 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
917 else
918 log_error("clone() failed: %m");
919
920 goto finish;
921 }
922
923 if (pid == 0) {
924 /* child */
925
926 const char *home = NULL;
927 uid_t uid = (uid_t) -1;
928 gid_t gid = (gid_t) -1;
929 const char *envp[] = {
930 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
931 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
932 NULL, /* TERM */
933 NULL, /* HOME */
934 NULL, /* USER */
935 NULL, /* LOGNAME */
936 NULL, /* container_uuid */
937 NULL
938 };
939
940 envp[2] = strv_find_prefix(environ, "TERM=");
941
942 close_nointr_nofail(master);
943
944 close_nointr(STDIN_FILENO);
945 close_nointr(STDOUT_FILENO);
946 close_nointr(STDERR_FILENO);
947
948 close_all_fds(&kmsg_socket_pair[1], 1);
949
950 reset_all_signal_handlers();
951
952 assert_se(sigemptyset(&mask) == 0);
953 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
954
955 if (setsid() < 0)
956 goto child_fail;
957
958 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
959 goto child_fail;
960
961 /* Mark / as private, in case somebody marked it shared */
962 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
963 goto child_fail;
964
965 /* Turn directory into bind mount */
966 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
967 log_error("Failed to make bind mount.");
968 goto child_fail;
969 }
970
971 if (arg_read_only)
972 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
973 log_error("Failed to make read-only.");
974 goto child_fail;
975 }
976
977 if (mount_all(arg_directory) < 0)
978 goto child_fail;
979
980 if (copy_devnodes(arg_directory) < 0)
981 goto child_fail;
982
983 if (setup_dev_console(arg_directory, console) < 0)
984 goto child_fail;
985
986 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
987 goto child_fail;
988
989 close_nointr_nofail(kmsg_socket_pair[1]);
990
991 if (setup_timezone(arg_directory) < 0)
992 goto child_fail;
993
994 if (setup_resolv_conf(arg_directory) < 0)
995 goto child_fail;
996
997 if (chdir(arg_directory) < 0) {
998 log_error("chdir(%s) failed: %m", arg_directory);
999 goto child_fail;
1000 }
1001
1002 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1003 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1004 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1005 goto child_fail;
1006
1007 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1008 log_error("mount(MS_BIND) failed: %m");
1009 goto child_fail;
1010 }
1011
1012 if (chroot(".") < 0) {
1013 log_error("chroot() failed: %m");
1014 goto child_fail;
1015 }
1016
1017 if (chdir("/") < 0) {
1018 log_error("chdir() failed: %m");
1019 goto child_fail;
1020 }
1021
1022 umask(0022);
1023
1024 loopback_setup();
1025
1026 if (drop_capabilities() < 0) {
1027 log_error("drop_capabilities() failed: %m");
1028 goto child_fail;
1029 }
1030
1031 if (arg_user) {
1032
1033 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1034 log_error("get_user_creds() failed: %m");
1035 goto child_fail;
1036 }
1037
1038 if (mkdir_parents(home, 0775) < 0) {
1039 log_error("mkdir_parents() failed: %m");
1040 goto child_fail;
1041 }
1042
1043 if (safe_mkdir(home, 0775, uid, gid) < 0) {
1044 log_error("safe_mkdir() failed: %m");
1045 goto child_fail;
1046 }
1047
1048 if (initgroups((const char*)arg_user, gid) < 0) {
1049 log_error("initgroups() failed: %m");
1050 goto child_fail;
1051 }
1052
1053 if (setresgid(gid, gid, gid) < 0) {
1054 log_error("setregid() failed: %m");
1055 goto child_fail;
1056 }
1057
1058 if (setresuid(uid, uid, uid) < 0) {
1059 log_error("setreuid() failed: %m");
1060 goto child_fail;
1061 }
1062 }
1063
1064 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1065 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1066 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1067 log_error("Out of memory");
1068 goto child_fail;
1069 }
1070
1071 if (arg_uuid) {
1072 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1073 log_error("Out of memory");
1074 goto child_fail;
1075 }
1076 }
1077
1078 setup_hostname();
1079
1080 if (arg_boot) {
1081 char **a;
1082 size_t l;
1083
1084 /* Automatically search for the init system */
1085
1086 l = 1 + argc - optind;
1087 a = newa(char*, l + 1);
1088 memcpy(a + 1, argv + optind, l * sizeof(char*));
1089
1090 a[0] = (char*) "/usr/lib/systemd/systemd";
1091 execve(a[0], a, (char**) envp);
1092
1093 a[0] = (char*) "/lib/systemd/systemd";
1094 execve(a[0], a, (char**) envp);
1095
1096 a[0] = (char*) "/sbin/init";
1097 execve(a[0], a, (char**) envp);
1098 } else if (argc > optind)
1099 execvpe(argv[optind], argv + optind, (char**) envp);
1100 else {
1101 chdir(home ? home : "/root");
1102 execle("/bin/bash", "-bash", NULL, (char**) envp);
1103 }
1104
1105 log_error("execv() failed: %m");
1106
1107 child_fail:
1108 _exit(EXIT_FAILURE);
1109 }
1110
1111 if (process_pty(master, &mask) < 0)
1112 goto finish;
1113
1114 if (saved_attr_valid) {
1115 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1116 saved_attr_valid = false;
1117 }
1118
1119 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1120
1121 if (r < 0)
1122 r = EXIT_FAILURE;
1123
1124 finish:
1125 if (saved_attr_valid)
1126 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1127
1128 if (master >= 0)
1129 close_nointr_nofail(master);
1130
1131 close_pipe(kmsg_socket_pair);
1132
1133 if (oldcg)
1134 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1135
1136 if (newcg)
1137 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1138
1139 free(arg_directory);
1140 strv_free(arg_controllers);
1141 free(oldcg);
1142 free(newcg);
1143
1144 return r;
1145 }