]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
7050c05ec2be3e059a9b8da3f29945443744b26c
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command line settings; parse_argv() fills these in and main() consumes them. */

/* -b / --boot: invoke an init binary instead of a shell or command */
static bool arg_boot = false;

/* --private-network: main() adds CLONE_NEWNET to the clone flags */
static bool arg_private_network = false;

/* -C / --controllers: additional cgroup hierarchies to create the container's group in */
static char **arg_controllers = NULL;

/* -u / --user: user name or uid to run the payload as */
static char *arg_user = NULL;

/* -D / --directory: root directory of the container */
static char *arg_directory = NULL;
60
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {
        const char *self = program_invocation_short_name;

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " -D --directory=NAME Root directory for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
               " --private-network Disable network in container\n",
               self);

        return 0;
}
75
76 static int parse_argv(int argc, char *argv[]) {
77
78 enum {
79 ARG_PRIVATE_NETWORK = 0x100
80 };
81
82 static const struct option options[] = {
83 { "help", no_argument, NULL, 'h' },
84 { "directory", required_argument, NULL, 'D' },
85 { "user", required_argument, NULL, 'u' },
86 { "controllers", required_argument, NULL, 'C' },
87 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
88 { "boot", no_argument, NULL, 'b' },
89 { NULL, 0, NULL, 0 }
90 };
91
92 int c;
93
94 assert(argc >= 0);
95 assert(argv);
96
97 while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
98
99 switch (c) {
100
101 case 'h':
102 help();
103 return 0;
104
105 case 'D':
106 free(arg_directory);
107 arg_directory = canonicalize_file_name(optarg);
108 if (!arg_directory) {
109 log_error("Failed to canonicalize root directory.");
110 return -ENOMEM;
111 }
112
113 break;
114
115 case 'u':
116 free(arg_user);
117 if (!(arg_user = strdup(optarg))) {
118 log_error("Failed to duplicate user name.");
119 return -ENOMEM;
120 }
121
122 break;
123
124 case 'C':
125 strv_free(arg_controllers);
126 arg_controllers = strv_split(optarg, ",");
127 if (!arg_controllers) {
128 log_error("Failed to split controllers list.");
129 return -ENOMEM;
130 }
131 strv_uniq(arg_controllers);
132
133 break;
134
135 case ARG_PRIVATE_NETWORK:
136 arg_private_network = true;
137 break;
138
139 case 'b':
140 arg_boot = true;
141 break;
142
143 case '?':
144 return -EINVAL;
145
146 default:
147 log_error("Unknown option code %c", c);
148 return -EINVAL;
149 }
150 }
151
152 return 1;
153 }
154
/*
 * Walks the built-in mount table and performs each mount below the container
 * root 'dest'. Bind mounts that should end up read-only are listed twice:
 * once as the plain bind mount and once as the read-only remount.
 *
 * A failing entry does not stop the walk (only an allocation failure does).
 * Returns 0, or the first negative errno-style error that was recorded;
 * mount() failures of entries not marked fatal are ignored.
 */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true      },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND, true                       },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false },                      /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#endif
        };

        int first_error = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                bool mount_failed;
                int q;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (first_error == 0)
                                first_error = q;

                        continue;
                }

                /* Best effort: if this fails the mount() below will report it */
                mkdir_p(target, 0755);

                /* The mount is attempted for every entry; 'fatal' only decides
                 * whether a failure is reported. */
                mount_failed = mount(entry->what, target, entry->type, entry->flags, entry->options) < 0;

                if (mount_failed && entry->fatal) {
                        log_error("mount(%s) failed: %m", target);

                        if (first_error == 0)
                                first_error = -errno;
                }

                free(target);
        }

        return first_error;
}
228
/*
 * Makes the host's /etc/localtime and /etc/timezone visible inside the
 * container at 'dest' via read-only bind mounts. Mount failures are
 * ignored on purpose; only running out of memory is reported (-ENOMEM).
 */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0)
                goto oom;

        /* Bind first, then remount the bind read-only */
        if (mount("/etc/localtime", target, "bind", MS_BIND, NULL) >= 0)
                mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0)
                goto oom;

        if (mount("/etc/timezone", target, "bind", MS_BIND, NULL) >= 0)
                mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        return 0;

oom:
        log_error("Out of memory");
        return -ENOMEM;
}
257
258 static int copy_devnodes(const char *dest) {
259
260 static const char devnodes[] =
261 "null\0"
262 "zero\0"
263 "full\0"
264 "random\0"
265 "urandom\0"
266 "tty\0"
267 "ptmx\0"
268 "rtc0\0";
269
270 const char *d;
271 int r = 0;
272 mode_t u;
273
274 assert(dest);
275
276 u = umask(0000);
277
278 NULSTR_FOREACH(d, devnodes) {
279 struct stat st;
280 char *from = NULL, *to = NULL;
281
282 asprintf(&from, "/dev/%s", d);
283 asprintf(&to, "%s/dev/%s", dest, d);
284
285 if (!from || !to) {
286 log_error("Failed to allocate devnode path");
287
288 free(from);
289 free(to);
290
291 from = to = NULL;
292
293 if (r == 0)
294 r = -ENOMEM;
295
296 break;
297 }
298
299 if (stat(from, &st) < 0) {
300
301 if (errno != ENOENT) {
302 log_error("Failed to stat %s: %m", from);
303 if (r == 0)
304 r = -errno;
305 }
306
307 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
308
309 log_error("%s is not a char or block device, cannot copy.", from);
310 if (r == 0)
311 r = -EIO;
312
313 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
314
315 log_error("mknod(%s) failed: %m", dest);
316 if (r == 0)
317 r = -errno;
318 }
319
320 free(from);
321 free(to);
322 }
323
324 umask(u);
325
326 return r;
327 }
328
/*
 * Makes the terminal device 'console' available as <dest>/dev/console.
 *
 * The device is first set to mode 0600 owned by uid/gid 0, then a character
 * device node is created at the target path and 'console' is bind mounted
 * over it. Returns a negative errno-style value on failure; otherwise the
 * (non-negative) result of chmod_and_chown().
 */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* A pty can only live on a pts file system, so the right tty has
         * to be bind mounted onto /dev/console. The bind mount needs
         * something to land on, hence the device node created here; its
         * major/minor is irrelevant because it gets mounted over right
         * away. */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
388
/*
 * Gives the container a /proc/kmsg that is backed by a FIFO.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted to <dest>/proc/kmsg,
 * opened, and the open descriptor is sent over 'kmsg_socket' as an
 * SCM_RIGHTS message so that it stays referenced. Finally the /dev/kmsg
 * name is unlinked again.
 *
 * Returns a negative errno-style value on failure; otherwise the
 * (non-negative) result of chmod_and_chown().
 */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created under the name /dev/kmsg, but that name
         * is removed again right after the FIFO has been bind mounted to
         * /proc/kmsg. Reading from a FIFO is close enough to reading
         * /proc/kmsg, but writing is not like writing to /dev/kmsg: a
         * writer blocks while nobody reads. To keep containers from
         * deadlocking on that, /dev/kmsg is simply not offered to them. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the descriptor */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long
         * as the child runs; our own copy is no longer needed. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
484
485 static int setup_hostname(void) {
486 char *hn;
487 int r = 0;
488
489 hn = file_name_from_path(arg_directory);
490 if (hn) {
491 hn = strdup(hn);
492 if (!hn)
493 return -ENOMEM;
494
495 hostname_cleanup(hn);
496
497 if (!isempty(hn))
498 if (sethostname(hn, strlen(hn)) < 0)
499 r = -errno;
500
501 free(hn);
502 }
503
504 return r;
505 }
506
/*
 * Removes every capability from 0 up to cap_last_cap() from the bounding
 * set, except for those on the retain list below.
 *
 * Returns 0 on success, or -errno as soon as one PR_CAPBSET_DROP fails.
 */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned j;

                /* Is this capability on the keep list? */
                for (j = 0; j < ELEMENTSOF(retain) && !keep; j++)
                        keep = retain[j] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
552
/*
 * Heuristic check whether 'path' is the root of an OS tree: the presence
 * of <path>/bin/sh serves as the marker.
 *
 * Returns 1 if the marker exists, 0 if access() fails for it, and -ENOMEM
 * if the path string could not be allocated.
 */
static int is_os_tree(const char *path) {
        char *marker;
        int present;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        present = access(marker, F_OK) >= 0;
        free(marker);

        return present;
}
566
/*
 * Shovels data between our stdin/stdout and the pty 'master' until a signal
 * from 'mask' other than SIGWINCH arrives.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched edge-triggered through epoll; the signals in 'mask' are received
 * through a signalfd. SIGWINCH is handled by copying the window size of
 * stdin to the master. Any other signal ends the loop with a return value
 * of 0. Errors are returned as negative errno-style values.
 *
 * Two LINE_MAX sized buffers are used, one per direction. The
 * *_readable/*_writable flags remember edge-triggered readiness until the
 * corresponding call reports that the fd is exhausted.
 */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signal fd is level-triggered: one signal is read per wakeup */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: record what became ready and handle signals */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal from the mask ends the forwarding */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction can
                 * make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file: nothing more will
                                         * arrive. Without clearing the flag
                                         * the loop condition above would stay
                                         * true and we would spin forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file on the pty, see above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
769
770 int main(int argc, char *argv[]) {
771 pid_t pid = 0;
772 int r = EXIT_FAILURE, k;
773 char *oldcg = NULL, *newcg = NULL;
774 char **controller = NULL;
775 int master = -1;
776 const char *console = NULL;
777 struct termios saved_attr, raw_attr;
778 sigset_t mask;
779 bool saved_attr_valid = false;
780 struct winsize ws;
781 int kmsg_socket_pair[2] = { -1, -1 };
782
783 log_parse_environment();
784 log_open();
785
786 if ((r = parse_argv(argc, argv)) <= 0)
787 goto finish;
788
789 if (arg_directory) {
790 char *p;
791
792 p = path_make_absolute_cwd(arg_directory);
793 free(arg_directory);
794 arg_directory = p;
795 } else
796 arg_directory = get_current_dir_name();
797
798 if (!arg_directory) {
799 log_error("Failed to determine path");
800 goto finish;
801 }
802
803 path_kill_slashes(arg_directory);
804
805 if (geteuid() != 0) {
806 log_error("Need to be root.");
807 goto finish;
808 }
809
810 if (sd_booted() <= 0) {
811 log_error("Not running on a systemd system.");
812 goto finish;
813 }
814
815 if (path_equal(arg_directory, "/")) {
816 log_error("Spawning container on root directory not supported.");
817 goto finish;
818 }
819
820 if (is_os_tree(arg_directory) <= 0) {
821 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
822 goto finish;
823 }
824
825 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
826 log_error("Failed to determine current cgroup: %s", strerror(-k));
827 goto finish;
828 }
829
830 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
831 log_error("Failed to allocate cgroup path.");
832 goto finish;
833 }
834
835 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
836 if (k < 0) {
837 log_error("Failed to create cgroup: %s", strerror(-k));
838 goto finish;
839 }
840
841 STRV_FOREACH(controller,arg_controllers) {
842 k = cg_create_and_attach(*controller, newcg, 0);
843 if (k < 0)
844 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
845 }
846
847 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
848 log_error("Failed to acquire pseudo tty: %m");
849 goto finish;
850 }
851
852 if (!(console = ptsname(master))) {
853 log_error("Failed to determine tty name: %m");
854 goto finish;
855 }
856
857 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
858
859 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
860 ioctl(master, TIOCSWINSZ, &ws);
861
862 if (unlockpt(master) < 0) {
863 log_error("Failed to unlock tty: %m");
864 goto finish;
865 }
866
867 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
868 log_error("Failed to get terminal attributes: %m");
869 goto finish;
870 }
871
872 saved_attr_valid = true;
873
874 raw_attr = saved_attr;
875 cfmakeraw(&raw_attr);
876 raw_attr.c_lflag &= ~ECHO;
877
878 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
879 log_error("Failed to set terminal attributes: %m");
880 goto finish;
881 }
882
883 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
884 log_error("Failed to create kmsg socket pair");
885 goto finish;
886 }
887
888 assert_se(sigemptyset(&mask) == 0);
889 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
890 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
891
892 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
893 if (pid < 0) {
894 if (errno == EINVAL)
895 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
896 else
897 log_error("clone() failed: %m");
898
899 goto finish;
900 }
901
902 if (pid == 0) {
903 /* child */
904
905 const char *home = NULL;
906 uid_t uid = (uid_t) -1;
907 gid_t gid = (gid_t) -1;
908 const char *envp[] = {
909 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
910 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
911 NULL, /* TERM */
912 NULL, /* HOME */
913 NULL, /* USER */
914 NULL, /* LOGNAME */
915 NULL
916 };
917
918 envp[2] = strv_find_prefix(environ, "TERM=");
919
920 close_nointr_nofail(master);
921
922 close_nointr(STDIN_FILENO);
923 close_nointr(STDOUT_FILENO);
924 close_nointr(STDERR_FILENO);
925
926 close_all_fds(&kmsg_socket_pair[1], 1);
927
928 reset_all_signal_handlers();
929
930 assert_se(sigemptyset(&mask) == 0);
931 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
932
933 if (setsid() < 0)
934 goto child_fail;
935
936 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
937 goto child_fail;
938
939 /* Mark / as private, in case somebody marked it shared */
940 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
941 goto child_fail;
942
943 if (mount_all(arg_directory) < 0)
944 goto child_fail;
945
946 if (copy_devnodes(arg_directory) < 0)
947 goto child_fail;
948
949 if (setup_dev_console(arg_directory, console) < 0)
950 goto child_fail;
951
952 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
953 goto child_fail;
954
955 close_nointr_nofail(kmsg_socket_pair[1]);
956
957 if (setup_timezone(arg_directory) < 0)
958 goto child_fail;
959
960 if (chdir(arg_directory) < 0) {
961 log_error("chdir(%s) failed: %m", arg_directory);
962 goto child_fail;
963 }
964
965 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
966 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
967 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
968 goto child_fail;
969
970 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
971 log_error("mount(MS_MOVE) failed: %m");
972 goto child_fail;
973 }
974
975 if (chroot(".") < 0) {
976 log_error("chroot() failed: %m");
977 goto child_fail;
978 }
979
980 if (chdir("/") < 0) {
981 log_error("chdir() failed: %m");
982 goto child_fail;
983 }
984
985 umask(0022);
986
987 loopback_setup();
988
989 if (drop_capabilities() < 0)
990 goto child_fail;
991
992 if (arg_user) {
993
994 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
995 log_error("get_user_creds() failed: %m");
996 goto child_fail;
997 }
998
999 if (mkdir_parents(home, 0775) < 0) {
1000 log_error("mkdir_parents() failed: %m");
1001 goto child_fail;
1002 }
1003
1004 if (safe_mkdir(home, 0775, uid, gid) < 0) {
1005 log_error("safe_mkdir() failed: %m");
1006 goto child_fail;
1007 }
1008
1009 if (initgroups((const char*)arg_user, gid) < 0) {
1010 log_error("initgroups() failed: %m");
1011 goto child_fail;
1012 }
1013
1014 if (setresgid(gid, gid, gid) < 0) {
1015 log_error("setregid() failed: %m");
1016 goto child_fail;
1017 }
1018
1019 if (setresuid(uid, uid, uid) < 0) {
1020 log_error("setreuid() failed: %m");
1021 goto child_fail;
1022 }
1023 }
1024
1025 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
1026 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
1027 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
1028 log_error("Out of memory");
1029 goto child_fail;
1030 }
1031
1032 setup_hostname();
1033
1034 if (arg_boot) {
1035 char **a;
1036 size_t l;
1037
1038 /* Automatically search for the init system */
1039
1040 l = 1 + argc - optind;
1041 a = newa(char*, l + 1);
1042 memcpy(a + 1, argv + optind, l * sizeof(char*));
1043
1044 a[0] = (char*) "/usr/lib/systemd/systemd";
1045 execve(a[0], a, (char**) envp);
1046
1047 a[0] = (char*) "/lib/systemd/systemd";
1048 execve(a[0], a, (char**) envp);
1049
1050 a[0] = (char*) "/sbin/init";
1051 execve(a[0], a, (char**) envp);
1052 } else if (argc > optind)
1053 execvpe(argv[optind], argv + optind, (char**) envp);
1054 else {
1055 chdir(home ? home : "/root");
1056 execle("/bin/bash", "-bash", NULL, (char**) envp);
1057 }
1058
1059 log_error("execv() failed: %m");
1060
1061 child_fail:
1062 _exit(EXIT_FAILURE);
1063 }
1064
1065 if (process_pty(master, &mask) < 0)
1066 goto finish;
1067
1068 if (saved_attr_valid) {
1069 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1070 saved_attr_valid = false;
1071 }
1072
1073 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1074
1075 if (r < 0)
1076 r = EXIT_FAILURE;
1077
1078 finish:
1079 if (saved_attr_valid)
1080 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1081
1082 if (master >= 0)
1083 close_nointr_nofail(master);
1084
1085 close_pipe(kmsg_socket_pair);
1086
1087 if (oldcg)
1088 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1089
1090 if (newcg)
1091 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1092
1093 free(arg_directory);
1094 strv_free(arg_controllers);
1095 free(oldcg);
1096 free(newcg);
1097
1098 return r;
1099 }