]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: make /dev/kmsg unavailable in the container, but allow access to /proc/kmsg
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Settings collected from the command line by parse_argv(). All of them
 * start out unset; the string and string-vector values are heap allocated
 * once an option has been seen. */
static char *arg_directory;             /* -D / --directory= */
static char *arg_user;                  /* -u / --user= */
static char **arg_controllers;          /* -C / --controllers=, split at commas */
static bool arg_private_network;        /* --private-network */
59
/* Prints the usage text to stdout. Always returns 0. */
static int help(void) {

        /* The synopsis line is the only part that needs formatting ... */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        /* ... the option summary is static text. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              " -h --help Show this help\n"
              " -D --directory=NAME Root directory for the container\n"
              " -u --user=USER Run the command under specified user or uid\n"
              " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              " --private-network Disable network in container\n",
              stdout);

        return 0;
}
73
74 static int parse_argv(int argc, char *argv[]) {
75
76 enum {
77 ARG_PRIVATE_NETWORK = 0x100
78 };
79
80 static const struct option options[] = {
81 { "help", no_argument, NULL, 'h' },
82 { "directory", required_argument, NULL, 'D' },
83 { "user", required_argument, NULL, 'u' },
84 { "controllers", required_argument, NULL, 'C' },
85 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
86 { NULL, 0, NULL, 0 }
87 };
88
89 int c;
90
91 assert(argc >= 0);
92 assert(argv);
93
94 while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
95
96 switch (c) {
97
98 case 'h':
99 help();
100 return 0;
101
102 case 'D':
103 free(arg_directory);
104 if (!(arg_directory = strdup(optarg))) {
105 log_error("Failed to duplicate root directory.");
106 return -ENOMEM;
107 }
108
109 break;
110
111 case 'u':
112 free(arg_user);
113 if (!(arg_user = strdup(optarg))) {
114 log_error("Failed to duplicate user name.");
115 return -ENOMEM;
116 }
117
118 break;
119
120 case 'C':
121 strv_free(arg_controllers);
122 arg_controllers = strv_split(optarg, ",");
123 if (!arg_controllers) {
124 log_error("Failed to split controllers list.");
125 return -ENOMEM;
126 }
127 strv_uniq(arg_controllers);
128
129 break;
130
131 case ARG_PRIVATE_NETWORK:
132 arg_private_network = true;
133 break;
134
135 case '?':
136 return -EINVAL;
137
138 default:
139 log_error("Unknown option code %c", c);
140 return -EINVAL;
141 }
142 }
143
144 return 1;
145 }
146
/* Establishes the basic API file systems below the container root "dest",
 * as listed in mount_table[].
 *
 * All entries are attempted even if an earlier one failed. Returns 0 on
 * success, otherwise the negative errno-style code of the first failure
 * that was recorded. Failures of entries not marked "fatal" are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",            "/proc",           "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND,                           true  }, /* Bind mount first */
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  }, /* Then, make it r/o */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND,                           true  }, /* Bind mount first */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  }, /* Then, make it r/o */
                { "tmpfs",           "/dev",            "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",        "/dev/pts",        "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",           "/run",            "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND,                           false }, /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      false }, /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        assert(dest);

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                const MountPoint *m = mount_table + k;
                char *where;
                int t;

                if (asprintf(&where, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                /* The positive result is not used here; only errors are
                 * reported. */
                t = path_is_mount_point(where, false);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Best effort, mount() below will complain if this failed */
                mkdir_p(where, 0755);

                if (mount(m->what, where, m->type, m->flags, m->options) < 0 &&
                    m->fatal) {

                        /* Pick up errno before logging, since the log call
                         * may overwrite it. */
                        if (r == 0)
                                r = -errno;

                        log_error("mount(%s) failed: %m", where);
                }

                free(where);
        }

        return r;
}
220
/* Makes the host's time zone configuration visible inside the container
 * by bind mounting it read-only over the matching files below "dest".
 *
 * The mounts are best effort, their failure is not reported. Returns 0,
 * or -ENOMEM if a path could not be allocated. */
static int setup_timezone(const char *dest) {

        static const char * const sources[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(sources) / sizeof(sources[0]); i++) {
                char *where;

                if (asprintf(&where, "%s%s", dest, sources[i]) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* Bind mount first, and only if that worked remount it r/o */
                if (mount(sources[i], where, "bind", MS_BIND, NULL) >= 0)
                        mount(sources[i], where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(where);
        }

        return 0;
}
249
250 static int copy_devnodes(const char *dest) {
251
252 static const char devnodes[] =
253 "null\0"
254 "zero\0"
255 "full\0"
256 "random\0"
257 "urandom\0"
258 "tty\0"
259 "ptmx\0"
260 "rtc0\0";
261
262 const char *d;
263 int r = 0;
264 mode_t u;
265
266 assert(dest);
267
268 u = umask(0000);
269
270 NULSTR_FOREACH(d, devnodes) {
271 struct stat st;
272 char *from = NULL, *to = NULL;
273
274 asprintf(&from, "/dev/%s", d);
275 asprintf(&to, "%s/dev/%s", dest, d);
276
277 if (!from || !to) {
278 log_error("Failed to allocate devnode path");
279
280 free(from);
281 free(to);
282
283 from = to = NULL;
284
285 if (r == 0)
286 r = -ENOMEM;
287
288 break;
289 }
290
291 if (stat(from, &st) < 0) {
292
293 if (errno != ENOENT) {
294 log_error("Failed to stat %s: %m", from);
295 if (r == 0)
296 r = -errno;
297 }
298
299 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
300
301 log_error("%s is not a char or block device, cannot copy.", from);
302 if (r == 0)
303 r = -EIO;
304
305 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
306
307 log_error("mknod(%s) failed: %m", dest);
308 if (r == 0)
309 r = -errno;
310 }
311
312 free(from);
313 free(to);
314 }
315
316 umask(u);
317
318 return r;
319 }
320
/* Makes the TTY "console" available as "<dest>/dev/console".
 *
 * The TTY is set to mode 0600, owned by 0:0, and then bind mounted on top
 * of a freshly created device node. Returns 0 on success, a negative
 * errno-style code on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *to = NULL;
        int r;
        mode_t u;

        assert(dest);
        assert(console);

        u = umask(0000);

        if (stat(console, &st) < 0) {
                /* Pick up errno before the log call can overwrite it */
                r = -errno;
                log_error("Failed to stat %s: %m", console);
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&to, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                to = NULL; /* undefined after a failed asprintf(), but freed below */
                r = -ENOMEM;
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                r = -errno;
                log_error("mknod() for /dev/console failed: %m");
                goto finish;
        }

        if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
                r = -errno;
                log_error("Bind mount for /dev/console failed: %m");
                goto finish;
        }

        r = 0;

finish:
        free(to);
        umask(u);

        return r;
}
380
381 static int setup_kmsg(const char *dest, int kmsg_socket) {
382 char *from = NULL, *to = NULL;
383 int r, fd, k;
384 mode_t u;
385 union {
386 struct cmsghdr cmsghdr;
387 uint8_t buf[CMSG_SPACE(sizeof(int))];
388 } control;
389 struct msghdr mh;
390 struct cmsghdr *cmsg;
391
392 assert(dest);
393 assert(kmsg_socket >= 0);
394
395 u = umask(0000);
396
397 /* We create the kmsg FIFO as /dev/kmsg, but immediately
398 * delete it after bind mounting it to /proc/kmsg. While FIFOs
399 * on the reading side behave very similar to /proc/kmsg,
400 * their writing side behaves differently from /dev/kmsg in
401 * that writing blocks when nothing is reading. In order to
402 * avoid any problems with containers deadlocking due to this
403 * we simply make /dev/kmsg unavailable to the container. */
404 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
405 log_error("Out of memory");
406 r = -ENOMEM;
407 goto finish;
408 }
409
410 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
411 log_error("Out of memory");
412 r = -ENOMEM;
413 goto finish;
414 }
415
416 if (mkfifo(from, 0600) < 0) {
417 log_error("mkfifo() for /dev/kmsg failed: %m");
418 r = -errno;
419 goto finish;
420 }
421
422 r = chmod_and_chown(from, 0600, 0, 0);
423 if (r < 0) {
424 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
425 goto finish;
426 }
427
428 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
429 log_error("Bind mount for /proc/kmsg failed: %m");
430 r = -errno;
431 goto finish;
432 }
433
434 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
435 if (fd < 0) {
436 log_error("Failed to open fifo: %m");
437 r = -errno;
438 goto finish;
439 }
440
441 zero(mh);
442 zero(control);
443
444 mh.msg_control = &control;
445 mh.msg_controllen = sizeof(control);
446
447 cmsg = CMSG_FIRSTHDR(&mh);
448 cmsg->cmsg_level = SOL_SOCKET;
449 cmsg->cmsg_type = SCM_RIGHTS;
450 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
451 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
452
453 mh.msg_controllen = cmsg->cmsg_len;
454
455 /* Store away the fd in the socket, so that it stays open as
456 * long as we run the child */
457 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
458 close_nointr_nofail(fd);
459
460 if (k < 0) {
461 log_error("Failed to send FIFO fd: %m");
462 r = -errno;
463 goto finish;
464 }
465
466 /* And now make the FIFO unavailable as /dev/kmsg... */
467 unlink(from);
468
469 finish:
470 free(from);
471 free(to);
472 umask(u);
473
474 return r;
475 }
476
477 static int drop_capabilities(void) {
478 static const unsigned long retain[] = {
479 CAP_CHOWN,
480 CAP_DAC_OVERRIDE,
481 CAP_DAC_READ_SEARCH,
482 CAP_FOWNER,
483 CAP_FSETID,
484 CAP_IPC_OWNER,
485 CAP_KILL,
486 CAP_LEASE,
487 CAP_LINUX_IMMUTABLE,
488 CAP_NET_BIND_SERVICE,
489 CAP_NET_BROADCAST,
490 CAP_NET_RAW,
491 CAP_SETGID,
492 CAP_SETFCAP,
493 CAP_SETPCAP,
494 CAP_SETUID,
495 CAP_SYS_ADMIN,
496 CAP_SYS_CHROOT,
497 CAP_SYS_NICE,
498 CAP_SYS_PTRACE,
499 CAP_SYS_TTY_CONFIG
500 };
501
502 unsigned long l;
503
504 for (l = 0; l <= cap_last_cap(); l++) {
505 unsigned i;
506
507 for (i = 0; i < ELEMENTSOF(retain); i++)
508 if (retain[i] == l)
509 break;
510
511 if (i < ELEMENTSOF(retain))
512 continue;
513
514 if (prctl(PR_CAPBSET_DROP, l) < 0) {
515 log_error("PR_CAPBSET_DROP failed: %m");
516 return -errno;
517 }
518 }
519
520 return 0;
521 }
522
/* Checks whether "path" looks like the root directory of an OS tree.
 *
 * Returns 1 if "<path>/bin/sh" exists, 0 if it does not (or cannot be
 * accessed), and -ENOMEM if the file name could not be allocated. */
static int is_os_tree(const char *path) {
        char *flag_file;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&flag_file, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(flag_file, F_OK) >= 0;
        free(flag_file);

        if (found)
                return 1;

        return 0;
}
536
537 static int process_pty(int master, sigset_t *mask) {
538
539 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
540 size_t in_buffer_full = 0, out_buffer_full = 0;
541 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
542 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
543 int ep = -1, signal_fd = -1, r;
544
545 fd_nonblock(STDIN_FILENO, 1);
546 fd_nonblock(STDOUT_FILENO, 1);
547 fd_nonblock(master, 1);
548
549 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
550 log_error("signalfd(): %m");
551 r = -errno;
552 goto finish;
553 }
554
555 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
556 log_error("Failed to create epoll: %m");
557 r = -errno;
558 goto finish;
559 }
560
561 zero(stdin_ev);
562 stdin_ev.events = EPOLLIN|EPOLLET;
563 stdin_ev.data.fd = STDIN_FILENO;
564
565 zero(stdout_ev);
566 stdout_ev.events = EPOLLOUT|EPOLLET;
567 stdout_ev.data.fd = STDOUT_FILENO;
568
569 zero(master_ev);
570 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
571 master_ev.data.fd = master;
572
573 zero(signal_ev);
574 signal_ev.events = EPOLLIN;
575 signal_ev.data.fd = signal_fd;
576
577 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
578 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
579 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
580 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
581 log_error("Failed to regiser fds in epoll: %m");
582 r = -errno;
583 goto finish;
584 }
585
586 for (;;) {
587 struct epoll_event ev[16];
588 ssize_t k;
589 int i, nfds;
590
591 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
592
593 if (errno == EINTR || errno == EAGAIN)
594 continue;
595
596 log_error("epoll_wait(): %m");
597 r = -errno;
598 goto finish;
599 }
600
601 assert(nfds >= 1);
602
603 for (i = 0; i < nfds; i++) {
604 if (ev[i].data.fd == STDIN_FILENO) {
605
606 if (ev[i].events & (EPOLLIN|EPOLLHUP))
607 stdin_readable = true;
608
609 } else if (ev[i].data.fd == STDOUT_FILENO) {
610
611 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
612 stdout_writable = true;
613
614 } else if (ev[i].data.fd == master) {
615
616 if (ev[i].events & (EPOLLIN|EPOLLHUP))
617 master_readable = true;
618
619 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
620 master_writable = true;
621
622 } else if (ev[i].data.fd == signal_fd) {
623 struct signalfd_siginfo sfsi;
624 ssize_t n;
625
626 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
627
628 if (n >= 0) {
629 log_error("Failed to read from signalfd: invalid block size");
630 r = -EIO;
631 goto finish;
632 }
633
634 if (errno != EINTR && errno != EAGAIN) {
635 log_error("Failed to read from signalfd: %m");
636 r = -errno;
637 goto finish;
638 }
639 } else {
640
641 if (sfsi.ssi_signo == SIGWINCH) {
642 struct winsize ws;
643
644 /* The window size changed, let's forward that. */
645 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
646 ioctl(master, TIOCSWINSZ, &ws);
647 } else {
648 r = 0;
649 goto finish;
650 }
651 }
652 }
653 }
654
655 while ((stdin_readable && in_buffer_full <= 0) ||
656 (master_writable && in_buffer_full > 0) ||
657 (master_readable && out_buffer_full <= 0) ||
658 (stdout_writable && out_buffer_full > 0)) {
659
660 if (stdin_readable && in_buffer_full < LINE_MAX) {
661
662 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
663
664 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
665 stdin_readable = false;
666 else {
667 log_error("read(): %m");
668 r = -errno;
669 goto finish;
670 }
671 } else
672 in_buffer_full += (size_t) k;
673 }
674
675 if (master_writable && in_buffer_full > 0) {
676
677 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
678
679 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
680 master_writable = false;
681 else {
682 log_error("write(): %m");
683 r = -errno;
684 goto finish;
685 }
686
687 } else {
688 assert(in_buffer_full >= (size_t) k);
689 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
690 in_buffer_full -= k;
691 }
692 }
693
694 if (master_readable && out_buffer_full < LINE_MAX) {
695
696 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
697
698 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
699 master_readable = false;
700 else {
701 log_error("read(): %m");
702 r = -errno;
703 goto finish;
704 }
705 } else
706 out_buffer_full += (size_t) k;
707 }
708
709 if (stdout_writable && out_buffer_full > 0) {
710
711 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
712
713 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
714 stdout_writable = false;
715 else {
716 log_error("write(): %m");
717 r = -errno;
718 goto finish;
719 }
720
721 } else {
722 assert(out_buffer_full >= (size_t) k);
723 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
724 out_buffer_full -= k;
725 }
726 }
727 }
728 }
729
730 finish:
731 if (ep >= 0)
732 close_nointr_nofail(ep);
733
734 if (signal_fd >= 0)
735 close_nointr_nofail(signal_fd);
736
737 return r;
738 }
739
740 int main(int argc, char *argv[]) {
741 pid_t pid = 0;
742 int r = EXIT_FAILURE, k;
743 char *oldcg = NULL, *newcg = NULL;
744 char **controller = NULL;
745 int master = -1;
746 const char *console = NULL;
747 struct termios saved_attr, raw_attr;
748 sigset_t mask;
749 bool saved_attr_valid = false;
750 struct winsize ws;
751 int kmsg_socket_pair[2] = { -1, -1 };
752
753 log_parse_environment();
754 log_open();
755
756 if ((r = parse_argv(argc, argv)) <= 0)
757 goto finish;
758
759 if (arg_directory) {
760 char *p;
761
762 p = path_make_absolute_cwd(arg_directory);
763 free(arg_directory);
764 arg_directory = p;
765 } else
766 arg_directory = get_current_dir_name();
767
768 if (!arg_directory) {
769 log_error("Failed to determine path");
770 goto finish;
771 }
772
773 path_kill_slashes(arg_directory);
774
775 if (geteuid() != 0) {
776 log_error("Need to be root.");
777 goto finish;
778 }
779
780 if (sd_booted() <= 0) {
781 log_error("Not running on a systemd system.");
782 goto finish;
783 }
784
785 if (path_equal(arg_directory, "/")) {
786 log_error("Spawning container on root directory not supported.");
787 goto finish;
788 }
789
790 if (is_os_tree(arg_directory) <= 0) {
791 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
792 goto finish;
793 }
794
795 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
796 log_error("Failed to determine current cgroup: %s", strerror(-k));
797 goto finish;
798 }
799
800 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
801 log_error("Failed to allocate cgroup path.");
802 goto finish;
803 }
804
805 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
806 if (k < 0) {
807 log_error("Failed to create cgroup: %s", strerror(-k));
808 goto finish;
809 }
810
811 STRV_FOREACH(controller,arg_controllers) {
812 k = cg_create_and_attach(*controller, newcg, 0);
813 if (k < 0)
814 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
815 }
816
817 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
818 log_error("Failed to acquire pseudo tty: %m");
819 goto finish;
820 }
821
822 if (!(console = ptsname(master))) {
823 log_error("Failed to determine tty name: %m");
824 goto finish;
825 }
826
827 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
828
829 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
830 ioctl(master, TIOCSWINSZ, &ws);
831
832 if (unlockpt(master) < 0) {
833 log_error("Failed to unlock tty: %m");
834 goto finish;
835 }
836
837 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
838 log_error("Failed to get terminal attributes: %m");
839 goto finish;
840 }
841
842 saved_attr_valid = true;
843
844 raw_attr = saved_attr;
845 cfmakeraw(&raw_attr);
846 raw_attr.c_lflag &= ~ECHO;
847
848 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
849 log_error("Failed to set terminal attributes: %m");
850 goto finish;
851 }
852
853 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
854 log_error("Failed to create kmsg socket pair");
855 goto finish;
856 }
857
858 assert_se(sigemptyset(&mask) == 0);
859 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
860 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
861
862 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
863 if (pid < 0) {
864 if (errno == EINVAL)
865 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
866 else
867 log_error("clone() failed: %m");
868
869 goto finish;
870 }
871
872 if (pid == 0) {
873 /* child */
874
875 const char *hn;
876 const char *home = NULL;
877 uid_t uid = (uid_t) -1;
878 gid_t gid = (gid_t) -1;
879 const char *envp[] = {
880 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
881 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
882 NULL, /* TERM */
883 NULL, /* HOME */
884 NULL, /* USER */
885 NULL, /* LOGNAME */
886 NULL
887 };
888
889 envp[2] = strv_find_prefix(environ, "TERM=");
890
891 close_nointr_nofail(master);
892
893 close_nointr(STDIN_FILENO);
894 close_nointr(STDOUT_FILENO);
895 close_nointr(STDERR_FILENO);
896
897 close_all_fds(&kmsg_socket_pair[1], 1);
898
899 reset_all_signal_handlers();
900
901 assert_se(sigemptyset(&mask) == 0);
902 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
903
904 if (setsid() < 0)
905 goto child_fail;
906
907 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
908 goto child_fail;
909
910 /* Mark / as private, in case somebody marked it shared */
911 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
912 goto child_fail;
913
914 if (mount_all(arg_directory) < 0)
915 goto child_fail;
916
917 if (copy_devnodes(arg_directory) < 0)
918 goto child_fail;
919
920 if (setup_dev_console(arg_directory, console) < 0)
921 goto child_fail;
922
923 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
924 goto child_fail;
925
926 close_nointr_nofail(kmsg_socket_pair[1]);
927
928 if (setup_timezone(arg_directory) < 0)
929 goto child_fail;
930
931 if (chdir(arg_directory) < 0) {
932 log_error("chdir(%s) failed: %m", arg_directory);
933 goto child_fail;
934 }
935
936 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
937 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
938 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
939 goto child_fail;
940
941 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
942 log_error("mount(MS_MOVE) failed: %m");
943 goto child_fail;
944 }
945
946 if (chroot(".") < 0) {
947 log_error("chroot() failed: %m");
948 goto child_fail;
949 }
950
951 if (chdir("/") < 0) {
952 log_error("chdir() failed: %m");
953 goto child_fail;
954 }
955
956 umask(0022);
957
958 loopback_setup();
959
960 if (drop_capabilities() < 0)
961 goto child_fail;
962
963 if (arg_user) {
964
965 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
966 log_error("get_user_creds() failed: %m");
967 goto child_fail;
968 }
969
970 if (mkdir_parents(home, 0775) < 0) {
971 log_error("mkdir_parents() failed: %m");
972 goto child_fail;
973 }
974
975 if (safe_mkdir(home, 0775, uid, gid) < 0) {
976 log_error("safe_mkdir() failed: %m");
977 goto child_fail;
978 }
979
980 if (initgroups((const char*)arg_user, gid) < 0) {
981 log_error("initgroups() failed: %m");
982 goto child_fail;
983 }
984
985 if (setresgid(gid, gid, gid) < 0) {
986 log_error("setregid() failed: %m");
987 goto child_fail;
988 }
989
990 if (setresuid(uid, uid, uid) < 0) {
991 log_error("setreuid() failed: %m");
992 goto child_fail;
993 }
994 }
995
996 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
997 (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
998 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
999 log_error("Out of memory");
1000 goto child_fail;
1001 }
1002
1003 if ((hn = file_name_from_path(arg_directory)))
1004 sethostname(hn, strlen(hn));
1005
1006 if (argc > optind)
1007 execvpe(argv[optind], argv + optind, (char**) envp);
1008 else {
1009 chdir(home ? home : "/root");
1010 execle("/bin/bash", "-bash", NULL, (char**) envp);
1011 }
1012
1013 log_error("execv() failed: %m");
1014
1015 child_fail:
1016 _exit(EXIT_FAILURE);
1017 }
1018
1019 if (process_pty(master, &mask) < 0)
1020 goto finish;
1021
1022 if (saved_attr_valid) {
1023 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1024 saved_attr_valid = false;
1025 }
1026
1027 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1028
1029 if (r < 0)
1030 r = EXIT_FAILURE;
1031
1032 finish:
1033 if (saved_attr_valid)
1034 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1035
1036 if (master >= 0)
1037 close_nointr_nofail(master);
1038
1039 close_pipe(kmsg_socket_pair);
1040
1041 if (oldcg)
1042 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1043
1044 if (newcg)
1045 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1046
1047 free(arg_directory);
1048 strv_free(arg_controllers);
1049 free(oldcg);
1050 free(newcg);
1051
1052 return r;
1053 }