]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: add --read-only switch
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command line settings; parse_argv() below fills these in. */

/* String-valued options, NULL as long as the option was not given */
static char *arg_directory = NULL;
static char *arg_user = NULL;
static char *arg_uuid = NULL;
static char **arg_controllers = NULL;

/* Boolean switches, all off unless requested */
static bool arg_boot = false;
static bool arg_private_network = false;
static bool arg_read_only = false;
62
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Synopsis line, prefixed with the name we were invoked as */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        /* Description and option list carry no format directives */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              " -h --help Show this help\n"
              " -D --directory=NAME Root directory for the container\n"
              " -b --boot Boot up full system (i.e. invoke init)\n"
              " -u --user=USER Run the command under specified user or uid\n"
              " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              " --uuid=UUID Set a specific machine UUID for the container\n"
              " --private-network Disable network in container\n"
              " --read-only Mount the root directory read-only\n",
              stdout);

        return 0;
}
79
80 static int parse_argv(int argc, char *argv[]) {
81
82 enum {
83 ARG_PRIVATE_NETWORK = 0x100,
84 ARG_UUID,
85 ARG_READ_ONLY
86 };
87
88 static const struct option options[] = {
89 { "help", no_argument, NULL, 'h' },
90 { "directory", required_argument, NULL, 'D' },
91 { "user", required_argument, NULL, 'u' },
92 { "controllers", required_argument, NULL, 'C' },
93 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
94 { "boot", no_argument, NULL, 'b' },
95 { "uuid", required_argument, NULL, ARG_UUID },
96 { "read-only", no_argument, NULL, ARG_READ_ONLY },
97 { NULL, 0, NULL, 0 }
98 };
99
100 int c;
101
102 assert(argc >= 0);
103 assert(argv);
104
105 while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
106
107 switch (c) {
108
109 case 'h':
110 help();
111 return 0;
112
113 case 'D':
114 free(arg_directory);
115 arg_directory = canonicalize_file_name(optarg);
116 if (!arg_directory) {
117 log_error("Failed to canonicalize root directory.");
118 return -ENOMEM;
119 }
120
121 break;
122
123 case 'u':
124 free(arg_user);
125 if (!(arg_user = strdup(optarg))) {
126 log_error("Failed to duplicate user name.");
127 return -ENOMEM;
128 }
129
130 break;
131
132 case 'C':
133 strv_free(arg_controllers);
134 arg_controllers = strv_split(optarg, ",");
135 if (!arg_controllers) {
136 log_error("Failed to split controllers list.");
137 return -ENOMEM;
138 }
139 strv_uniq(arg_controllers);
140
141 break;
142
143 case ARG_PRIVATE_NETWORK:
144 arg_private_network = true;
145 break;
146
147 case 'b':
148 arg_boot = true;
149 break;
150
151 case ARG_UUID:
152 arg_uuid = optarg;
153 break;
154
155 case ARG_READ_ONLY:
156 arg_read_only = true;
157 break;
158
159 case '?':
160 return -EINVAL;
161
162 default:
163 log_error("Unknown option code %c", c);
164 return -EINVAL;
165 }
166 }
167
168 return 1;
169 }
170
/*
 * Establishes the API file system mounts listed in the table below
 * underneath the directory dest.
 *
 * All table entries are attempted even if an earlier one fails; only
 * running out of memory stops the walk. Returns 0 on success, otherwise
 * the first error that was recorded, as negative errno-style value.
 * A failing mount() only counts as error for entries marked fatal.
 */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",            "/proc",           "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,        true  },
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND,                             true  }, /* Bind mount first */
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,        true  }, /* Then, make it r/o */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND,                             true  }, /* Bind mount first */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,        true  }, /* Then, make it r/o */
                { "tmpfs",           "/dev",            "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,            true  },
                { "/dev/pts",        "/dev/pts",        "bind",  NULL,       MS_BIND,                             true  },
                { "tmpfs",           "/run",            "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,   true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND,                             false }, /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,        false }, /* Then, make it r/o */
#endif
        };

        const MountPoint *m;
        int ret = 0;

        for (m = mount_table; m < mount_table + ELEMENTSOF(mount_table); m++) {
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (ret == 0)
                                ret = -ENOMEM;

                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));

                        if (ret == 0)
                                ret = q;

                        free(target);
                        continue;
                }

                /* Best effort, the result is not checked */
                mkdir_p(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0 && m->fatal) {
                        log_error("mount(%s) failed: %m", target);

                        if (ret == 0)
                                ret = -errno;
                }

                free(target);
        }

        return ret;
}
244
/*
 * Bind mounts /etc/localtime and /etc/timezone of the host read-only
 * over the files of the same name below dest. Mount failures are
 * ignored; the only error reported is -ENOMEM.
 */
static int setup_timezone(const char *dest) {
        char *target;
        int q;

        assert(dest);

        /* First /etc/localtime ... */
        q = asprintf(&target, "%s/etc/localtime", dest);
        if (q < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        /* Only try to make it read-only if the bind mount worked */
        q = mount("/etc/localtime", target, "bind", MS_BIND, NULL);
        if (q >= 0)
                mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        /* ... then the same for /etc/timezone */
        q = asprintf(&target, "%s/etc/timezone", dest);
        if (q < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        q = mount("/etc/timezone", target, "bind", MS_BIND, NULL);
        if (q >= 0)
                mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        return 0;
}
273
274 static int setup_resolv_conf(const char *dest) {
275 char *where;
276
277 assert(dest);
278
279 if (arg_private_network)
280 return 0;
281
282 /* Fix resolv.conf, if possible */
283 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
284 log_error("Out of memory");
285 return -ENOMEM;
286 }
287
288 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
289 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
290
291 free(where);
292
293 return 0;
294 }
295
296 static int copy_devnodes(const char *dest) {
297
298 static const char devnodes[] =
299 "null\0"
300 "zero\0"
301 "full\0"
302 "random\0"
303 "urandom\0"
304 "tty\0"
305 "ptmx\0"
306 "rtc0\0";
307
308 const char *d;
309 int r = 0;
310 mode_t u;
311
312 assert(dest);
313
314 u = umask(0000);
315
316 NULSTR_FOREACH(d, devnodes) {
317 struct stat st;
318 char *from = NULL, *to = NULL;
319
320 asprintf(&from, "/dev/%s", d);
321 asprintf(&to, "%s/dev/%s", dest, d);
322
323 if (!from || !to) {
324 log_error("Failed to allocate devnode path");
325
326 free(from);
327 free(to);
328
329 from = to = NULL;
330
331 if (r == 0)
332 r = -ENOMEM;
333
334 break;
335 }
336
337 if (stat(from, &st) < 0) {
338
339 if (errno != ENOENT) {
340 log_error("Failed to stat %s: %m", from);
341 if (r == 0)
342 r = -errno;
343 }
344
345 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
346
347 log_error("%s is not a char or block device, cannot copy.", from);
348 if (r == 0)
349 r = -EIO;
350
351 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
352
353 log_error("mknod(%s) failed: %m", dest);
354 if (r == 0)
355 r = -errno;
356 }
357
358 free(from);
359 free(to);
360 }
361
362 umask(u);
363
364 return r;
365 }
366
/*
 * Makes the TTY named by console available as <dest>/dev/console.
 *
 * The TTY is set to mode 0600 and owner 0:0, a device node is created
 * as mount target, and the TTY is then bind mounted on top of it.
 * Returns a negative errno-style value on failure, otherwise the
 * (non-negative) result of chmod_and_chown().
 */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int ret;

        assert(dest);
        assert(console);

        /* No umask while we create the node, restored on the way out */
        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                ret = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                ret = -EIO;
                goto finish;
        }

        ret = chmod_and_chown(console, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-ret));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto finish;
        }

        /* A pty can only live on a pts file system, hence the right
         * tty has to be bind mounted to /dev/console. The bind mount
         * needs something to sit on, so a device node is created
         * first. Its major/minor is taken from the tty, though that
         * does not really matter as it is mounted over right away. */

        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                ret = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                ret = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return ret;
}
426
/*
 * Provides a FIFO as /proc/kmsg inside the tree at dest.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted to
 * <dest>/proc/kmsg, opened, and the open fd is queued on kmsg_socket
 * via SCM_RIGHTS. Finally the /dev/kmsg name is removed again.
 * Returns a negative errno-style value on failure, otherwise the
 * (non-negative) result of chmod_and_chown().
 */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;
        mode_t saved_umask;
        int ret, fifo_fd, sent;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created under the name /dev/kmsg, and that
         * name is dropped again right after it has been bind mounted
         * to /proc/kmsg. For readers a FIFO is pretty close to
         * /proc/kmsg. For writers it is not a good stand-in for
         * /dev/kmsg, since writing to a FIFO blocks while nobody
         * reads. Rather than risking that containers deadlock on
         * this, /dev/kmsg is simply not offered to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                ret = -errno;
                goto finish;
        }

        ret = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-ret));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                ret = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                ret = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket; this way it stays open for as
         * long as the child is running */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                ret = -errno;
                goto finish;
        }

        /* Now take the /dev/kmsg name away again */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return ret;
}
522
523 static int setup_hostname(void) {
524 char *hn;
525 int r = 0;
526
527 hn = file_name_from_path(arg_directory);
528 if (hn) {
529 hn = strdup(hn);
530 if (!hn)
531 return -ENOMEM;
532
533 hostname_cleanup(hn);
534
535 if (!isempty(hn))
536 if (sethostname(hn, strlen(hn)) < 0)
537 r = -errno;
538
539 free(hn);
540 }
541
542 return r;
543 }
544
/*
 * Removes every capability from 0 up to and including cap_last_cap()
 * from the bounding set, except for the ones listed in retain[].
 * Returns 0 on success, a negative errno if prctl() fails.
 */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned j;

                /* Is this one on the list of capabilities to keep? */
                for (j = 0; j < ELEMENTSOF(retain); j++)
                        if (retain[j] == cap) {
                                keep = true;
                                break;
                        }

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
590
/*
 * Checks whether path looks like the root of an OS installation.
 * The presence of <path>/bin/sh serves as the flag file for that.
 *
 * Returns 1 if the file is accessible, 0 if not, -ENOMEM if the path
 * could not be allocated.
 */
static int is_os_tree(const char *path) {
        char *marker;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(marker, F_OK) >= 0;
        free(marker);

        return found ? 1 : 0;
}
604
/*
 * Shovels data between our stdin/stdout and the pty master until a
 * signal other than SIGWINCH arrives.
 *
 * master is the pty master fd; mask is the set of signals to receive
 * through a signalfd (the caller in this file blocks them beforehand).
 * SIGWINCH is handled by copying the window size of stdin to the
 * master; any other signal read from the signalfd ends the loop with
 * return value 0. On errors a negative errno-style value is returned.
 *
 * stdin, stdout and master are switched to non-blocking mode and
 * watched edge-triggered, hence the readable/writable state of each
 * is remembered in a flag until an operation on it reports that it
 * would block (or the peer is gone).
 */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is the only level-triggered source */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to register fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First, record what became possible ... */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask ends the session */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* ... then move data for as long as any progress can be made */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF: nothing more will ever
                                         * come from here. Without
                                         * clearing the flag the loop
                                         * condition stays true with an
                                         * empty buffer and we would
                                         * spin on read() forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, same reasoning as for stdin above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
807
808 int main(int argc, char *argv[]) {
809 pid_t pid = 0;
810 int r = EXIT_FAILURE, k;
811 char *oldcg = NULL, *newcg = NULL;
812 char **controller = NULL;
813 int master = -1;
814 const char *console = NULL;
815 struct termios saved_attr, raw_attr;
816 sigset_t mask;
817 bool saved_attr_valid = false;
818 struct winsize ws;
819 int kmsg_socket_pair[2] = { -1, -1 };
820
821 log_parse_environment();
822 log_open();
823
824 if ((r = parse_argv(argc, argv)) <= 0)
825 goto finish;
826
827 if (arg_directory) {
828 char *p;
829
830 p = path_make_absolute_cwd(arg_directory);
831 free(arg_directory);
832 arg_directory = p;
833 } else
834 arg_directory = get_current_dir_name();
835
836 if (!arg_directory) {
837 log_error("Failed to determine path");
838 goto finish;
839 }
840
841 path_kill_slashes(arg_directory);
842
843 if (geteuid() != 0) {
844 log_error("Need to be root.");
845 goto finish;
846 }
847
848 if (sd_booted() <= 0) {
849 log_error("Not running on a systemd system.");
850 goto finish;
851 }
852
853 if (path_equal(arg_directory, "/")) {
854 log_error("Spawning container on root directory not supported.");
855 goto finish;
856 }
857
858 if (is_os_tree(arg_directory) <= 0) {
859 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
860 goto finish;
861 }
862
863 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
864 log_error("Failed to determine current cgroup: %s", strerror(-k));
865 goto finish;
866 }
867
868 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
869 log_error("Failed to allocate cgroup path.");
870 goto finish;
871 }
872
873 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
874 if (k < 0) {
875 log_error("Failed to create cgroup: %s", strerror(-k));
876 goto finish;
877 }
878
879 STRV_FOREACH(controller,arg_controllers) {
880 k = cg_create_and_attach(*controller, newcg, 0);
881 if (k < 0)
882 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
883 }
884
885 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
886 log_error("Failed to acquire pseudo tty: %m");
887 goto finish;
888 }
889
890 if (!(console = ptsname(master))) {
891 log_error("Failed to determine tty name: %m");
892 goto finish;
893 }
894
895 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
896
897 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
898 ioctl(master, TIOCSWINSZ, &ws);
899
900 if (unlockpt(master) < 0) {
901 log_error("Failed to unlock tty: %m");
902 goto finish;
903 }
904
905 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
906 log_error("Failed to get terminal attributes: %m");
907 goto finish;
908 }
909
910 saved_attr_valid = true;
911
912 raw_attr = saved_attr;
913 cfmakeraw(&raw_attr);
914 raw_attr.c_lflag &= ~ECHO;
915
916 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
917 log_error("Failed to set terminal attributes: %m");
918 goto finish;
919 }
920
921 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
922 log_error("Failed to create kmsg socket pair");
923 goto finish;
924 }
925
926 assert_se(sigemptyset(&mask) == 0);
927 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
928 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
929
930 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
931 if (pid < 0) {
932 if (errno == EINVAL)
933 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
934 else
935 log_error("clone() failed: %m");
936
937 goto finish;
938 }
939
940 if (pid == 0) {
941 /* child */
942
943 const char *home = NULL;
944 uid_t uid = (uid_t) -1;
945 gid_t gid = (gid_t) -1;
946 const char *envp[] = {
947 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
948 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
949 NULL, /* TERM */
950 NULL, /* HOME */
951 NULL, /* USER */
952 NULL, /* LOGNAME */
953 NULL, /* container_uuid */
954 NULL
955 };
956
957 envp[2] = strv_find_prefix(environ, "TERM=");
958
959 close_nointr_nofail(master);
960
961 close_nointr(STDIN_FILENO);
962 close_nointr(STDOUT_FILENO);
963 close_nointr(STDERR_FILENO);
964
965 close_all_fds(&kmsg_socket_pair[1], 1);
966
967 reset_all_signal_handlers();
968
969 assert_se(sigemptyset(&mask) == 0);
970 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
971
972 if (setsid() < 0)
973 goto child_fail;
974
975 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
976 goto child_fail;
977
978 /* Mark / as private, in case somebody marked it shared */
979 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
980 goto child_fail;
981
982 /* Turn directory into bind mount */
983 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
984 log_error("Failed to make bind mount.");
985 goto child_fail;
986 }
987
988 if (arg_read_only)
989 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
990 log_error("Failed to make read-only.");
991 goto child_fail;
992 }
993
994 if (mount_all(arg_directory) < 0)
995 goto child_fail;
996
997 if (copy_devnodes(arg_directory) < 0)
998 goto child_fail;
999
1000 if (setup_dev_console(arg_directory, console) < 0)
1001 goto child_fail;
1002
1003 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1004 goto child_fail;
1005
1006 close_nointr_nofail(kmsg_socket_pair[1]);
1007
1008 if (setup_timezone(arg_directory) < 0)
1009 goto child_fail;
1010
1011 if (setup_resolv_conf(arg_directory) < 0)
1012 goto child_fail;
1013
1014 if (chdir(arg_directory) < 0) {
1015 log_error("chdir(%s) failed: %m", arg_directory);
1016 goto child_fail;
1017 }
1018
1019 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1020 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1021 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1022 goto child_fail;
1023
1024 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1025 log_error("mount(MS_BIND) failed: %m");
1026 goto child_fail;
1027 }
1028
1029 if (chroot(".") < 0) {
1030 log_error("chroot() failed: %m");
1031 goto child_fail;
1032 }
1033
1034 if (chdir("/") < 0) {
1035 log_error("chdir() failed: %m");
1036 goto child_fail;
1037 }
1038
1039 umask(0022);
1040
1041 loopback_setup();
1042
1043 if (drop_capabilities() < 0)
1044 goto child_fail;
1045
1046 if (arg_user) {
1047
1048 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1049 log_error("get_user_creds() failed: %m");
1050 goto child_fail;
1051 }
1052
1053 if (mkdir_parents(home, 0775) < 0) {
1054 log_error("mkdir_parents() failed: %m");
1055 goto child_fail;
1056 }
1057
1058 if (safe_mkdir(home, 0775, uid, gid) < 0) {
1059 log_error("safe_mkdir() failed: %m");
1060 goto child_fail;
1061 }
1062
1063 if (initgroups((const char*)arg_user, gid) < 0) {
1064 log_error("initgroups() failed: %m");
1065 goto child_fail;
1066 }
1067
1068 if (setresgid(gid, gid, gid) < 0) {
1069 log_error("setregid() failed: %m");
1070 goto child_fail;
1071 }
1072
1073 if (setresuid(uid, uid, uid) < 0) {
1074 log_error("setreuid() failed: %m");
1075 goto child_fail;
1076 }
1077 }
1078
1079 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1080 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1081 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1082 log_error("Out of memory");
1083 goto child_fail;
1084 }
1085
1086 if (arg_uuid) {
1087 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1088 log_error("Out of memory");
1089 goto child_fail;
1090 }
1091 }
1092
1093 setup_hostname();
1094
1095 if (arg_boot) {
1096 char **a;
1097 size_t l;
1098
1099 /* Automatically search for the init system */
1100
1101 l = 1 + argc - optind;
1102 a = newa(char*, l + 1);
1103 memcpy(a + 1, argv + optind, l * sizeof(char*));
1104
1105 a[0] = (char*) "/usr/lib/systemd/systemd";
1106 execve(a[0], a, (char**) envp);
1107
1108 a[0] = (char*) "/lib/systemd/systemd";
1109 execve(a[0], a, (char**) envp);
1110
1111 a[0] = (char*) "/sbin/init";
1112 execve(a[0], a, (char**) envp);
1113 } else if (argc > optind)
1114 execvpe(argv[optind], argv + optind, (char**) envp);
1115 else {
1116 chdir(home ? home : "/root");
1117 execle("/bin/bash", "-bash", NULL, (char**) envp);
1118 }
1119
1120 log_error("execv() failed: %m");
1121
1122 child_fail:
1123 _exit(EXIT_FAILURE);
1124 }
1125
1126 if (process_pty(master, &mask) < 0)
1127 goto finish;
1128
1129 if (saved_attr_valid) {
1130 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1131 saved_attr_valid = false;
1132 }
1133
1134 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1135
1136 if (r < 0)
1137 r = EXIT_FAILURE;
1138
1139 finish:
1140 if (saved_attr_valid)
1141 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1142
1143 if (master >= 0)
1144 close_nointr_nofail(master);
1145
1146 close_pipe(kmsg_socket_pair);
1147
1148 if (oldcg)
1149 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1150
1151 if (newcg)
1152 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1153
1154 free(arg_directory);
1155 strv_free(arg_controllers);
1156 free(oldcg);
1157 free(newcg);
1158
1159 return r;
1160 }