]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
util: split-out path-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55
/* Command line settings; parse_argv() fills these in. */

/* String and list valued options */
static char *arg_directory = NULL;      /* -D / --directory= */
static char *arg_user = NULL;           /* -u / --user= */
static char *arg_uuid = NULL;           /* --uuid= */
static char **arg_controllers = NULL;   /* -C / --controllers= */

/* Boolean switches */
static bool arg_boot = false;            /* -b / --boot */
static bool arg_private_network = false; /* --private-network */
static bool arg_read_only = false;       /* --read-only */
63
/* Prints the usage text to stdout. Always returns 0. */
static int help(void) {

        /* First line carries the program name ... */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        /* ... the remainder is fixed text. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              " -h --help Show this help\n"
              " -D --directory=NAME Root directory for the container\n"
              " -b --boot Boot up full system (i.e. invoke init)\n"
              " -u --user=USER Run the command under specified user or uid\n"
              " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              " --uuid=UUID Set a specific machine UUID for the container\n"
              " --private-network Disable network in container\n"
              " --read-only Mount the root directory read-only\n",
              stdout);

        return 0;
}
80
81 static int parse_argv(int argc, char *argv[]) {
82
83 enum {
84 ARG_PRIVATE_NETWORK = 0x100,
85 ARG_UUID,
86 ARG_READ_ONLY
87 };
88
89 static const struct option options[] = {
90 { "help", no_argument, NULL, 'h' },
91 { "directory", required_argument, NULL, 'D' },
92 { "user", required_argument, NULL, 'u' },
93 { "controllers", required_argument, NULL, 'C' },
94 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
95 { "boot", no_argument, NULL, 'b' },
96 { "uuid", required_argument, NULL, ARG_UUID },
97 { "read-only", no_argument, NULL, ARG_READ_ONLY },
98 { NULL, 0, NULL, 0 }
99 };
100
101 int c;
102
103 assert(argc >= 0);
104 assert(argv);
105
106 while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
107
108 switch (c) {
109
110 case 'h':
111 help();
112 return 0;
113
114 case 'D':
115 free(arg_directory);
116 arg_directory = canonicalize_file_name(optarg);
117 if (!arg_directory) {
118 log_error("Failed to canonicalize root directory.");
119 return -ENOMEM;
120 }
121
122 break;
123
124 case 'u':
125 free(arg_user);
126 if (!(arg_user = strdup(optarg))) {
127 log_error("Failed to duplicate user name.");
128 return -ENOMEM;
129 }
130
131 break;
132
133 case 'C':
134 strv_free(arg_controllers);
135 arg_controllers = strv_split(optarg, ",");
136 if (!arg_controllers) {
137 log_error("Failed to split controllers list.");
138 return -ENOMEM;
139 }
140 strv_uniq(arg_controllers);
141
142 break;
143
144 case ARG_PRIVATE_NETWORK:
145 arg_private_network = true;
146 break;
147
148 case 'b':
149 arg_boot = true;
150 break;
151
152 case ARG_UUID:
153 arg_uuid = optarg;
154 break;
155
156 case ARG_READ_ONLY:
157 arg_read_only = true;
158 break;
159
160 case '?':
161 return -EINVAL;
162
163 default:
164 log_error("Unknown option code %c", c);
165 return -EINVAL;
166 }
167 }
168
169 return 1;
170 }
171
/* Establishes the API file systems listed in the table below
 * underneath the directory 'dest'.
 *
 * All entries are attempted even if an earlier one failed, except on
 * OOM where processing stops. Returns 0 on success, otherwise the
 * first error that was encountered (negative errno-style). Mount
 * failures of entries not marked fatal are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        const MountPoint *m;
        int first_error = 0;

        for (m = mount_table; m < mount_table + ELEMENTSOF(mount_table); m++) {
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (first_error == 0)
                                first_error = q;

                        continue;
                }

                mkdir_p(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0) {

                        /* Only entries flagged as fatal are reported */
                        if (m->fatal) {
                                log_error("mount(%s) failed: %m", target);

                                if (first_error == 0)
                                        first_error = -errno;
                        }
                }

                free(target);
        }

        return first_error;
}
245
/* Bind mounts 'what' onto 'where' and, if that worked, remounts it
 * read-only. Failures are ignored on purpose. */
static void bind_mount_timezone_file(const char *what, const char *where) {
        if (mount(what, where, "bind", MS_BIND, NULL) < 0)
                return;

        mount(what, where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/* Makes the host's /etc/localtime and /etc/timezone visible
 * (read-only) below 'dest', on a best-effort basis.
 *
 * Returns 0, or -ENOMEM if a path could not be allocated. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bind_mount_timezone_file("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bind_mount_timezone_file("/etc/timezone", target);
        free(target);

        return 0;
}
274
275 static int setup_resolv_conf(const char *dest) {
276 char *where;
277
278 assert(dest);
279
280 if (arg_private_network)
281 return 0;
282
283 /* Fix resolv.conf, if possible */
284 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
285 log_error("Out of memory");
286 return -ENOMEM;
287 }
288
289 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
290 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
291
292 free(where);
293
294 return 0;
295 }
296
297 static int copy_devnodes(const char *dest) {
298
299 static const char devnodes[] =
300 "null\0"
301 "zero\0"
302 "full\0"
303 "random\0"
304 "urandom\0"
305 "tty\0"
306 "ptmx\0"
307 "rtc0\0";
308
309 const char *d;
310 int r = 0;
311 mode_t u;
312
313 assert(dest);
314
315 u = umask(0000);
316
317 NULSTR_FOREACH(d, devnodes) {
318 struct stat st;
319 char *from = NULL, *to = NULL;
320
321 asprintf(&from, "/dev/%s", d);
322 asprintf(&to, "%s/dev/%s", dest, d);
323
324 if (!from || !to) {
325 log_error("Failed to allocate devnode path");
326
327 free(from);
328 free(to);
329
330 from = to = NULL;
331
332 if (r == 0)
333 r = -ENOMEM;
334
335 break;
336 }
337
338 if (stat(from, &st) < 0) {
339
340 if (errno != ENOENT) {
341 log_error("Failed to stat %s: %m", from);
342 if (r == 0)
343 r = -errno;
344 }
345
346 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
347
348 log_error("%s is not a char or block device, cannot copy.", from);
349 if (r == 0)
350 r = -EIO;
351
352 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
353
354 log_error("mknod(%s) failed: %m", dest);
355 if (r == 0)
356 r = -errno;
357 }
358
359 free(from);
360 free(to);
361 }
362
363 umask(u);
364
365 return r;
366 }
367
/* Creates <dest>/dev/console and bind mounts the TTY 'console' over
 * it, after resetting the TTY's access mode to 0600 root:root.
 *
 * Returns a negative errno-style value on failure, otherwise the
 * (non-negative) result of chmod_and_chown(). */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
427
/* Provides a FIFO as <dest>/proc/kmsg.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted to
 * <dest>/proc/kmsg, an fd to it is sent over 'kmsg_socket' and
 * finally the /dev/kmsg name is removed again.
 *
 * Returns a negative errno-style value on failure, otherwise the
 * (non-negative) result of chmod_and_chown(). */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* We create the kmsg FIFO as /dev/kmsg, but immediately
         * delete it after bind mounting it to /proc/kmsg. While FIFOs
         * on the reading side behave very similar to /proc/kmsg,
         * their writing side behaves differently from /dev/kmsg in
         * that writing blocks when nothing is reading. In order to
         * avoid any problems with containers deadlocking due to this
         * we simply make /dev/kmsg unavailable to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that consists of nothing but a single
         * SCM_RIGHTS control message carrying the FIFO fd. */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Store away the fd in the socket, so that it stays open as
         * long as we run the child */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
523
524 static int setup_hostname(void) {
525 char *hn;
526 int r = 0;
527
528 hn = path_get_file_name(arg_directory);
529 if (hn) {
530 hn = strdup(hn);
531 if (!hn)
532 return -ENOMEM;
533
534 hostname_cleanup(hn);
535
536 if (!isempty(hn))
537 if (sethostname(hn, strlen(hn)) < 0)
538 r = -errno;
539
540 free(hn);
541 }
542
543 return r;
544 }
545
/* Drops every capability from 0 up to cap_last_cap() from the
 * bounding set, except for those listed in the table below.
 *
 * Returns 0 on success, -errno if a PR_CAPBSET_DROP call failed. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned i;

                /* Is this one on the list of capabilities to keep? */
                for (i = 0; i < ELEMENTSOF(retain) && !keep; i++)
                        keep = retain[i] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
591
/* Checks whether 'path' looks like the root of an OS tree, using the
 * existence of <path>/bin/sh as the indicator.
 *
 * Returns 1 if it exists, 0 if not, -ENOMEM on allocation failure. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
605
/* Shovels data between our stdin/stdout and the pty 'master' until a
 * signal from 'mask' other than SIGWINCH arrives.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched edge-triggered via epoll; the signals in 'mask' are read
 * through a signalfd. SIGWINCH is handled by copying our terminal's
 * window size to the master.
 *
 * Returns 0 if terminated by a signal, a negative errno-style value
 * on failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r = 0;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to register fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First, note down which fds became ready; the fds
                 * are edge-triggered, so we have to remember this
                 * until we hit EAGAIN. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal from the mask ends the loop */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Then, move data for as long as progress is possible */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF: nothing more to read. Without
                                         * clearing the flag this loop would
                                         * spin forever on an empty buffer and
                                         * never get back to epoll_wait(). */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, see above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
808
809 int main(int argc, char *argv[]) {
810 pid_t pid = 0;
811 int r = EXIT_FAILURE, k;
812 char *oldcg = NULL, *newcg = NULL;
813 char **controller = NULL;
814 int master = -1;
815 const char *console = NULL;
816 struct termios saved_attr, raw_attr;
817 sigset_t mask;
818 bool saved_attr_valid = false;
819 struct winsize ws;
820 int kmsg_socket_pair[2] = { -1, -1 };
821
822 log_parse_environment();
823 log_open();
824
825 if ((r = parse_argv(argc, argv)) <= 0)
826 goto finish;
827
828 if (arg_directory) {
829 char *p;
830
831 p = path_make_absolute_cwd(arg_directory);
832 free(arg_directory);
833 arg_directory = p;
834 } else
835 arg_directory = get_current_dir_name();
836
837 if (!arg_directory) {
838 log_error("Failed to determine path");
839 goto finish;
840 }
841
842 path_kill_slashes(arg_directory);
843
844 if (geteuid() != 0) {
845 log_error("Need to be root.");
846 goto finish;
847 }
848
849 if (sd_booted() <= 0) {
850 log_error("Not running on a systemd system.");
851 goto finish;
852 }
853
854 if (path_equal(arg_directory, "/")) {
855 log_error("Spawning container on root directory not supported.");
856 goto finish;
857 }
858
859 if (is_os_tree(arg_directory) <= 0) {
860 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
861 goto finish;
862 }
863
864 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
865 log_error("Failed to determine current cgroup: %s", strerror(-k));
866 goto finish;
867 }
868
869 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
870 log_error("Failed to allocate cgroup path.");
871 goto finish;
872 }
873
874 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
875 if (k < 0) {
876 log_error("Failed to create cgroup: %s", strerror(-k));
877 goto finish;
878 }
879
880 STRV_FOREACH(controller,arg_controllers) {
881 k = cg_create_and_attach(*controller, newcg, 0);
882 if (k < 0)
883 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
884 }
885
886 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
887 log_error("Failed to acquire pseudo tty: %m");
888 goto finish;
889 }
890
891 if (!(console = ptsname(master))) {
892 log_error("Failed to determine tty name: %m");
893 goto finish;
894 }
895
896 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
897
898 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
899 ioctl(master, TIOCSWINSZ, &ws);
900
901 if (unlockpt(master) < 0) {
902 log_error("Failed to unlock tty: %m");
903 goto finish;
904 }
905
906 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
907 log_error("Failed to get terminal attributes: %m");
908 goto finish;
909 }
910
911 saved_attr_valid = true;
912
913 raw_attr = saved_attr;
914 cfmakeraw(&raw_attr);
915 raw_attr.c_lflag &= ~ECHO;
916
917 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
918 log_error("Failed to set terminal attributes: %m");
919 goto finish;
920 }
921
922 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
923 log_error("Failed to create kmsg socket pair");
924 goto finish;
925 }
926
927 assert_se(sigemptyset(&mask) == 0);
928 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
929 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
930
931 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
932 if (pid < 0) {
933 if (errno == EINVAL)
934 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
935 else
936 log_error("clone() failed: %m");
937
938 goto finish;
939 }
940
941 if (pid == 0) {
942 /* child */
943
944 const char *home = NULL;
945 uid_t uid = (uid_t) -1;
946 gid_t gid = (gid_t) -1;
947 const char *envp[] = {
948 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
949 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
950 NULL, /* TERM */
951 NULL, /* HOME */
952 NULL, /* USER */
953 NULL, /* LOGNAME */
954 NULL, /* container_uuid */
955 NULL
956 };
957
958 envp[2] = strv_find_prefix(environ, "TERM=");
959
960 close_nointr_nofail(master);
961
962 close_nointr(STDIN_FILENO);
963 close_nointr(STDOUT_FILENO);
964 close_nointr(STDERR_FILENO);
965
966 close_all_fds(&kmsg_socket_pair[1], 1);
967
968 reset_all_signal_handlers();
969
970 assert_se(sigemptyset(&mask) == 0);
971 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
972
973 if (setsid() < 0)
974 goto child_fail;
975
976 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
977 goto child_fail;
978
979 /* Mark / as private, in case somebody marked it shared */
980 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
981 goto child_fail;
982
983 /* Turn directory into bind mount */
984 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
985 log_error("Failed to make bind mount.");
986 goto child_fail;
987 }
988
989 if (arg_read_only)
990 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
991 log_error("Failed to make read-only.");
992 goto child_fail;
993 }
994
995 if (mount_all(arg_directory) < 0)
996 goto child_fail;
997
998 if (copy_devnodes(arg_directory) < 0)
999 goto child_fail;
1000
1001 if (setup_dev_console(arg_directory, console) < 0)
1002 goto child_fail;
1003
1004 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1005 goto child_fail;
1006
1007 close_nointr_nofail(kmsg_socket_pair[1]);
1008
1009 if (setup_timezone(arg_directory) < 0)
1010 goto child_fail;
1011
1012 if (setup_resolv_conf(arg_directory) < 0)
1013 goto child_fail;
1014
1015 if (chdir(arg_directory) < 0) {
1016 log_error("chdir(%s) failed: %m", arg_directory);
1017 goto child_fail;
1018 }
1019
1020 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1021 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1022 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1023 goto child_fail;
1024
1025 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1026 log_error("mount(MS_BIND) failed: %m");
1027 goto child_fail;
1028 }
1029
1030 if (chroot(".") < 0) {
1031 log_error("chroot() failed: %m");
1032 goto child_fail;
1033 }
1034
1035 if (chdir("/") < 0) {
1036 log_error("chdir() failed: %m");
1037 goto child_fail;
1038 }
1039
1040 umask(0022);
1041
1042 loopback_setup();
1043
1044 if (drop_capabilities() < 0)
1045 goto child_fail;
1046
1047 if (arg_user) {
1048
1049 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1050 log_error("get_user_creds() failed: %m");
1051 goto child_fail;
1052 }
1053
1054 if (mkdir_parents(home, 0775) < 0) {
1055 log_error("mkdir_parents() failed: %m");
1056 goto child_fail;
1057 }
1058
1059 if (safe_mkdir(home, 0775, uid, gid) < 0) {
1060 log_error("safe_mkdir() failed: %m");
1061 goto child_fail;
1062 }
1063
1064 if (initgroups((const char*)arg_user, gid) < 0) {
1065 log_error("initgroups() failed: %m");
1066 goto child_fail;
1067 }
1068
1069 if (setresgid(gid, gid, gid) < 0) {
1070 log_error("setregid() failed: %m");
1071 goto child_fail;
1072 }
1073
1074 if (setresuid(uid, uid, uid) < 0) {
1075 log_error("setreuid() failed: %m");
1076 goto child_fail;
1077 }
1078 }
1079
1080 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1081 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1082 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1083 log_error("Out of memory");
1084 goto child_fail;
1085 }
1086
1087 if (arg_uuid) {
1088 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1089 log_error("Out of memory");
1090 goto child_fail;
1091 }
1092 }
1093
1094 setup_hostname();
1095
1096 if (arg_boot) {
1097 char **a;
1098 size_t l;
1099
1100 /* Automatically search for the init system */
1101
1102 l = 1 + argc - optind;
1103 a = newa(char*, l + 1);
1104 memcpy(a + 1, argv + optind, l * sizeof(char*));
1105
1106 a[0] = (char*) "/usr/lib/systemd/systemd";
1107 execve(a[0], a, (char**) envp);
1108
1109 a[0] = (char*) "/lib/systemd/systemd";
1110 execve(a[0], a, (char**) envp);
1111
1112 a[0] = (char*) "/sbin/init";
1113 execve(a[0], a, (char**) envp);
1114 } else if (argc > optind)
1115 execvpe(argv[optind], argv + optind, (char**) envp);
1116 else {
1117 chdir(home ? home : "/root");
1118 execle("/bin/bash", "-bash", NULL, (char**) envp);
1119 }
1120
1121 log_error("execv() failed: %m");
1122
1123 child_fail:
1124 _exit(EXIT_FAILURE);
1125 }
1126
1127 if (process_pty(master, &mask) < 0)
1128 goto finish;
1129
1130 if (saved_attr_valid) {
1131 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1132 saved_attr_valid = false;
1133 }
1134
1135 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1136
1137 if (r < 0)
1138 r = EXIT_FAILURE;
1139
1140 finish:
1141 if (saved_attr_valid)
1142 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1143
1144 if (master >= 0)
1145 close_nointr_nofail(master);
1146
1147 close_pipe(kmsg_socket_pair);
1148
1149 if (oldcg)
1150 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1151
1152 if (newcg)
1153 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1154
1155 free(arg_directory);
1156 strv_free(arg_controllers);
1157 free(oldcg);
1158 free(newcg);
1159
1160 return r;
1161 }