1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
/* Journal linking mode stored in arg_link_journal. The rest of this
 * file uses the values LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST. */
57 typedef enum LinkJournal
{
/* Command line settings. The visible assignments happen in parse_argv()
 * (and, for arg_directory, also in main()); the initializers below are
 * the defaults used when an option is not given. */
/* Container root directory (-D); main() makes it absolute. */
64 static char *arg_directory
= NULL
;
/* User name or uid to run as (-u); main() falls back to root for
 * USER/LOGNAME when this is NULL. */
65 static char *arg_user
= NULL
;
/* Extra cgroup controllers (-C), split on commas and de-duplicated. */
66 static char **arg_controllers
= NULL
;
/* Value exported to the payload as container_uuid. */
67 static char *arg_uuid
= NULL
;
/* When true, CLONE_NEWNET is added to the clone() flags in main(). */
68 static bool arg_private_network
= false;
69 static bool arg_read_only
= false;
/* Presumably set by -b/--boot; the assignment is not visible here. */
70 static bool arg_boot
= false;
71 static LinkJournal arg_link_journal
= LINK_AUTO
;
/* Bitmask of capabilities the container keeps, one bit per CAP_* number.
 * drop_capabilities() hands the complement of this mask to
 * capability_bounding_set_drop(), and parse_argv() ORs in one more bit
 * for every name given with --capability. */
72 static uint64_t arg_retain
=
74 (1ULL << CAP_DAC_OVERRIDE
) |
75 (1ULL << CAP_DAC_READ_SEARCH
) |
76 (1ULL << CAP_FOWNER
) |
77 (1ULL << CAP_FSETID
) |
78 (1ULL << CAP_IPC_OWNER
) |
81 (1ULL << CAP_LINUX_IMMUTABLE
) |
82 (1ULL << CAP_NET_BIND_SERVICE
) |
83 (1ULL << CAP_NET_BROADCAST
) |
84 (1ULL << CAP_NET_RAW
) |
85 (1ULL << CAP_SETGID
) |
86 (1ULL << CAP_SETFCAP
) |
87 (1ULL << CAP_SETPCAP
) |
88 (1ULL << CAP_SETUID
) |
89 (1ULL << CAP_SYS_ADMIN
) |
90 (1ULL << CAP_SYS_CHROOT
) |
91 (1ULL << CAP_SYS_NICE
) |
92 (1ULL << CAP_SYS_PTRACE
) |
93 (1ULL << CAP_SYS_TTY_CONFIG
) |
94 (1ULL << CAP_SYS_RESOURCE
);
/* Prints the usage text to stdout. The leading %s of the format string
 * is filled with program_invocation_short_name. */
96 static int help(void) {
98 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
99 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
100 " -h --help Show this help\n"
101 " -D --directory=NAME Root directory for the container\n"
102 " -b --boot Boot up full system (i.e. invoke init)\n"
103 " -u --user=USER Run the command under specified user or uid\n"
104 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
105 " --uuid=UUID Set a specific machine UUID for the container\n"
106 " --private-network Disable network in container\n"
107 " --read-only Mount the root directory read-only\n"
108 " --capability=CAP In addition to the default, retain specified capability\n"
109 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
/* NOTE(review): the text below says host, but the assignment in
 * parse_argv() that presumably handles -j stores LINK_GUEST. Confirm
 * which of the two is intended. */
110 " -j Equivalent to --link-journal=host\n",
111 program_invocation_short_name
);
/* Parses the command line with getopt_long() and stores the results in
 * the arg_* globals. main() tests the return value with <= 0 before it
 * goes on. */
116 static int parse_argv(int argc
, char *argv
[]) {
/* Options without a short form get codes from 0x100 upwards, which
 * cannot collide with any single character option code. */
119 ARG_PRIVATE_NETWORK
= 0x100,
126 static const struct option options
[] = {
127 { "help", no_argument
, NULL
, 'h' },
128 { "directory", required_argument
, NULL
, 'D' },
129 { "user", required_argument
, NULL
, 'u' },
130 { "controllers", required_argument
, NULL
, 'C' },
131 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
132 { "boot", no_argument
, NULL
, 'b' },
133 { "uuid", required_argument
, NULL
, ARG_UUID
},
134 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
135 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
136 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
/* The leading + in the option string makes getopt_long() stop at the
 * first non-option argument, so everything after PATH is left alone
 * and later handed to the payload via argv + optind in main(). */
145 while ((c
= getopt_long(argc
, argv
, "+hD:u:C:bj", options
, NULL
)) >= 0) {
/* canonicalize_file_name() yields a newly allocated absolute path, or
 * NULL on failure. */
155 arg_directory
= canonicalize_file_name(optarg
);
156 if (!arg_directory
) {
157 log_error("Failed to canonicalize root directory.");
165 if (!(arg_user
= strdup(optarg
))) {
166 log_error("Failed to duplicate user name.");
/* A repeated controller option replaces the earlier list; the new
 * list is split on commas and de-duplicated. */
173 strv_free(arg_controllers
);
174 arg_controllers
= strv_split(optarg
, ",");
175 if (!arg_controllers
) {
176 log_error("Failed to split controllers list.");
179 strv_uniq(arg_controllers
);
183 case ARG_PRIVATE_NETWORK
:
184 arg_private_network
= true;
196 arg_read_only
= true;
/* Every comma-separated word is copied with strndup(), resolved with
 * cap_from_name() and its bit is ORed into arg_retain. */
199 case ARG_CAPABILITY
: {
203 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
207 t
= strndup(word
, length
);
209 log_error("Out of memory.");
213 if (cap_from_name(t
, &cap
) < 0) {
214 log_error("Failed to parse capability %s.", t
);
220 arg_retain
|= 1ULL << (uint64_t) cap
;
/* Presumably the -j handler (its case label is not visible): selects
 * LINK_GUEST. NOTE(review): help() describes -j as the host mode. */
227 arg_link_journal
= LINK_GUEST
;
/* Accepted mode strings: auto, no, guest, host. Anything else is
 * reported as a parse failure. */
230 case ARG_LINK_JOURNAL
:
231 if (streq(optarg
, "auto"))
232 arg_link_journal
= LINK_AUTO
;
233 else if (streq(optarg
, "no"))
234 arg_link_journal
= LINK_NO
;
235 else if (streq(optarg
, "guest"))
236 arg_link_journal
= LINK_GUEST
;
237 else if (streq(optarg
, "host"))
238 arg_link_journal
= LINK_HOST
;
240 log_error("Failed to parse link journal mode %s", optarg
);
250 log_error("Unknown option code %c", c
);
/* Mounts every entry of mount_table below dest. For each entry the
 * target path is built as dest/where, tested with path_is_mount_point(),
 * created with mkdir_p_label() and then mounted. A failing mount() is
 * only reported when the fatal member of the entry is true. */
258 static int mount_all(const char *dest
) {
260 typedef struct MountPoint
{
/* Initializers appear to be in the order what, where, type, options,
 * flags, fatal (the struct body is not visible; the members what,
 * where, flags, options and fatal are the ones read below). Read-only
 * bind mounts are listed twice: first the bind, then a remount that
 * adds MS_RDONLY. */
269 static const MountPoint mount_table
[] = {
270 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
271 { "/proc/sys", "/proc/sys", "bind", NULL
, MS_BIND
, true }, /* Bind mount first */
272 { "/proc/sys", "/proc/sys", "bind", NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true }, /* Then, make it r/o */
273 { "/sys", "/sys", "bind", NULL
, MS_BIND
, true }, /* Bind mount first */
274 { "/sys", "/sys", "bind", NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true }, /* Then, make it r/o */
275 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true },
276 { "/dev/pts", "/dev/pts", "bind", NULL
, MS_BIND
, true },
277 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true },
/* The SELinux entries are the only ones marked non-fatal. */
279 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL
, MS_BIND
, false }, /* Bind mount first */
280 { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, false }, /* Then, make it r/o */
288 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
291 if (asprintf(&where
, "%s/%s", dest
, mount_table
[k
].where
) < 0) {
292 log_error("Out of memory");
/* t carries a negative errno-style code on failure, hence the
 * strerror(-t) in the message below. */
300 t
= path_is_mount_point(where
, false);
302 log_error("Failed to detect whether %s is a mount point: %s", where
, strerror(-t
));
/* The result of mkdir_p_label() is not examined on this line. */
311 mkdir_p_label(where
, 0755);
313 if (mount(mount_table
[k
].what
,
316 mount_table
[k
].flags
,
317 mount_table
[k
].options
) < 0 &&
318 mount_table
[k
].fatal
) {
320 log_error("mount(%s) failed: %m", where
);
/* Bind mounts the host files /etc/localtime and /etc/timezone onto the
 * same paths below dest. When a bind mount succeeds it is remounted
 * with MS_RDONLY; the result of that remount is not checked. */
332 static int setup_timezone(const char *dest
) {
337 /* Fix the timezone, if possible */
338 if (asprintf(&where
, "%s/etc/localtime", dest
) < 0) {
339 log_error("Out of memory");
343 if (mount("/etc/localtime", where
, "bind", MS_BIND
, NULL
) >= 0)
344 mount("/etc/localtime", where
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
/* Same two-step procedure for /etc/timezone. */
348 if (asprintf(&where
, "%s/etc/timezone", dest
) < 0) {
349 log_error("Out of memory");
353 if (mount("/etc/timezone", where
, "bind", MS_BIND
, NULL
) >= 0)
354 mount("/etc/timezone", where
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
/* Bind mounts the host /etc/resolv.conf onto dest/etc/resolv.conf and,
 * if that works, remounts it read-only without checking the result. */
361 static int setup_resolv_conf(const char *dest
) {
/* Presumably skips the whole step for private-network containers; the
 * body of this branch is not visible. */
366 if (arg_private_network
)
369 /* Fix resolv.conf, if possible */
370 if (asprintf(&where
, "%s/etc/resolv.conf", dest
) < 0) {
371 log_error("Out of memory");
375 if (mount("/etc/resolv.conf", where
, "bind", MS_BIND
, NULL
) >= 0)
376 mount("/etc/resolv.conf", where
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
/* Recreates host device nodes inside the container. For every name in
 * the NUL-separated devnodes list it stats /dev/NAME on the host and,
 * when that is a character or block device, calls mknod() for
 * dest/dev/NAME with the st_mode and st_rdev of the host node. */
383 static int copy_devnodes(const char *dest
) {
385 static const char devnodes
[] =
403 NULSTR_FOREACH(d
, devnodes
) {
405 char *from
= NULL
, *to
= NULL
;
/* from is the host path, to is the path below dest. */
407 asprintf(&from
, "/dev/%s", d
);
408 asprintf(&to
, "%s/dev/%s", dest
, d
);
411 log_error("Failed to allocate devnode path");
424 if (stat(from
, &st
) < 0) {
/* Only stat() failures other than ENOENT are logged here. */
426 if (errno
!= ENOENT
) {
427 log_error("Failed to stat %s: %m", from
);
432 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
434 log_error("%s is not a char or block device, cannot copy.", from
);
438 } else if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
/* NOTE(review): the message prints dest, although the path that was
 * handed to mknod() is to. */
440 log_error("mknod(%s) failed: %m", dest
);
/* Makes the tty named by console available as dest/dev/console.
 * Visible steps: stat console and require a character device, call
 * chmod_and_chown(console, 0600, 0, 0), create dest/dev/console with
 * mknod() using the file type bits of the tty combined with mode 0600
 * (the expression (st_mode & ~07777) | 0600), then bind mount console
 * over that node. */
454 static int setup_dev_console(const char *dest
, const char *console
) {
465 if (stat(console
, &st
) < 0) {
466 log_error("Failed to stat %s: %m", console
);
470 } else if (!S_ISCHR(st
.st_mode
)) {
471 log_error("/dev/console is not a char device.");
/* r carries a negative errno-style code on failure, hence the
 * strerror(-r) in the message below. */
476 r
= chmod_and_chown(console
, 0600, 0, 0);
478 log_error("Failed to correct access mode for TTY: %s", strerror(-r
));
482 if (asprintf(&to
, "%s/dev/console", dest
) < 0) {
483 log_error("Out of memory");
488 /* We need to bind mount the right tty to /dev/console since
489 * ptys can only exist on pts file systems. To have something
490 * to bind mount things on we create a device node first, that
491 * has the right major/minor (note that the major minor
492 * doesn't actually matter here, since we mount it over
495 if (mknod(to
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
496 log_error("mknod() for /dev/console failed: %m");
501 if (mount(console
, to
, "bind", MS_BIND
, NULL
) < 0) {
502 log_error("Bind mount for /dev/console failed: %m");
/* Provides a kmsg replacement for the container. Creates a FIFO at
 * dest/dev/kmsg, bind mounts it onto dest/proc/kmsg, opens the FIFO and
 * sends the open descriptor over kmsg_socket as SCM_RIGHTS ancillary
 * data. kmsg_socket must be a valid descriptor (asserted). */
514 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
515 char *from
= NULL
, *to
= NULL
;
/* The two members below give the control buffer both cmsghdr layout
 * and room for exactly one int (the descriptor). */
519 struct cmsghdr cmsghdr
;
520 uint8_t buf
[CMSG_SPACE(sizeof(int))];
523 struct cmsghdr
*cmsg
;
526 assert(kmsg_socket
>= 0);
530 /* We create the kmsg FIFO as /dev/kmsg, but immediately
531 * delete it after bind mounting it to /proc/kmsg. While FIFOs
532 * on the reading side behave very similarly to /proc/kmsg,
533 * their writing side behaves differently from /dev/kmsg in
534 * that writing blocks when nothing is reading. In order to
535 * avoid any problems with containers deadlocking due to this
536 * we simply make /dev/kmsg unavailable to the container. */
537 if (asprintf(&from
, "%s/dev/kmsg", dest
) < 0) {
538 log_error("Out of memory");
543 if (asprintf(&to
, "%s/proc/kmsg", dest
) < 0) {
544 log_error("Out of memory");
549 if (mkfifo(from
, 0600) < 0) {
550 log_error("mkfifo() for /dev/kmsg failed: %m");
555 r
= chmod_and_chown(from
, 0600, 0, 0);
557 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r
));
561 if (mount(from
, to
, "bind", MS_BIND
, NULL
) < 0) {
562 log_error("Bind mount for /proc/kmsg failed: %m");
/* O_RDWR together with O_NDELAY: the open is non-blocking. */
567 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
569 log_error("Failed to open fifo: %m");
/* Build a single SCM_RIGHTS control message that carries fd. */
577 mh
.msg_control
= &control
;
578 mh
.msg_controllen
= sizeof(control
);
580 cmsg
= CMSG_FIRSTHDR(&mh
);
581 cmsg
->cmsg_level
= SOL_SOCKET
;
582 cmsg
->cmsg_type
= SCM_RIGHTS
;
583 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
584 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
/* Shrink the advertised control length to the one message in use. */
586 mh
.msg_controllen
= cmsg
->cmsg_len
;
588 /* Store away the fd in the socket, so that it stays open as
589 * long as we run the child */
590 k
= sendmsg(kmsg_socket
, &mh
, MSG_DONTWAIT
|MSG_NOSIGNAL
);
/* The local descriptor is closed right after the send attempt; k is
 * evaluated afterwards. */
591 close_nointr_nofail(fd
);
594 log_error("Failed to send FIFO fd: %m");
599 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname from the last path component of arg_directory: the
 * name comes from path_get_file_name(), is passed through
 * hostname_cleanup() and is then applied with sethostname(). */
610 static int setup_hostname(void) {
614 hn
= path_get_file_name(arg_directory
);
620 hostname_cleanup(hn
);
623 if (sethostname(hn
, strlen(hn
)) < 0)
/* Connects the journal directories of host and container according to
 * arg_link_journal. The machine ID is read from
 * directory/etc/machine-id; after that p is the host path
 * /var/log/journal/ID and q is the same path below directory.
 * In LINK_GUEST mode p is made a symlink to q. Otherwise the visible
 * code ends with a bind mount of p onto q. */
632 static int setup_journal(const char *directory
) {
633 sd_id128_t machine_id
;
634 char *p
= NULL
, *b
= NULL
, *l
, *q
= NULL
, *d
= NULL
;
637 if (arg_link_journal
== LINK_NO
)
/* At this point p is the path of the machine-id file. */
640 p
= strappend(directory
, "/etc/machine-id");
642 log_error("Out of memory");
647 r
= read_one_line_file(p
, &b
);
/* A missing machine-id file is handled separately in auto mode. */
648 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
) {
652 log_error("Failed to read machine ID: %s", strerror(-r
));
657 if (isempty(l
) && arg_link_journal
== LINK_AUTO
) {
662 /* Verify validity */
663 r
= sd_id128_from_string(l
, &machine_id
);
665 log_error("Failed to parse machine ID: %s", strerror(-r
));
/* From here on p is the host journal path and q the guest one. */
670 p
= strappend("/var/log/journal/", l
);
671 q
= strjoin(directory
, "/var/log/journal/", l
, NULL
);
673 log_error("Out of memory");
/* An existing mount on either path is an error unless the mode is
 * auto. */
678 if (path_is_mount_point(p
, false) > 0 ||
679 path_is_mount_point(q
, false) > 0) {
680 if (arg_link_journal
!= LINK_AUTO
) {
681 log_error("Journal already a mount point, refusing.");
/* Inspect what currently exists at the host path p. */
690 r
= readlink_and_make_absolute(p
, &d
);
692 if ((arg_link_journal
== LINK_GUEST
||
693 arg_link_journal
== LINK_AUTO
) &&
703 log_error("Failed to remove symlink %s: %m", p
);
/* readlink() reports EINVAL for a path that is not a symlink. */
707 } else if (r
== -EINVAL
) {
709 if (arg_link_journal
== LINK_GUEST
&&
712 if (errno
== ENOTDIR
)
713 log_error("%s already exists and is neither symlink nor directory.", p
);
715 log_error("Failed to remove %s: %m", p
);
721 } else if (r
!= -ENOENT
) {
/* NOTE(review): the failure code is in r while %m formats errno;
 * confirm the helper leaves errno set, or use strerror(-r) as the
 * machine ID messages above do. */
722 log_error("readlink(%s) failed: %m", p
);
/* Guest mode: the host path p becomes a symlink pointing at q. */
726 if (arg_link_journal
== LINK_GUEST
) {
728 if (symlink(q
, p
) < 0) {
729 log_error("Failed to symlink %s to %s: %m", q
, p
);
/* Host mode creates p on demand; the other remaining mode only
 * continues when p can be accessed. */
740 if (arg_link_journal
== LINK_HOST
) {
741 r
= mkdir_p(p
, 0755);
/* NOTE(review): same r versus %m question as for readlink above. */
743 log_error("Failed to create %s: %m", p
);
747 } else if (access(p
, F_OK
) < 0) {
/* The guest directory must not already contain entries. */
752 if (dir_is_empty(q
) == 0) {
753 log_error("%s not empty.", q
);
758 r
= mkdir_p(q
, 0755);
760 log_error("Failed to create %s: %m", q
);
764 if (mount(p
, q
, "bind", MS_BIND
, NULL
) < 0) {
765 log_error("Failed to bind mount journal from host into guest: %m");
/* Passes the complement of arg_retain to capability_bounding_set_drop()
 * and returns its result, i.e. everything that is not marked for
 * retention is handed over for dropping. The meaning of the second
 * argument (false) is defined by that helper and not visible here. */
781 static int drop_capabilities(void) {
782 return capability_bounding_set_drop(~arg_retain
, false);
/* Heuristic test whether path is the root of an OS tree, based on
 * path/bin/sh. Returns 0 when the check result r is negative and 1
 * otherwise; the return value for a failed asprintf() is not visible
 * here. */
785 static int is_os_tree(const char *path
) {
788 /* We use /bin/sh as flag file if something is an OS */
790 if (asprintf(&p
, "%s/bin/sh", path
) < 0)
796 return r
< 0 ? 0 : 1;
/* Forwards data between the calling terminal and the pty master.
 * Bytes read from stdin are buffered in in_buffer and written to
 * master; bytes read from master are buffered in out_buffer and written
 * to stdout. Readiness is tracked with epoll (edge-triggered for stdin,
 * stdout and master) and signals from mask arrive through a signalfd.
 * A SIGWINCH copies the window size of stdin to master. */
799 static int process_pty(int master
, sigset_t
*mask
) {
801 char in_buffer
[LINE_MAX
], out_buffer
[LINE_MAX
];
802 size_t in_buffer_full
= 0, out_buffer_full
= 0;
803 struct epoll_event stdin_ev
, stdout_ev
, master_ev
, signal_ev
;
/* Readiness flags: set from epoll events, cleared again when a read or
 * write reports EAGAIN, EPIPE, ECONNRESET or EIO. */
804 bool stdin_readable
= false, stdout_writable
= false, master_readable
= false, master_writable
= false;
805 int ep
= -1, signal_fd
= -1, r
;
807 fd_nonblock(STDIN_FILENO
, 1);
808 fd_nonblock(STDOUT_FILENO
, 1);
809 fd_nonblock(master
, 1);
811 if ((signal_fd
= signalfd(-1, mask
, SFD_NONBLOCK
|SFD_CLOEXEC
)) < 0) {
812 log_error("signalfd(): %m");
817 if ((ep
= epoll_create1(EPOLL_CLOEXEC
)) < 0) {
818 log_error("Failed to create epoll: %m");
824 stdin_ev
.events
= EPOLLIN
|EPOLLET
;
825 stdin_ev
.data
.fd
= STDIN_FILENO
;
828 stdout_ev
.events
= EPOLLOUT
|EPOLLET
;
829 stdout_ev
.data
.fd
= STDOUT_FILENO
;
832 master_ev
.events
= EPOLLIN
|EPOLLOUT
|EPOLLET
;
833 master_ev
.data
.fd
= master
;
/* The signalfd is the only descriptor registered without EPOLLET. */
836 signal_ev
.events
= EPOLLIN
;
837 signal_ev
.data
.fd
= signal_fd
;
839 if (epoll_ctl(ep
, EPOLL_CTL_ADD
, STDIN_FILENO
, &stdin_ev
) < 0 ||
840 epoll_ctl(ep
, EPOLL_CTL_ADD
, STDOUT_FILENO
, &stdout_ev
) < 0 ||
841 epoll_ctl(ep
, EPOLL_CTL_ADD
, master
, &master_ev
) < 0 ||
842 epoll_ctl(ep
, EPOLL_CTL_ADD
, signal_fd
, &signal_ev
) < 0) {
/* NOTE(review): regiser in the message below is a typo for register. */
843 log_error("Failed to regiser fds in epoll: %m");
849 struct epoll_event ev
[16];
853 if ((nfds
= epoll_wait(ep
, ev
, ELEMENTSOF(ev
), -1)) < 0) {
855 if (errno
== EINTR
|| errno
== EAGAIN
)
858 log_error("epoll_wait(): %m");
/* Translate the reported events into the readiness flags. EPOLLHUP
 * counts as ready so that the following read or write sees the
 * condition. */
865 for (i
= 0; i
< nfds
; i
++) {
866 if (ev
[i
].data
.fd
== STDIN_FILENO
) {
868 if (ev
[i
].events
& (EPOLLIN
|EPOLLHUP
))
869 stdin_readable
= true;
871 } else if (ev
[i
].data
.fd
== STDOUT_FILENO
) {
873 if (ev
[i
].events
& (EPOLLOUT
|EPOLLHUP
))
874 stdout_writable
= true;
876 } else if (ev
[i
].data
.fd
== master
) {
878 if (ev
[i
].events
& (EPOLLIN
|EPOLLHUP
))
879 master_readable
= true;
881 if (ev
[i
].events
& (EPOLLOUT
|EPOLLHUP
))
882 master_writable
= true;
884 } else if (ev
[i
].data
.fd
== signal_fd
) {
885 struct signalfd_siginfo sfsi
;
/* A signalfd read must deliver exactly one full siginfo record. */
888 if ((n
= read(signal_fd
, &sfsi
, sizeof(sfsi
))) != sizeof(sfsi
)) {
891 log_error("Failed to read from signalfd: invalid block size");
896 if (errno
!= EINTR
&& errno
!= EAGAIN
) {
897 log_error("Failed to read from signalfd: %m");
903 if (sfsi
.ssi_signo
== SIGWINCH
) {
906 /* The window size changed, let's forward that. */
907 if (ioctl(STDIN_FILENO
, TIOCGWINSZ
, &ws
) >= 0)
908 ioctl(master
, TIOCSWINSZ
, &ws
);
/* Keep transferring while some step can make progress: an empty buffer
 * whose source is readable, or a non-empty buffer whose sink is
 * writable. */
917 while ((stdin_readable
&& in_buffer_full
<= 0) ||
918 (master_writable
&& in_buffer_full
> 0) ||
919 (master_readable
&& out_buffer_full
<= 0) ||
920 (stdout_writable
&& out_buffer_full
> 0)) {
/* stdin to in_buffer: append behind the bytes already buffered. */
922 if (stdin_readable
&& in_buffer_full
< LINE_MAX
) {
924 if ((k
= read(STDIN_FILENO
, in_buffer
+ in_buffer_full
, LINE_MAX
- in_buffer_full
)) < 0) {
926 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
927 stdin_readable
= false;
929 log_error("read(): %m");
934 in_buffer_full
+= (size_t) k
;
/* in_buffer to master: after a partial write the unwritten tail is
 * moved to the front of the buffer. */
937 if (master_writable
&& in_buffer_full
> 0) {
939 if ((k
= write(master
, in_buffer
, in_buffer_full
)) < 0) {
941 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
942 master_writable
= false;
944 log_error("write(): %m");
950 assert(in_buffer_full
>= (size_t) k
);
951 memmove(in_buffer
, in_buffer
+ k
, in_buffer_full
- k
);
/* master to out_buffer, mirroring the stdin side. */
956 if (master_readable
&& out_buffer_full
< LINE_MAX
) {
958 if ((k
= read(master
, out_buffer
+ out_buffer_full
, LINE_MAX
- out_buffer_full
)) < 0) {
960 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
961 master_readable
= false;
963 log_error("read(): %m");
968 out_buffer_full
+= (size_t) k
;
/* out_buffer to stdout, mirroring the master side. */
971 if (stdout_writable
&& out_buffer_full
> 0) {
973 if ((k
= write(STDOUT_FILENO
, out_buffer
, out_buffer_full
)) < 0) {
975 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
976 stdout_writable
= false;
978 log_error("write(): %m");
984 assert(out_buffer_full
>= (size_t) k
);
985 memmove(out_buffer
, out_buffer
+ k
, out_buffer_full
- k
);
986 out_buffer_full
-= k
;
/* Cleanup of the two descriptors created in this function. */
994 close_nointr_nofail(ep
);
997 close_nointr_nofail(signal_fd
);
/* Entry point. Visible flow: parse the command line, make arg_directory
 * absolute, run sanity checks (effective uid 0, systemd host, directory
 * is not / and looks like an OS tree), create and enter the cgroup
 * OLDCG/nspawn-PID, allocate a pty master, switch the terminal to raw
 * mode, create the kmsg socket pair, block SIGCHLD, SIGWINCH, SIGTERM
 * and SIGINT and clone() into new namespaces. The code between the
 * clone() call and _exit() prepares the container file system and
 * execs the payload; after that the parent side runs process_pty(),
 * waits for the child, restores the terminal and cleans up the
 * cgroup. */
1002 int main(int argc
, char *argv
[]) {
1004 int r
= EXIT_FAILURE
, k
;
1005 char *oldcg
= NULL
, *newcg
= NULL
;
1006 char **controller
= NULL
;
1008 const char *console
= NULL
;
1009 struct termios saved_attr
, raw_attr
;
/* True while saved_attr holds settings that still have to be put back
 * on stdin. */
1011 bool saved_attr_valid
= false;
1013 int kmsg_socket_pair
[2] = { -1, -1 };
1015 log_parse_environment();
1018 if ((r
= parse_argv(argc
, argv
)) <= 0)
1021 if (arg_directory
) {
1024 p
= path_make_absolute_cwd(arg_directory
);
1025 free(arg_directory
);
/* Without -D the current working directory is used as root. */
1028 arg_directory
= get_current_dir_name();
1030 if (!arg_directory
) {
1031 log_error("Failed to determine path");
1035 path_kill_slashes(arg_directory
);
1037 if (geteuid() != 0) {
1038 log_error("Need to be root.");
1042 if (sd_booted() <= 0) {
1043 log_error("Not running on a systemd system.");
1047 if (path_equal(arg_directory
, "/")) {
1048 log_error("Spawning container on root directory not supported.");
1052 if (is_os_tree(arg_directory
) <= 0) {
1053 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory
);
/* The container gets its own cgroup below the current one, named after
 * the pid of this process. */
1057 if ((k
= cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER
, 0, &oldcg
)) < 0) {
1058 log_error("Failed to determine current cgroup: %s", strerror(-k
));
1062 if (asprintf(&newcg
, "%s/nspawn-%lu", oldcg
, (unsigned long) getpid()) < 0) {
1063 log_error("Failed to allocate cgroup path.");
1067 k
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, newcg
, 0);
1069 log_error("Failed to create cgroup: %s", strerror(-k
));
/* Failures for the extra controllers are reported with log_warning()
 * rather than log_error(). */
1073 STRV_FOREACH(controller
,arg_controllers
) {
1074 k
= cg_create_and_attach(*controller
, newcg
, 0);
1076 log_warning("Failed to create cgroup in controller %s: %s", *controller
, strerror(-k
));
1079 if ((master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
)) < 0) {
1080 log_error("Failed to acquire pseudo tty: %m");
1084 if (!(console
= ptsname(master
))) {
1085 log_error("Failed to determine tty name: %m");
1089 log_info("Spawning namespace container on %s (console is %s).", arg_directory
, console
);
/* Give the new pty the window size of the calling terminal. */
1091 if (ioctl(STDIN_FILENO
, TIOCGWINSZ
, &ws
) >= 0)
1092 ioctl(master
, TIOCSWINSZ
, &ws
);
1094 if (unlockpt(master
) < 0) {
1095 log_error("Failed to unlock tty: %m");
/* Save the terminal settings, then switch stdin to raw mode with echo
 * disabled. */
1099 if (tcgetattr(STDIN_FILENO
, &saved_attr
) < 0) {
1100 log_error("Failed to get terminal attributes: %m");
1104 saved_attr_valid
= true;
1106 raw_attr
= saved_attr
;
1107 cfmakeraw(&raw_attr
);
1108 raw_attr
.c_lflag
&= ~ECHO
;
1110 if (tcsetattr(STDIN_FILENO
, TCSANOW
, &raw_attr
) < 0) {
1111 log_error("Failed to set terminal attributes: %m");
1115 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
/* NOTE(review): unlike the neighbouring syscall failures this message
 * carries no %m. */
1116 log_error("Failed to create kmsg socket pair");
/* These signals are blocked so that process_pty() can receive them
 * through its signalfd. */
1120 assert_se(sigemptyset(&mask
) == 0);
1121 sigset_add_many(&mask
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1);
1122 assert_se(sigprocmask(SIG_BLOCK
, &mask
, NULL
) == 0);
/* Raw clone() system call with new IPC, mount, PID and UTS namespaces,
 * plus a network namespace when arg_private_network is set. */
1124 pid
= syscall(__NR_clone
, SIGCHLD
|CLONE_NEWIPC
|CLONE_NEWNS
|CLONE_NEWPID
|CLONE_NEWUTS
|(arg_private_network
? CLONE_NEWNET
: 0), NULL
);
1126 if (errno
== EINVAL
)
1127 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1129 log_error("clone() failed: %m");
1137 const char *home
= NULL
;
1138 uid_t uid
= (uid_t
) -1;
1139 gid_t gid
= (gid_t
) -1;
/* Environment for the payload. The assignments further down fill
 * index 2 (TERM), 3 (HOME), 4 (USER), 5 (LOGNAME) and 6
 * (container_uuid). */
1140 const char *envp
[] = {
1141 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1147 NULL
, /* container_uuid */
1151 envp
[2] = strv_find_prefix(environ
, "TERM=");
1153 close_nointr_nofail(master
);
/* The standard descriptors are closed here and reopened on the
 * container console further down. */
1155 close_nointr(STDIN_FILENO
);
1156 close_nointr(STDOUT_FILENO
);
1157 close_nointr(STDERR_FILENO
);
/* Judging by the helper name and arguments, everything except the
 * second kmsg socket is closed here - TODO confirm. */
1159 close_all_fds(&kmsg_socket_pair
[1], 1);
1161 reset_all_signal_handlers();
/* Start the payload with an empty signal mask. */
1163 assert_se(sigemptyset(&mask
) == 0);
1164 assert_se(sigprocmask(SIG_SETMASK
, &mask
, NULL
) == 0);
/* Ask the kernel to send SIGKILL to this process when its parent
 * dies. */
1169 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
1172 /* Mark / as private, in case somebody marked it shared */
1173 if (mount(NULL
, "/", NULL
, MS_PRIVATE
|MS_REC
, NULL
) < 0)
1176 /* Turn directory into bind mount */
1177 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
, NULL
) < 0) {
1178 log_error("Failed to make bind mount.");
/* Read-only remount of the container root (presumably only when
 * arg_read_only is set; the condition is not visible). */
1183 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0) {
1184 log_error("Failed to make read-only.");
/* File system preparation below the container root. */
1188 if (mount_all(arg_directory
) < 0)
1191 if (copy_devnodes(arg_directory
) < 0)
1194 if (setup_dev_console(arg_directory
, console
) < 0)
1197 if (setup_kmsg(arg_directory
, kmsg_socket_pair
[1]) < 0)
1200 close_nointr_nofail(kmsg_socket_pair
[1]);
1202 if (setup_timezone(arg_directory
) < 0)
1205 if (setup_resolv_conf(arg_directory
) < 0)
1208 if (setup_journal(arg_directory
) < 0)
1211 if (chdir(arg_directory
) < 0) {
1212 log_error("chdir(%s) failed: %m", arg_directory
);
/* The console path is relative to the container root entered above.
 * The open must yield descriptor 0, which is then duplicated onto
 * stdout and stderr. */
1216 if (open_terminal("dev/console", O_RDWR
) != STDIN_FILENO
||
1217 dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
1218 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
/* Move the container root over / and chroot into it. */
1221 if (mount(arg_directory
, "/", "bind", MS_MOVE
, NULL
) < 0) {
/* NOTE(review): the message says MS_BIND, the call uses MS_MOVE. */
1222 log_error("mount(MS_BIND) failed: %m");
1226 if (chroot(".") < 0) {
1227 log_error("chroot() failed: %m");
1231 if (chdir("/") < 0) {
1232 log_error("chdir() failed: %m");
1240 if (drop_capabilities() < 0) {
1241 log_error("drop_capabilities() failed: %m");
/* Switch to the requested user: look up credentials, create the home
 * directory, then set groups, gid and uid in that order. */
1247 if (get_user_creds((const char**)&arg_user
, &uid
, &gid
, &home
, NULL
) < 0) {
1248 log_error("get_user_creds() failed: %m");
1252 if (mkdir_parents_label(home
, 0775) < 0) {
1253 log_error("mkdir_parents_label() failed: %m");
1257 if (mkdir_safe_label(home
, 0775, uid
, gid
) < 0) {
1258 log_error("mkdir_safe_label() failed: %m");
1262 if (initgroups((const char*)arg_user
, gid
) < 0) {
1263 log_error("initgroups() failed: %m");
1267 if (setresgid(gid
, gid
, gid
) < 0) {
/* NOTE(review): the message names setregid, the call is setresgid. */
1268 log_error("setregid() failed: %m");
1272 if (setresuid(uid
, uid
, uid
) < 0) {
/* NOTE(review): the message names setreuid, the call is setresuid. */
1273 log_error("setreuid() failed: %m");
/* Fill the HOME, USER and LOGNAME slots, with root defaults. */
1278 if ((asprintf((char**)(envp
+ 3), "HOME=%s", home
? home
: "/root") < 0) ||
1279 (asprintf((char**)(envp
+ 4), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
1280 (asprintf((char**)(envp
+ 5), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0)) {
1281 log_error("Out of memory");
1286 if (asprintf((char**)(envp
+ 6), "container_uuid=%s", arg_uuid
) < 0) {
1287 log_error("Out of memory");
1298 /* Automatically search for the init system */
/* l counts the remaining arguments plus the terminating NULL of argv.
 * They are copied behind slot 0, and slot 0 is then set to each init
 * candidate in turn; execve() only returns when it failed. */
1300 l
= 1 + argc
- optind
;
1301 a
= newa(char*, l
+ 1);
1302 memcpy(a
+ 1, argv
+ optind
, l
* sizeof(char*));
1304 a
[0] = (char*) "/usr/lib/systemd/systemd";
1305 execve(a
[0], a
, (char**) envp
);
1307 a
[0] = (char*) "/lib/systemd/systemd";
1308 execve(a
[0], a
, (char**) envp
);
1310 a
[0] = (char*) "/sbin/init";
1311 execve(a
[0], a
, (char**) envp
);
/* An explicit command was given: run it, searching PATH. */
1312 } else if (argc
> optind
)
1313 execvpe(argv
[optind
], argv
+ optind
, (char**) envp
);
/* Fallback: bash started from the home directory, with -bash as its
 * argv[0]. */
1315 chdir(home
? home
: "/root");
1316 execle("/bin/bash", "-bash", NULL
, (char**) envp
);
1319 log_error("execv() failed: %m");
1322 _exit(EXIT_FAILURE
);
/* Parent side: shuttle terminal data until process_pty() returns. */
1325 if (process_pty(master
, &mask
) < 0)
/* Restore the terminal before waiting, and clear the flag so that the
 * restore further down is skipped. */
1328 if (saved_attr_valid
) {
1329 tcsetattr(STDIN_FILENO
, TCSANOW
, &saved_attr
);
1330 saved_attr_valid
= false;
1333 r
= wait_for_terminate_and_warn(argc
> optind
? argv
[optind
] : "bash", pid
);
/* Cleanup. The terminal is restored here only if that has not
 * happened above. */
1339 if (saved_attr_valid
)
1340 tcsetattr(STDIN_FILENO
, TCSANOW
, &saved_attr
);
1343 close_nointr_nofail(master
);
1345 close_pipe(kmsg_socket_pair
);
/* Move back into the original cgroup before the container cgroup is
 * handed to cg_kill_recursive_and_wait(). */
1348 cg_attach(SYSTEMD_CGROUP_CONTROLLER
, oldcg
, 0);
1351 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER
, newcg
, true);
1353 free(arg_directory
);
1354 strv_free(arg_controllers
);